tu_query.c revision 7ec681f3
/*
 * Copyright © 2016 Red Hat Inc.
 * Based on anv:
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

#include "tu_private.h"

#include <assert.h>
#include <fcntl.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>

#include "adreno_pm4.xml.h"
#include "adreno_common.xml.h"
#include "a6xx.xml.h"

#include "nir/nir_builder.h"
#include "util/os_time.h"

#include "tu_cs.h"
#include "vk_util.h"

#define NSEC_PER_SEC 1000000000ull
#define WAIT_TIMEOUT 5
#define STAT_COUNT ((REG_A6XX_RBBM_PRIMCTR_10_LO - REG_A6XX_RBBM_PRIMCTR_0_LO) / 2 + 1)

struct PACKED query_slot {
   uint64_t available;
};

struct PACKED occlusion_slot_value {
   /* Sample counters seem to be placed at 16-byte alignment
    * even though this query only needs an 8-byte slot. */
   uint64_t value;
   uint64_t _padding;
};

struct PACKED occlusion_query_slot {
   struct query_slot common;
   uint64_t result;

   struct occlusion_slot_value begin;
   struct occlusion_slot_value end;
};

struct PACKED timestamp_query_slot {
   struct query_slot common;
   uint64_t result;
};

struct PACKED primitive_slot_value {
   uint64_t values[2];
};

struct PACKED pipeline_stat_query_slot {
   struct query_slot common;
   uint64_t results[STAT_COUNT];

   uint64_t begin[STAT_COUNT];
   uint64_t end[STAT_COUNT];
};

struct PACKED primitive_query_slot {
   struct query_slot common;
   /* The result of transform feedback queries is two integer values:
    * results[0] is the count of primitives written,
    * results[1] is the count of primitives generated.
    * Counters for each of the 4 streams are captured in the begin/end
    * arrays below.
    */
   uint64_t results[2];

   /* Primitive counters also need to be 16-byte aligned. */
   uint64_t _padding;

   struct primitive_slot_value begin[4];
   struct primitive_slot_value end[4];
};

struct PACKED perfcntr_query_slot {
   uint64_t result;
   uint64_t begin;
   uint64_t end;
};

struct PACKED perf_query_slot {
   struct query_slot common;
   struct perfcntr_query_slot perfcntr;
};

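/* Example byte layout of one occlusion query slot in the pool BO, as implied
 * by the PACKED structs above (all fields are 64-bit):
 *
 *    0x00  common.available
 *    0x08  result
 *    0x10  begin.value
 *    0x18  begin._padding
 *    0x20  end.value
 *    0x28  end._padding
 *
 * The iova macros below compute GPU addresses of such fields from the pool
 * BO iova and the per-slot stride.
 */
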
/* Returns the IOVA of a given uint64_t field in a given slot of a query
 * pool. */
#define query_iova(type, pool, query, field)                     \
   pool->bo.iova + pool->stride * (query) + offsetof(type, field)

#define occlusion_query_iova(pool, query, field)                 \
   query_iova(struct occlusion_query_slot, pool, query, field)

#define pipeline_stat_query_iova(pool, query, field)             \
   pool->bo.iova + pool->stride * (query) +                      \
      offsetof(struct pipeline_stat_query_slot, field)

#define primitive_query_iova(pool, query, field, i)              \
   query_iova(struct primitive_query_slot, pool, query, field) + \
      offsetof(struct primitive_slot_value, values[i])

#define perf_query_iova(pool, query, field, i)                   \
   pool->bo.iova + pool->stride * (query) +                      \
      sizeof(struct query_slot) +                                \
      sizeof(struct perfcntr_query_slot) * (i) +                 \
      offsetof(struct perfcntr_query_slot, field)

#define query_available_iova(pool, query)                        \
   query_iova(struct query_slot, pool, query, available)

#define query_result_iova(pool, query, type, i)                  \
   pool->bo.iova + pool->stride * (query) +                      \
      sizeof(struct query_slot) + sizeof(type) * (i)

#define query_result_addr(pool, query, type, i)                  \
   pool->bo.map + pool->stride * (query) +                       \
      sizeof(struct query_slot) + sizeof(type) * (i)

#define query_is_available(slot) slot->available

static const VkPerformanceCounterUnitKHR
fd_perfcntr_type_to_vk_unit[] = {
   [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_UNIT_PERCENTAGE_KHR,
   [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_UNIT_BYTES_KHR,
   /* TODO. could be UNIT_NANOSECONDS_KHR with logic to convert */
   [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_UNIT_HERTZ_KHR,
   [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
   [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_UNIT_GENERIC_KHR,
};

/* TODO. Basically this comes from the freedreno implementation where only
 * UINT64 is used. We should confirm this against the blob Vulkan driver
 * once it starts supporting perf queries.
 */
static const VkPerformanceCounterStorageKHR
fd_perfcntr_type_to_vk_storage[] = {
   [FD_PERFCNTR_TYPE_UINT] = VK_PERFORMANCE_COUNTER_STORAGE_UINT32_KHR,
   [FD_PERFCNTR_TYPE_UINT64] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_FLOAT] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_PERCENTAGE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_BYTES] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_MICROSECONDS] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_HZ] = VK_PERFORMANCE_COUNTER_STORAGE_UINT64_KHR,
   [FD_PERFCNTR_TYPE_DBM] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_TEMPERATURE] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_VOLTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_AMPS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
   [FD_PERFCNTR_TYPE_WATTS] = VK_PERFORMANCE_COUNTER_STORAGE_FLOAT32_KHR,
};

/*
 * Returns a pointer to a given slot in a query pool.
 */
static void* slot_address(struct tu_query_pool *pool, uint32_t query)
{
   return (char*)pool->bo.map + query * pool->stride;
}

static void
perfcntr_index(const struct fd_perfcntr_group *group, uint32_t group_count,
               uint32_t index, uint32_t *gid, uint32_t *cid)
{
   uint32_t i;

   for (i = 0; i < group_count; i++) {
      if (group[i].num_countables > index) {
         *gid = i;
         *cid = index;
         break;
      }
      index -= group[i].num_countables;
   }

   assert(i < group_count);
}

static int
compare_perfcntr_pass(const void *a, const void *b)
{
   return ((struct tu_perf_query_data *)a)->pass -
          ((struct tu_perf_query_data *)b)->pass;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_CreateQueryPool(VkDevice _device,
                   const VkQueryPoolCreateInfo *pCreateInfo,
                   const VkAllocationCallbacks *pAllocator,
                   VkQueryPool *pQueryPool)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);
   assert(pCreateInfo->queryCount > 0);

   uint32_t pool_size, slot_size;
   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;

   pool_size = sizeof(struct tu_query_pool);

   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      slot_size = sizeof(struct occlusion_query_slot);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      slot_size = sizeof(struct timestamp_query_slot);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      slot_size = sizeof(struct primitive_query_slot);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      perf_query_info =
         vk_find_struct_const(pCreateInfo->pNext,
                              QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
      assert(perf_query_info);

      slot_size = sizeof(struct perf_query_slot) +
                  sizeof(struct perfcntr_query_slot) *
                  (perf_query_info->counterIndexCount - 1);

      /* Size of the array pool->perf_query_data */
      pool_size += sizeof(struct tu_perf_query_data) *
                   perf_query_info->counterIndexCount;
      break;
   }
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      slot_size = sizeof(struct pipeline_stat_query_slot);
      break;
   default:
      unreachable("Invalid query type");
   }

   struct tu_query_pool *pool =
      vk_object_alloc(&device->vk, pAllocator, pool_size,
                      VK_OBJECT_TYPE_QUERY_POOL);
   if (!pool)
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

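   /* For performance queries, resolve each requested counter index into a
    * (group, countable, counter register, pass) tuple up front. For example,
    * if a group exposes only 4 counter registers and 6 of its countables are
    * requested, the first 4 are assigned to pass 0 and the remaining 2 to
    * pass 1, reusing registers 0 and 1.
    */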
   if (pCreateInfo->queryType == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      pool->perf_group = fd_perfcntrs(&device->physical_device->dev_id,
                                      &pool->perf_group_count);

      pool->counter_index_count = perf_query_info->counterIndexCount;

      /* Build the data for every requested perf counter, so we can get the
       * correct group id, countable id, counter register and pass index from
       * just the counter index the application provides at submit time.
       *
       * Also, since this data will be sorted by pass index later, keep the
       * original application indices and store the perfcntr results according
       * to them, so apps get correct results for their own indices.
       */
      uint32_t regs[pool->perf_group_count], pass[pool->perf_group_count];
      memset(regs, 0x00, pool->perf_group_count * sizeof(regs[0]));
      memset(pass, 0x00, pool->perf_group_count * sizeof(pass[0]));

      for (uint32_t i = 0; i < pool->counter_index_count; i++) {
         uint32_t gid = 0, cid = 0;

         perfcntr_index(pool->perf_group, pool->perf_group_count,
                        perf_query_info->pCounterIndices[i], &gid, &cid);

         pool->perf_query_data[i].gid = gid;
         pool->perf_query_data[i].cid = cid;
         pool->perf_query_data[i].app_idx = i;

         /* When the counter registers of a group are exhausted
          * (num_counters), start over from register 0 in the next pass.
          */
         if (regs[gid] < pool->perf_group[gid].num_counters) {
            pool->perf_query_data[i].cntr_reg = regs[gid]++;
            pool->perf_query_data[i].pass = pass[gid];
         } else {
            pool->perf_query_data[i].pass = ++pass[gid];
            pool->perf_query_data[i].cntr_reg = regs[gid] = 0;
            regs[gid]++;
         }
      }

      /* Sort by pass index so we can easily build the command streams in
       * ascending pass order.
       */
      qsort(pool->perf_query_data, pool->counter_index_count,
            sizeof(pool->perf_query_data[0]),
            compare_perfcntr_pass);
   }

   VkResult result = tu_bo_init_new(device, &pool->bo,
         pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS);
   if (result != VK_SUCCESS) {
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   result = tu_bo_map(device, &pool->bo);
   if (result != VK_SUCCESS) {
      tu_bo_finish(device, &pool->bo);
      vk_object_free(&device->vk, pAllocator, pool);
      return result;
   }

   /* Initialize all query statuses to unavailable */
   memset(pool->bo.map, 0, pool->bo.size);

   pool->type = pCreateInfo->queryType;
   pool->stride = slot_size;
   pool->size = pCreateInfo->queryCount;
   pool->pipeline_statistics = pCreateInfo->pipelineStatistics;
   *pQueryPool = tu_query_pool_to_handle(pool);

   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_DestroyQueryPool(VkDevice _device,
                    VkQueryPool _pool,
                    const VkAllocationCallbacks *pAllocator)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, _pool);

   if (!pool)
      return;

   tu_bo_finish(device, &pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}

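/* Number of 64-bit result values a single query in this pool produces,
 * not counting the trailing availability value. */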
static uint32_t
get_result_count(struct tu_query_pool *pool)
{
   switch (pool->type) {
   /* Occlusion and timestamp queries write one integer value */
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      return 1;
   /* Transform feedback queries write two integer values */
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      return 2;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return util_bitcount(pool->pipeline_statistics);
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      return pool->counter_index_count;
   default:
      assert(!"Invalid query type");
      return 0;
   }
}

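/* Map the next set bit of *statistics (consumed with u_bit_scan) to the
 * index of the RBBM primitive counter, and thus result slot, that holds
 * the corresponding pipeline statistic. */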
static uint32_t
statistics_index(uint32_t *statistics)
{
   uint32_t stat;
   stat = u_bit_scan(statistics);

   switch (1 << stat) {
   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_VERTICES_BIT:
   case VK_QUERY_PIPELINE_STATISTIC_VERTEX_SHADER_INVOCATIONS_BIT:
      return 0;
   case VK_QUERY_PIPELINE_STATISTIC_INPUT_ASSEMBLY_PRIMITIVES_BIT:
      return 1;
   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_CONTROL_SHADER_PATCHES_BIT:
      return 2;
   case VK_QUERY_PIPELINE_STATISTIC_TESSELLATION_EVALUATION_SHADER_INVOCATIONS_BIT:
      return 4;
   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_INVOCATIONS_BIT:
      return 5;
   case VK_QUERY_PIPELINE_STATISTIC_GEOMETRY_SHADER_PRIMITIVES_BIT:
      return 6;
   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_INVOCATIONS_BIT:
      return 7;
   case VK_QUERY_PIPELINE_STATISTIC_CLIPPING_PRIMITIVES_BIT:
      return 8;
   case VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT:
      return 9;
   case VK_QUERY_PIPELINE_STATISTIC_COMPUTE_SHADER_INVOCATIONS_BIT:
      return 10;
   default:
      return 0;
   }
}

/* Wait on the availability status of a query up until a timeout. */
static VkResult
wait_for_available(struct tu_device *device, struct tu_query_pool *pool,
                   uint32_t query)
{
   /* TODO: Use the MSM_IOVA_WAIT ioctl to wait on the available bit in a
    * scheduler friendly way instead of busy polling once the patch has landed
    * upstream. */
   struct query_slot *slot = slot_address(pool, query);
   uint64_t abs_timeout = os_time_get_absolute_timeout(
         WAIT_TIMEOUT * NSEC_PER_SEC);
   while(os_time_get_nano() < abs_timeout) {
      if (query_is_available(slot))
         return VK_SUCCESS;
   }
   return vk_error(device, VK_TIMEOUT);
}

/* Writes a query value to a buffer from the CPU. */
static void
write_query_value_cpu(char* base,
                      uint32_t offset,
                      uint64_t value,
                      VkQueryResultFlags flags)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      *(uint64_t*)(base + (offset * sizeof(uint64_t))) = value;
   } else {
      *(uint32_t*)(base + (offset * sizeof(uint32_t))) = value;
   }
}

static VkResult
get_query_pool_results(struct tu_device *device,
                       struct tu_query_pool *pool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   assert(dataSize >= stride * queryCount);

   char *result_base = pData;
   VkResult result = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      struct query_slot *slot = slot_address(pool, query);
      bool available = query_is_available(slot);
      uint32_t result_count = get_result_count(pool);
      uint32_t statistics = pool->pipeline_statistics;

      if ((flags & VK_QUERY_RESULT_WAIT_BIT) && !available) {
         VkResult wait_result = wait_for_available(device, pool, query);
         if (wait_result != VK_SUCCESS)
            return wait_result;
         available = true;
      } else if (!(flags & VK_QUERY_RESULT_PARTIAL_BIT) && !available) {
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
          *    both not set then no result values are written to pData for
          *    queries that are in the unavailable state at the time of the
          *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
          *    availability state is still written to pData for those queries
          *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set.
          */
         result = VK_NOT_READY;
         if (!(flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)) {
            result_base += stride;
            continue;
         }
      }

      for (uint32_t k = 0; k < result_count; k++) {
         if (available) {
            uint64_t *result;

            if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
               uint32_t stat_idx = statistics_index(&statistics);
               result = query_result_addr(pool, query, uint64_t, stat_idx);
            } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
               result = query_result_addr(pool, query, struct perfcntr_query_slot, k);
            } else {
               result = query_result_addr(pool, query, uint64_t, k);
            }

            write_query_value_cpu(result_base, k, *result, flags);
         } else if (flags & VK_QUERY_RESULT_PARTIAL_BIT)
            /* From the Vulkan 1.1.130 spec:
             *
             *    If VK_QUERY_RESULT_PARTIAL_BIT is set, VK_QUERY_RESULT_WAIT_BIT
             *    is not set, and the query’s status is unavailable, an
             *    intermediate result value between zero and the final result
             *    value is written to pData for that query.
             *
             * Just return 0 here for simplicity since it's a valid result.
             */
            write_query_value_cpu(result_base, k, 0, flags);
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         /* From the Vulkan 1.1.130 spec:
          *
          *    If VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set, the final
          *    integer value written for each query is non-zero if the query’s
          *    status was available or zero if the status was unavailable.
          */
         write_query_value_cpu(result_base, result_count, available, flags);

      result_base += stride;
   }
   return result;
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_GetQueryPoolResults(VkDevice _device,
                       VkQueryPool queryPool,
                       uint32_t firstQuery,
                       uint32_t queryCount,
                       size_t dataSize,
                       void *pData,
                       VkDeviceSize stride,
                       VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_device, device, _device);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(firstQuery + queryCount <= pool->size);

   if (tu_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      return get_query_pool_results(device, pool, firstQuery, queryCount,
                                    dataSize, pData, stride, flags);
   default:
      assert(!"Invalid query type");
   }
   return VK_SUCCESS;
}

/* Copies a query value from one buffer to another on the GPU. */
static void
copy_query_value_gpu(struct tu_cmd_buffer *cmdbuf,
                     struct tu_cs *cs,
                     uint64_t src_iova,
                     uint64_t base_write_iova,
                     uint32_t offset,
                     VkQueryResultFlags flags) {
   uint32_t element_size = flags & VK_QUERY_RESULT_64_BIT ?
         sizeof(uint64_t) : sizeof(uint32_t);
   uint64_t write_iova = base_write_iova + (offset * element_size);

   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
   uint32_t mem_to_mem_flags = flags & VK_QUERY_RESULT_64_BIT ?
         CP_MEM_TO_MEM_0_DOUBLE : 0;
   tu_cs_emit(cs, mem_to_mem_flags);
   tu_cs_emit_qw(cs, write_iova);
   tu_cs_emit_qw(cs, src_iova);
}

static void
emit_copy_query_pool_results(struct tu_cmd_buffer *cmdbuf,
                             struct tu_cs *cs,
                             struct tu_query_pool *pool,
                             uint32_t firstQuery,
                             uint32_t queryCount,
                             struct tu_buffer *buffer,
                             VkDeviceSize dstOffset,
                             VkDeviceSize stride,
                             VkQueryResultFlags flags)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    vkCmdCopyQueryPoolResults is guaranteed to see the effect of previous
    *    uses of vkCmdResetQueryPool in the same queue, without any additional
    *    synchronization.
    *
    * To ensure that previous writes to the available bit are coherent, first
    * wait for all writes to complete.
    */
   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint64_t available_iova = query_available_iova(pool, query);
      uint64_t buffer_iova = tu_buffer_iova(buffer) + dstOffset + i * stride;
      uint32_t result_count = get_result_count(pool);
      uint32_t statistics = pool->pipeline_statistics;

      /* Wait for the available bit to be set if executed with the
       * VK_QUERY_RESULT_WAIT_BIT flag. */
      if (flags & VK_QUERY_RESULT_WAIT_BIT) {
         tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_EQ) |
                        CP_WAIT_REG_MEM_0_POLL_MEMORY);
         tu_cs_emit_qw(cs, available_iova);
         tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0x1));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
         tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));
      }

      for (uint32_t k = 0; k < result_count; k++) {
         uint64_t result_iova;

         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
            uint32_t stat_idx = statistics_index(&statistics);
            result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
         } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
            result_iova = query_result_iova(pool, query,
                                            struct perfcntr_query_slot, k);
         } else {
            result_iova = query_result_iova(pool, query, uint64_t, k);
         }

         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            /* Unconditionally copying the bo->result into the buffer here is
             * valid because we only set bo->result on vkCmdEndQuery. Thus, even
             * if the query is unavailable, this will copy the correct partial
             * value of 0.
             */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
         } else {
            /* Conditionally copy bo->result into the buffer based on whether the
             * query is available.
             *
             * NOTE: For the conditional packets to be executed, CP_COND_EXEC
             * tests that ADDR0 != 0 and ADDR1 < REF. The packet here simply tests
             * that 0 < available < 2, aka available == 1.
             */
            tu_cs_reserve(cs, 7 + 6);
            tu_cs_emit_pkt7(cs, CP_COND_EXEC, 6);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit_qw(cs, available_iova);
            tu_cs_emit(cs, CP_COND_EXEC_4_REF(0x2));
            tu_cs_emit(cs, 6); /* Cond execute the next 6 DWORDS */

            /* Start of conditional execution */
            copy_query_value_gpu(cmdbuf, cs, result_iova, buffer_iova,
                                 k /* offset */, flags);
            /* End of conditional execution */
         }
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         copy_query_value_gpu(cmdbuf, cs, available_iova, buffer_iova,
                              result_count /* offset */, flags);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdCopyQueryPoolResults(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t firstQuery,
                           uint32_t queryCount,
                           VkBuffer dstBuffer,
                           VkDeviceSize dstOffset,
                           VkDeviceSize stride,
                           VkQueryResultFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   TU_FROM_HANDLE(tu_buffer, buffer, dstBuffer);
   struct tu_cs *cs = &cmdbuf->cs;
   assert(firstQuery + queryCount <= pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      return emit_copy_query_pool_results(cmdbuf, cs, pool, firstQuery,
               queryCount, buffer, dstOffset, stride, flags);
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      unreachable("allowCommandBufferQueryCopies is false");
   default:
      assert(!"Invalid query type");
   }
}

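/* Zero the availability bit and every result value of the given query slots
 * from the GPU command stream. */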
static void
emit_reset_query_pool(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t firstQuery,
                      uint32_t queryCount)
{
   struct tu_cs *cs = &cmdbuf->cs;

   for (uint32_t i = 0; i < queryCount; i++) {
      uint32_t query = firstQuery + i;
      uint32_t statistics = pool->pipeline_statistics;

      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query));
      tu_cs_emit_qw(cs, 0x0);

      for (uint32_t k = 0; k < get_result_count(pool); k++) {
         uint64_t result_iova;

         if (pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS) {
            uint32_t stat_idx = statistics_index(&statistics);
            result_iova = query_result_iova(pool, query, uint64_t, stat_idx);
         } else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
            result_iova = query_result_iova(pool, query,
                                            struct perfcntr_query_slot, k);
         } else {
            result_iova = query_result_iova(pool, query, uint64_t, k);
         }

         tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
         tu_cs_emit_qw(cs, result_iova);
         tu_cs_emit_qw(cs, 0x0);
      }
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdResetQueryPool(VkCommandBuffer commandBuffer,
                     VkQueryPool queryPool,
                     uint32_t firstQuery,
                     uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_TIMESTAMP:
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      emit_reset_query_pool(cmdbuf, pool, firstQuery, queryCount);
      break;
   default:
      assert(!"Invalid query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_ResetQueryPool(VkDevice device,
                  VkQueryPool queryPool,
                  uint32_t firstQuery,
                  uint32_t queryCount)
{
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      struct query_slot *slot = slot_address(pool, i + firstQuery);
      slot->available = 0;

      for (uint32_t k = 0; k < get_result_count(pool); k++) {
         uint64_t *res;

         if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
            res = query_result_addr(pool, i + firstQuery,
                                    struct perfcntr_query_slot, k);
         } else {
            res = query_result_addr(pool, i + firstQuery, uint64_t, k);
         }

         *res = 0;
      }
   }
}

static void
emit_begin_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                           struct tu_query_pool *pool,
                           uint32_t query)
{
   /* From the Vulkan 1.1.130 spec:
    *
    *    A query must begin and end inside the same subpass of a render pass
    *    instance, or must both begin and end outside of a render pass
    *    instance.
    *
    * Unlike on an immediate-mode renderer, Turnip renders all tiles on
    * vkCmdEndRenderPass, not individually on each vkCmdDraw*. As such, if a
    * query begins/ends inside the same subpass of a render pass, we need to
    * record the packets on the secondary draw command stream. cmdbuf->draw_cs
    * is then run on every tile during render, so we just need to accumulate
    * sample counts in slot->result to compute the query result.
    */
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = begin_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);
}

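/* Begin a pipeline statistics query: start the primitive counters and
 * snapshot all RBBM_PRIMCTR registers into slot->begin. */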
static void
emit_begin_stat_query(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = pipeline_stat_query_iova(pool, query, begin);

   tu6_emit_event_write(cmdbuf, cs, START_PRIMITIVE_CTRS);
   tu6_emit_event_write(cmdbuf, cs, RST_PIX_CNT);
   tu6_emit_event_write(cmdbuf, cs, TILE_FLUSH);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, begin_iova);
}

static void
emit_perfcntrs_pass_start(struct tu_cs *cs, uint32_t pass)
{
   tu_cs_emit_pkt7(cs, CP_REG_TEST, 1);
   tu_cs_emit(cs, A6XX_CP_REG_TEST_0_REG(
                        REG_A6XX_CP_SCRATCH_REG(PERF_CNTRS_REG)) |
                  A6XX_CP_REG_TEST_0_BIT(pass) |
                  A6XX_CP_REG_TEST_0_WAIT_FOR_ME);
   tu_cond_exec_start(cs, CP_COND_REG_EXEC_0_MODE(PRED_TEST));
}

static void
emit_begin_perf_query(struct tu_cmd_buffer *cmdbuf,
                      struct tu_query_pool *pool,
                      uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint32_t last_pass = ~0;

   /* Querying perf counters happens in these steps:
    *
    *    0) There's a scratch reg to set a pass index for perf counters query.
    *       Prepare cmd streams to set each pass index to the reg at device
    *       creation time. See tu_CreateDevice in tu_device.c
    *    1) Emit command streams to read all requested perf counters at all
    *       passes in begin/end query with CP_REG_TEST/CP_COND_REG_EXEC, which
    *       read the scratch reg where the pass index is set.
    *       See emit_perfcntrs_pass_start.
    *    2) Pick the right cs that sets the proper pass index in the reg and
    *       prepend it to the command buffer at each submit time.
    *       See tu_QueueSubmit in tu_drm.c
    *    3) If the pass bit tested from the reg matches, the CP executes the
    *       command stream guarded by CP_COND_REG_EXEC.
    */

   tu_cs_emit_wfi(cs);

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      const struct fd_perfcntr_counter *counter =
            &pool->perf_group[data->gid].counters[data->cntr_reg];
      const struct fd_perfcntr_countable *countable =
            &pool->perf_group[data->gid].countables[data->cid];

      tu_cs_emit_pkt4(cs, counter->select_reg, 1);
      tu_cs_emit(cs, countable->selector);
   }
   tu_cond_exec_end(cs);

   last_pass = ~0;
   tu_cs_emit_wfi(cs);

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      const struct fd_perfcntr_counter *counter =
            &pool->perf_group[data->gid].counters[data->cntr_reg];

      uint64_t begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);

      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
                     CP_REG_TO_MEM_0_64B);
      tu_cs_emit_qw(cs, begin_iova);
   }
   tu_cond_exec_end(cs);
}

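/* Begin a transform feedback query: capture the current streamout counters
 * into slot->begin via a WRITE_PRIMITIVE_COUNTS event. */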
static void
emit_begin_xfb_query(struct tu_cmd_buffer *cmdbuf,
                     struct tu_query_pool *pool,
                     uint32_t query,
                     uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t begin_iova = primitive_query_iova(pool, query, begin[0], 0);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = begin_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginQuery(VkCommandBuffer commandBuffer,
                 VkQueryPool queryPool,
                 uint32_t query,
                 VkQueryControlFlags flags)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* In freedreno, there is no implementation difference between
       * GL_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED, so we can similarly
       * ignore the VK_QUERY_CONTROL_PRECISE_BIT flag here.
       */
      emit_begin_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      emit_begin_perf_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      emit_begin_stat_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdBeginQueryIndexedEXT(VkCommandBuffer commandBuffer,
                           VkQueryPool queryPool,
                           uint32_t query,
                           VkQueryControlFlags flags,
                           uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_begin_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }
}

static void
emit_end_occlusion_query(struct tu_cmd_buffer *cmdbuf,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   /* Ending an occlusion query happens in a few steps:
    *    1) Set the slot->end to UINT64_MAX.
    *    2) Set up the SAMPLE_COUNT registers and trigger a CP_EVENT_WRITE to
    *       write the current sample count value into slot->end.
    *    3) Since (2) is asynchronous, wait until slot->end is not equal to
    *       UINT64_MAX before continuing via CP_WAIT_REG_MEM.
    *    4) Accumulate the results of the query (slot->end - slot->begin) into
    *       slot->result.
    *    5) If vkCmdEndQuery is *not* called from within the scope of a render
    *       pass, set the slot's available bit since the query is now done.
    *    6) If vkCmdEndQuery *is* called from within the scope of a render
    *       pass, we cannot mark as available yet since the commands in
    *       draw_cs are not run until vkCmdEndRenderPass.
    */
   const struct tu_render_pass *pass = cmdbuf->state.pass;
   struct tu_cs *cs = pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t begin_iova = occlusion_query_iova(pool, query, begin);
   uint64_t end_iova = occlusion_query_iova(pool, query, end);
   uint64_t result_iova = query_result_iova(pool, query, uint64_t, 0);
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, 0xffffffffffffffffull);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_CONTROL(.copy = true));

   tu_cs_emit_regs(cs,
                   A6XX_RB_SAMPLE_COUNT_ADDR(.qword = end_iova));

   tu_cs_emit_pkt7(cs, CP_EVENT_WRITE, 1);
   tu_cs_emit(cs, ZPASS_DONE);

   tu_cs_emit_pkt7(cs, CP_WAIT_REG_MEM, 6);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_0_FUNCTION(WRITE_NE) |
                  CP_WAIT_REG_MEM_0_POLL_MEMORY);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit(cs, CP_WAIT_REG_MEM_3_REF(0xffffffff));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_4_MASK(~0));
   tu_cs_emit(cs, CP_WAIT_REG_MEM_5_DELAY_LOOP_CYCLES(16));

   /* result (dst) = result (srcA) + end (srcB) - begin (srcC) */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, result_iova);
   tu_cs_emit_qw(cs, end_iova);
   tu_cs_emit_qw(cs, begin_iova);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (pass)
      /* Technically, queries should be tracked per-subpass, but here we track
       * at the render pass level to simplify the code a bit. This is safe
       * because the only commands that use the available bit are
       * vkCmdCopyQueryPoolResults and vkCmdResetQueryPool, both of which
       * cannot be invoked from inside a render pass scope.
       */
      cs = &cmdbuf->draw_epilogue_cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

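/* End a pipeline statistics query: stop the primitive counters, snapshot the
 * RBBM_PRIMCTR registers into slot->end, accumulate end - begin into each
 * result and mark the slot available. */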
static void
emit_end_stat_query(struct tu_cmd_buffer *cmdbuf,
                    struct tu_query_pool *pool,
                    uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t end_iova = pipeline_stat_query_iova(pool, query, end);
   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t result_iova;
   uint64_t stat_start_iova;
   uint64_t stat_stop_iova;

   tu6_emit_event_write(cmdbuf, cs, STOP_PRIMITIVE_CTRS);
   tu6_emit_event_write(cmdbuf, cs, RST_VTX_CNT);
   tu6_emit_event_write(cmdbuf, cs, STAT_EVENT);

   tu_cs_emit_wfi(cs);

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_RBBM_PRIMCTR_0_LO) |
                  CP_REG_TO_MEM_0_CNT(STAT_COUNT * 2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, end_iova);

   for (int i = 0; i < STAT_COUNT; i++) {
      result_iova = query_result_iova(pool, query, uint64_t, i);
      stat_start_iova = pipeline_stat_query_iova(pool, query, begin[i]);
      stat_stop_iova = pipeline_stat_query_iova(pool, query, end[i]);

      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
      tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
                     CP_MEM_TO_MEM_0_DOUBLE |
                     CP_MEM_TO_MEM_0_NEG_C);

      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, stat_stop_iova);
      tu_cs_emit_qw(cs, stat_start_iova);
   }

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (cmdbuf->state.pass)
      cs = &cmdbuf->draw_epilogue_cs;

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

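/* End a performance query: read back every selected counter register into
 * slot->end (per pass, guarded by CP_COND_REG_EXEC), accumulate end - begin
 * into the result slot of the counter's original application index, and
 * mark the slot available. */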
static void
emit_end_perf_query(struct tu_cmd_buffer *cmdbuf,
                    struct tu_query_pool *pool,
                    uint32_t query)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;
   uint64_t available_iova = query_available_iova(pool, query);
   uint64_t end_iova;
   uint64_t begin_iova;
   uint64_t result_iova;
   uint32_t last_pass = ~0;

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      const struct fd_perfcntr_counter *counter =
            &pool->perf_group[data->gid].counters[data->cntr_reg];

      end_iova = perf_query_iova(pool, 0, end, data->app_idx);

      tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
      tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(counter->counter_reg_lo) |
                     CP_REG_TO_MEM_0_64B);
      tu_cs_emit_qw(cs, end_iova);
   }
   tu_cond_exec_end(cs);

   last_pass = ~0;
   tu_cs_emit_wfi(cs);

   for (uint32_t i = 0; i < pool->counter_index_count; i++) {
      struct tu_perf_query_data *data = &pool->perf_query_data[i];

      if (last_pass != data->pass) {
         last_pass = data->pass;

         if (data->pass != 0)
            tu_cond_exec_end(cs);
         emit_perfcntrs_pass_start(cs, data->pass);
      }

      result_iova = query_result_iova(pool, 0, struct perfcntr_query_slot,
                                      data->app_idx);
      begin_iova = perf_query_iova(pool, 0, begin, data->app_idx);
      end_iova = perf_query_iova(pool, 0, end, data->app_idx);

      /* result += end - begin */
      tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
      tu_cs_emit(cs, CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES |
                     CP_MEM_TO_MEM_0_DOUBLE |
                     CP_MEM_TO_MEM_0_NEG_C);

      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, result_iova);
      tu_cs_emit_qw(cs, end_iova);
      tu_cs_emit_qw(cs, begin_iova);
   }
   tu_cond_exec_end(cs);

   tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);

   if (cmdbuf->state.pass)
      cs = &cmdbuf->draw_epilogue_cs;

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

static void
emit_end_xfb_query(struct tu_cmd_buffer *cmdbuf,
                   struct tu_query_pool *pool,
                   uint32_t query,
                   uint32_t stream_id)
{
   struct tu_cs *cs = cmdbuf->state.pass ? &cmdbuf->draw_cs : &cmdbuf->cs;

   uint64_t end_iova = primitive_query_iova(pool, query, end[0], 0);
   uint64_t result_written_iova = query_result_iova(pool, query, uint64_t, 0);
   uint64_t result_generated_iova = query_result_iova(pool, query, uint64_t, 1);
   uint64_t begin_written_iova = primitive_query_iova(pool, query, begin[stream_id], 0);
   uint64_t begin_generated_iova = primitive_query_iova(pool, query, begin[stream_id], 1);
   uint64_t end_written_iova = primitive_query_iova(pool, query, end[stream_id], 0);
   uint64_t end_generated_iova = primitive_query_iova(pool, query, end[stream_id], 1);
   uint64_t available_iova = query_available_iova(pool, query);

   tu_cs_emit_regs(cs, A6XX_VPC_SO_STREAM_COUNTS(.qword = end_iova));
   tu6_emit_event_write(cmdbuf, cs, WRITE_PRIMITIVE_COUNTS);

   tu_cs_emit_wfi(cs);
   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of written primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, result_written_iova);
   tu_cs_emit_qw(cs, end_written_iova);
   tu_cs_emit_qw(cs, begin_written_iova);

   tu6_emit_event_write(cmdbuf, cs, CACHE_FLUSH_TS);

   /* Set the count of generated primitives */
   tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 9);
   tu_cs_emit(cs, CP_MEM_TO_MEM_0_DOUBLE | CP_MEM_TO_MEM_0_NEG_C |
                  CP_MEM_TO_MEM_0_WAIT_FOR_MEM_WRITES | 0x80000000);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, result_generated_iova);
   tu_cs_emit_qw(cs, end_generated_iova);
   tu_cs_emit_qw(cs, begin_generated_iova);

   /* Set the availability to 1 */
   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, available_iova);
   tu_cs_emit_qw(cs, 0x1);
}

/* Implement this bit of spec text from section 17.2 "Query Operation":
 *
 *    If queries are used while executing a render pass instance that has
 *    multiview enabled, the query uses N consecutive query indices in the
 *    query pool (starting at query) where N is the number of bits set in the
 *    view mask in the subpass the query is used in. How the numerical
 *    results of the query are distributed among the queries is
 *    implementation-dependent. For example, some implementations may write
 *    each view’s results to a distinct query, while other implementations
 *    may write the total result to the first query and write zero to the
 *    other queries. However, the sum of the results in all the queries must
 *    accurately reflect the total result of the query summed over all views.
 *    Applications can sum the results from all the queries to compute the
 *    total result.
 *
 * Since we execute all views at once, we write zero to the other queries.
 * Furthermore, because queries must be reset before use, and we set the
 * result to 0 in vkCmdResetQueryPool(), we just need to mark it as available.
 */

static void
handle_multiview_queries(struct tu_cmd_buffer *cmd,
                         struct tu_query_pool *pool,
                         uint32_t query)
{
   if (!cmd->state.pass || !cmd->state.subpass->multiview_mask)
      return;

   unsigned views = util_bitcount(cmd->state.subpass->multiview_mask);
   struct tu_cs *cs = &cmd->draw_epilogue_cs;

   for (uint32_t i = 1; i < views; i++) {
      tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
      tu_cs_emit_qw(cs, query_available_iova(pool, query + i));
      tu_cs_emit_qw(cs, 0x1);
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdEndQuery(VkCommandBuffer commandBuffer,
               VkQueryPool queryPool,
               uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_end_occlusion_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      emit_end_xfb_query(cmdbuf, pool, query, 0);
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
      emit_end_perf_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      emit_end_stat_query(cmdbuf, pool, query);
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      unreachable("Unimplemented query type");
   default:
      assert(!"Invalid query type");
   }

   handle_multiview_queries(cmdbuf, pool, query);
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdEndQueryIndexedEXT(VkCommandBuffer commandBuffer,
                         VkQueryPool queryPool,
                         uint32_t query,
                         uint32_t index)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmdbuf, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);
   assert(query < pool->size);

   switch (pool->type) {
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      assert(index <= 4);
      emit_end_xfb_query(cmdbuf, pool, query, index);
      break;
   default:
      assert(!"Invalid query type");
   }
}

VKAPI_ATTR void VKAPI_CALL
tu_CmdWriteTimestamp(VkCommandBuffer commandBuffer,
                     VkPipelineStageFlagBits pipelineStage,
                     VkQueryPool queryPool,
                     uint32_t query)
{
   TU_FROM_HANDLE(tu_cmd_buffer, cmd, commandBuffer);
   TU_FROM_HANDLE(tu_query_pool, pool, queryPool);

   /* Inside a render pass, just write the timestamp multiple times so that
    * the user gets the last one if we use GMEM. There isn't really much
    * better we can do, and this seems to be what the blob does too.
    */
   struct tu_cs *cs = cmd->state.pass ? &cmd->draw_cs : &cmd->cs;

   /* Stages that will already have been executed by the time the CP executes
    * the REG_TO_MEM. DrawIndirect parameters are read by the CP, so the draw
    * indirect stage counts as top-of-pipe too.
    */
   VkPipelineStageFlags top_of_pipe_flags =
      VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT |
      VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT;

   if (pipelineStage & ~top_of_pipe_flags) {
      /* Execute a WFI so that all commands complete. Note that CP_REG_TO_MEM
       * does CP_WAIT_FOR_ME internally, which will wait for the WFI to
       * complete.
       *
       * Stalling the CP like this is really unfortunate, but I don't think
       * there's a better solution that allows all 48 bits of precision
       * because CP_EVENT_WRITE doesn't support 64-bit timestamps.
       */
      tu_cs_emit_wfi(cs);
   }

   tu_cs_emit_pkt7(cs, CP_REG_TO_MEM, 3);
   tu_cs_emit(cs, CP_REG_TO_MEM_0_REG(REG_A6XX_CP_ALWAYS_ON_COUNTER_LO) |
                  CP_REG_TO_MEM_0_CNT(2) |
                  CP_REG_TO_MEM_0_64B);
   tu_cs_emit_qw(cs, query_result_iova(pool, query, uint64_t, 0));

   /* Only flag availability once the entire renderpass is done, similar to
    * the begin/end path.
    */
   cs = cmd->state.pass ? &cmd->draw_epilogue_cs : &cmd->cs;

   tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 4);
   tu_cs_emit_qw(cs, query_available_iova(pool, query));
   tu_cs_emit_qw(cs, 0x1);

   /* From the spec for vkCmdWriteTimestamp:
    *
    *    If vkCmdWriteTimestamp is called while executing a render pass
    *    instance that has multiview enabled, the timestamp uses N consecutive
    *    query indices in the query pool (starting at query) where N is the
    *    number of bits set in the view mask of the subpass the command is
    *    executed in. The resulting query values are determined by an
    *    implementation-dependent choice of one of the following behaviors:
    *
    *    -   The first query is a timestamp value and (if more than one bit is
    *        set in the view mask) zero is written to the remaining queries.
    *        If two timestamps are written in the same subpass, the sum of the
    *        execution time of all views between those commands is the
    *        difference between the first query written by each command.
    *
    *    -   All N queries are timestamp values. If two timestamps are written
    *        in the same subpass, the sum of the execution time of all views
    *        between those commands is the sum of the difference between
    *        corresponding queries written by each command. The difference
    *        between corresponding queries may be the execution time of a
    *        single view.
    *
    * We execute all views in the same draw call, so we implement the first
    * option, the same as regular queries.
    */
   handle_multiview_queries(cmd, pool, query);
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_EnumeratePhysicalDeviceQueueFamilyPerformanceQueryCountersKHR(
    VkPhysicalDevice                            physicalDevice,
    uint32_t                                    queueFamilyIndex,
    uint32_t*                                   pCounterCount,
    VkPerformanceCounterKHR*                    pCounters,
    VkPerformanceCounterDescriptionKHR*         pCounterDescriptions)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);

   uint32_t desc_count = *pCounterCount;
   uint32_t group_count;
   const struct fd_perfcntr_group *group =
      fd_perfcntrs(&phydev->dev_id, &group_count);

   VK_OUTARRAY_MAKE(out, pCounters, pCounterCount);
   VK_OUTARRAY_MAKE(out_desc, pCounterDescriptions, &desc_count);

   for (int i = 0; i < group_count; i++) {
      for (int j = 0; j < group[i].num_countables; j++) {

         vk_outarray_append(&out, counter) {
            counter->scope = VK_QUERY_SCOPE_COMMAND_BUFFER_KHR;
            counter->unit =
               fd_perfcntr_type_to_vk_unit[group[i].countables[j].query_type];
            counter->storage =
               fd_perfcntr_type_to_vk_storage[group[i].countables[j].query_type];

            unsigned char sha1_result[20];
            _mesa_sha1_compute(group[i].countables[j].name,
                               strlen(group[i].countables[j].name),
                               sha1_result);
            memcpy(counter->uuid, sha1_result, sizeof(counter->uuid));
         }

         vk_outarray_append(&out_desc, desc) {
            desc->flags = 0;

            snprintf(desc->name, sizeof(desc->name),
                     "%s", group[i].countables[j].name);
            snprintf(desc->category, sizeof(desc->category), "%s", group[i].name);
            snprintf(desc->description, sizeof(desc->description),
                     "%s: %s performance counter",
                     group[i].name, group[i].countables[j].name);
         }
      }
   }

   return vk_outarray_status(&out);
}

VKAPI_ATTR void VKAPI_CALL
tu_GetPhysicalDeviceQueueFamilyPerformanceQueryPassesKHR(
      VkPhysicalDevice                            physicalDevice,
      const VkQueryPoolPerformanceCreateInfoKHR*  pPerformanceQueryCreateInfo,
      uint32_t*                                   pNumPasses)
{
   TU_FROM_HANDLE(tu_physical_device, phydev, physicalDevice);
   uint32_t group_count = 0;
   uint32_t gid = 0, cid = 0, n_passes;
   const struct fd_perfcntr_group *group =
      fd_perfcntrs(&phydev->dev_id, &group_count);

   uint32_t counters_requested[group_count];
   memset(counters_requested, 0x0, sizeof(counters_requested));
   *pNumPasses = 1;

   for (unsigned i = 0; i < pPerformanceQueryCreateInfo->counterIndexCount; i++) {
      perfcntr_index(group, group_count,
                     pPerformanceQueryCreateInfo->pCounterIndices[i],
                     &gid, &cid);

      counters_requested[gid]++;
   }

   for (uint32_t i = 0; i < group_count; i++) {
      n_passes = DIV_ROUND_UP(counters_requested[i], group[i].num_counters);
      *pNumPasses = MAX2(*pNumPasses, n_passes);
   }
}

VKAPI_ATTR VkResult VKAPI_CALL
tu_AcquireProfilingLockKHR(VkDevice device,
                           const VkAcquireProfilingLockInfoKHR* pInfo)
{
   /* TODO. Probably there's something to do for kgsl. */
   return VK_SUCCESS;
}

VKAPI_ATTR void VKAPI_CALL
tu_ReleaseProfilingLockKHR(VkDevice device)
{
   /* TODO. Probably there's something to do for kgsl. */
   return;
}