/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve :
 *    - GPR 14 for perf queries
 *    - GPR 15 for conditional rendering
 */
#define MI_BUILDER_NUM_ALLOC_GPRS 14
#define MI_BUILDER_CAN_WRITE_BATCH GFX_VER >= 8
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#define __gen_get_batch_address(b, a) anv_batch_address(b, a)
#include "common/mi_builder.h"
#include "perf/intel_perf.h"
#include "perf/intel_perf_mdapi.h"
#include "perf/intel_perf_regs.h"

#include "vk_util.h"

/* Return the address of the first byte of the given query slot inside the
 * pool's BO (slots are laid out back to back, pool->stride bytes each).
 */
static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = pool->bo,
      .offset = query * pool->stride,
   };
}

/* Create a query pool: compute the per-slot layout for the requested query
 * type, allocate the pool object (plus KHR perf metadata when needed) and a
 * CPU-mapped, snooped BO to hold the slots.  For KHR performance queries we
 * also pre-bake a tiny self-contained batch per pass that loads
 * ANV_PERF_QUERY_OFFSET_REG with the pass offset.
 */
VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = device->physical;
#if GFX_VER >= 8
   const VkQueryPoolPerformanceCreateInfoKHR *perf_query_info = NULL;
   struct intel_perf_counter_pass *counter_pass;
   struct intel_perf_query_info **pass_query;
   uint32_t n_passes = 0;
#endif
   uint32_t data_offset = 0;
   VK_MULTIALLOC(ma);
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. For most query types have the first 64-bit value is
    * the "available" bit which is 0 when the query is unavailable and 1 when
    * it is available. The 64-bit values that follow are determined by the
    * type of query.
    *
    * For performance queries, we have a requirement to align OA reports at
    * 64bytes so we put those first and have the "available" bit behind
    * together with some other counters.
    */
   uint32_t uint64s_per_slot = 0;

   VK_MULTIALLOC_DECL(&ma, struct anv_query_pool, pool, 1);

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot = 1 + 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot = 1 + 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have a min and max for every statistic */
      uint64s_per_slot = 1 + 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values, begin/end for
       * written/available.
       */
      uint64s_per_slot = 1 + 4;
      break;
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      const struct intel_perf_query_field_layout *layout =
         &pdevice->perf->query_layout;

      uint64s_per_slot = 2; /* availability + marker */
      /* Align to the requirement of the layout */
      uint64s_per_slot = align_u32(uint64s_per_slot,
                                   DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
      data_offset = uint64s_per_slot * sizeof(uint64_t);
      /* Add the query data for begin & end commands */
      uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
      break;
   }
#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      const struct intel_perf_query_field_layout *layout =
         &pdevice->perf->query_layout;

      perf_query_info = vk_find_struct_const(pCreateInfo->pNext,
                                             QUERY_POOL_PERFORMANCE_CREATE_INFO_KHR);
      n_passes = intel_perf_get_n_passes(pdevice->perf,
                                         perf_query_info->pCounterIndices,
                                         perf_query_info->counterIndexCount,
                                         NULL);
      /* Per-counter pass mapping and per-pass query info arrays are
       * co-allocated with the pool object through the multialloc.
       */
      vk_multialloc_add(&ma, &counter_pass, struct intel_perf_counter_pass,
                        perf_query_info->counterIndexCount);
      vk_multialloc_add(&ma, &pass_query, struct intel_perf_query_info *,
                        n_passes);
      uint64s_per_slot = 4 /* availability + small batch */;
      /* Align to the requirement of the layout */
      uint64s_per_slot = align_u32(uint64s_per_slot,
                                   DIV_ROUND_UP(layout->alignment, sizeof(uint64_t)));
      data_offset = uint64s_per_slot * sizeof(uint64_t);
      /* Add the query data for begin & end commands */
      uint64s_per_slot += 2 * DIV_ROUND_UP(layout->size, sizeof(uint64_t));
      /* Multiply by the number of passes */
      uint64s_per_slot *= n_passes;
      break;
   }
#endif
   default:
      assert(!"Invalid query type");
   }

   if (!vk_object_multialloc(&device->vk, &ma, pAllocator,
                             VK_OBJECT_TYPE_QUERY_POOL))
      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL) {
      pool->data_offset = data_offset;
      /* Half of the data area is the begin snapshot, the other half the end
       * snapshot (see intel_perf_query_data_offset()).
       */
      pool->snapshot_size = (pool->stride - data_offset) / 2;
   }
#if GFX_VER >= 8
   else if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      pool->pass_size = pool->stride / n_passes;
      pool->data_offset = data_offset;
      pool->snapshot_size = (pool->pass_size - data_offset) / 2;
      pool->n_counters = perf_query_info->counterIndexCount;
      pool->counter_pass = counter_pass;
      intel_perf_get_counters_passes(pdevice->perf,
                                     perf_query_info->pCounterIndices,
                                     perf_query_info->counterIndexCount,
                                     pool->counter_pass);
      pool->n_passes = n_passes;
      pool->pass_query = pass_query;
      intel_perf_get_n_passes(pdevice->perf,
                              perf_query_info->pCounterIndices,
                              perf_query_info->counterIndexCount,
                              pool->pass_query);
   }
#endif

   /* Mapped so the CPU can read results directly, snooped so those reads are
    * coherent with GPU writes.
    */
   uint64_t size = pool->slots * (uint64_t)pool->stride;
   result = anv_device_alloc_bo(device, "query-pool", size,
                                ANV_BO_ALLOC_MAPPED |
                                ANV_BO_ALLOC_SNOOPED,
                                0 /* explicit_address */,
                                &pool->bo);
   if (result != VK_SUCCESS)
      goto fail;

#if GFX_VER >= 8
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      /* Write one small preamble batch per pass directly into the pool BO;
       * it loads ANV_PERF_QUERY_OFFSET_REG with that pass' byte offset and
       * ends with MI_BATCH_BUFFER_END.
       */
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         struct mi_builder b;
         struct anv_batch batch = {
            .start = pool->bo->map + khr_perf_query_preamble_offset(pool, p),
            .end = pool->bo->map + khr_perf_query_preamble_offset(pool, p) + pool->data_offset,
         };
         batch.next = batch.start;

         mi_builder_init(&b, &device->info, &batch);
         mi_store(&b, mi_reg64(ANV_PERF_QUERY_OFFSET_REG),
                      mi_imm(p * (uint64_t)pool->pass_size));
         anv_batch_emit(&batch, GENX(MI_BATCH_BUFFER_END), bbe);
      }
   }
#endif

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->vk.alloc, pAllocator, pool);

   return result;
}

/* Release the pool's BO and free the pool object (and, via multialloc, the
 * KHR perf arrays co-allocated with it).
 */
void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_device_release_bo(device, pool->bo);
   vk_object_free(&device->vk, pAllocator, pool);
}

#if GFX_VER >= 8
/**
 * VK_KHR_performance_query layout :
 *
 * --------------------------------------------
 * | availability (8b)             |          |
 * |-------------------------------|          |
 * | Small batch loading           |          |
 * |  ANV_PERF_QUERY_OFFSET_REG    |          |
 * |  (24b)                        |  Pass 0  |
 * |-------------------------------|          |
 * | some padding (see             |          |
 * |  query_field_layout:alignment)|          |
 * |-------------------------------|          |
 * | query data                    |          |
 * |  (2 * query_field_layout:size)|          |
 * |-------------------------------|--  Query 0
 * | availability (8b)             |          |
 * |-------------------------------|          |
 * | Small batch loading           |          |
 * |  ANV_PERF_QUERY_OFFSET_REG    |          |
 * |  (24b)                        |  Pass 1  |
 * |-------------------------------|          |
 * | some padding (see             |          |
 * |  query_field_layout:alignment)|          |
 * |-------------------------------|          |
 * | query data                    |          |
 * |  (2 * query_field_layout:size)|          |
 * |-------------------------------|-----------
 * | availability (8b)             |          |
 * |-------------------------------|          |
 * | Small batch loading           |          |
 * |  ANV_PERF_QUERY_OFFSET_REG    |          |
 * |  (24b)                        |  Pass 0  |
 * |-------------------------------|          |
 * | some padding (see             |          |
 * |  query_field_layout:alignment)|          |
 * |-------------------------------|          |
 * | query data                    |          |
 * |  (2 * query_field_layout:size)|          |
 * |-------------------------------|--  Query 1
 * | ...
 * --------------------------------------------
 */

/* Byte offset of the availability qword for (query, pass). */
static uint64_t
khr_perf_query_availability_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
{
   return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size;
}

/* Byte offset of the begin (end == false) or end (end == true) snapshot data
 * for (query, pass).
 */
static uint64_t
khr_perf_query_data_offset(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
{
   return query * (uint64_t)pool->stride + pass * (uint64_t)pool->pass_size +
          pool->data_offset + (end ? pool->snapshot_size : 0);
}

/* Same as khr_perf_query_availability_offset() but as an anv_address. */
static struct anv_address
khr_perf_query_availability_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass)
{
   return anv_address_add(
      (struct anv_address) { .bo = pool->bo, },
      khr_perf_query_availability_offset(pool, query, pass));
}

/* Same as khr_perf_query_data_offset() but as an anv_address. */
static struct anv_address
khr_perf_query_data_address(struct anv_query_pool *pool, uint32_t query, uint32_t pass, bool end)
{
   return anv_address_add(
      (struct anv_address) { .bo = pool->bo, },
      khr_perf_query_data_offset(pool, query, pass, end));
}

/* Lazily allocate the self-modifying-batch relocation array on the command
 * buffer (n_perf_query_commands entries).  Returns false and flags the batch
 * with VK_ERROR_OUT_OF_HOST_MEMORY on allocation failure, or false if the
 * batch already carries an error.
 */
static bool
khr_perf_query_ensure_relocs(struct anv_cmd_buffer *cmd_buffer)
{
   if (anv_batch_has_error(&cmd_buffer->batch))
      return false;

   if (cmd_buffer->self_mod_locations)
      return true;

   struct anv_device *device = cmd_buffer->device;
   const struct anv_physical_device *pdevice = device->physical;

   cmd_buffer->self_mod_locations =
      vk_alloc(&cmd_buffer->pool->alloc,
               pdevice->n_perf_query_commands * sizeof(*cmd_buffer->self_mod_locations), 8,
               VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);

   if (!cmd_buffer->self_mod_locations) {
      anv_batch_set_error(&cmd_buffer->batch, VK_ERROR_OUT_OF_HOST_MEMORY);
      return false;
   }

   return true;
}
#endif

/**
 * VK_INTEL_performance_query layout :
 *
 * ---------------------------------
 * | availability (8b)             |
 * |-------------------------------|
 * | marker (8b)                   |
 * |-------------------------------|
 * | some padding (see             |
 * |  query_field_layout:alignment)|
 * |-------------------------------|
 * | query data                    |
 * |  (2 * query_field_layout:size)|
 * ---------------------------------
 */

/* Byte offset of the marker qword within an INTEL perf query slot (right
 * after the 8-byte availability qword, see layout above).
 */
static uint32_t
intel_perf_marker_offset(void)
{
   return 8;
}

/* Byte offset of the begin/end snapshot within an INTEL perf query slot. */
static uint32_t
intel_perf_query_data_offset(struct anv_query_pool *pool, bool end)
{
   return pool->data_offset + (end ? pool->snapshot_size : 0);
}

/* Store one result value into the destination slot as either a 32-bit or
 * 64-bit integer depending on VK_QUERY_RESULT_64_BIT.
 */
static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

/* CPU pointer to the start of the given query slot in the mapped pool BO. */
static void *
query_slot(struct anv_query_pool *pool, uint32_t query)
{
   return pool->bo->map + query * pool->stride;
}

/* Read the availability qword(s) from the mapped BO.  KHR perf queries are
 * only available once every pass has its availability set; every other type
 * has a single availability qword at the start of the slot.  volatile because
 * the GPU writes these while we poll.
 */
static bool
query_is_available(struct anv_query_pool *pool, uint32_t query)
{
#if GFX_VER >= 8
   if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
      for (uint32_t p = 0; p < pool->n_passes; p++) {
         volatile uint64_t *slot =
            pool->bo->map + khr_perf_query_availability_offset(pool, query, p);
         if (!slot[0])
            return false;
      }
      return true;
   }
#endif

   return *(volatile uint64_t *)query_slot(pool, query);
}

/* Busy-wait (up to 2 seconds) for the query to become available, checking the
 * device status each iteration; marks the device lost on timeout.
 */
static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint32_t query)
{
   uint64_t abs_timeout = anv_get_absolute_timeout(2 * NSEC_PER_SEC);

   while (anv_gettime_ns() < abs_timeout) {
      if (query_is_available(pool, query))
         return VK_SUCCESS;
      VkResult status = anv_device_query_status(device);
      if (status != VK_SUCCESS)
         return status;
   }

   return anv_device_set_lost(device, "query timeout");
}

/* CPU-side readback of query results from the snooped pool BO. */
VkResult genX(GetQueryPoolResults)(
    VkDevice
                                                _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR ||
          pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      bool available = query_is_available(pool, firstQuery + i);

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, firstQuery + i);
         if (status != VK_SUCCESS) {
            return status;
         }

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the call,
       *    and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries if
       *    VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       *
       * From VK_KHR_performance_query :
       *
       *    "VK_QUERY_RESULT_PERFORMANCE_QUERY_RECORDED_COUNTERS_BIT_KHR specifies
       *     that the result should contain the number of counters that were recorded
       *     into a query pool of type ename:VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR"
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      /* idx counts the number of result values written so far for this query;
       * the availability value (if requested) is written right after them.
       */
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results) {
            /* From the Vulkan 1.2.132 spec:
             *
             *    "If VK_QUERY_RESULT_PARTIAL_BIT is set,
             *    VK_QUERY_RESULT_WAIT_BIT is not set, and the query’s status
             *    is unavailable, an intermediate result value between zero and
             *    the final result value is written to pData for that query."
             */
            /* slot layout: [0] availability, [1] begin, [2] end */
            uint64_t result = available ? slot[2] - slot[1] : 0;
            cpu_write_query_result(pData, flags, idx, result);
         }
         idx++;
         break;
      }

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               /* slot layout: [0] availability, then begin/end pairs per
                * enabled statistic.
                */
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.ver == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         /* slot layout: [0] availability, [1]/[2] begin/end primitives
          * written, [3]/[4] begin/end primitives needed.
          */
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;
      }

      case VK_QUERY_TYPE_TIMESTAMP: {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;
      }

#if GFX_VER >= 8
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
         const struct anv_physical_device *pdevice = device->physical;
         /* These flags are not supported for KHR performance queries. */
         assert((flags & (VK_QUERY_RESULT_WITH_AVAILABILITY_BIT |
                          VK_QUERY_RESULT_PARTIAL_BIT)) == 0);
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            const struct intel_perf_query_info *query = pool->pass_query[p];
            struct intel_perf_query_result result;
            intel_perf_query_result_clear(&result);
            intel_perf_query_result_accumulate_fields(&result, query, &device->info,
                                                      pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, false),
                                                      pool->bo->map + khr_perf_query_data_offset(pool, firstQuery + i, p, true),
                                                      false /* no_oa_accumulate */);
            anv_perf_write_pass_results(pdevice->perf, pool, p, &result, pData);
         }
         break;
      }
#endif

      case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
         if (!write_results)
            break;
         const void *query_data = query_slot(pool, firstQuery + i);
         const struct intel_perf_query_info *query = &device->physical->perf->queries[0];
         struct intel_perf_query_result result;
         intel_perf_query_result_clear(&result);
         intel_perf_query_result_accumulate_fields(&result, query, &device->info,
                                                   query_data + intel_perf_query_data_offset(pool, false),
                                                   query_data + intel_perf_query_data_offset(pool, true),
                                                   false /* no_oa_accumulate */);
         intel_perf_query_result_write_mdapi(pData, stride,
                                             &device->info,
                                             query, &result);
         const uint64_t *marker = query_data + intel_perf_marker_offset();
         intel_perf_query_mdapi_write_marker(pData, stride, &device->info, *marker);
         break;
      }

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}

/* Emit a depth-stalled PIPE_CONTROL that writes the current PS depth count
 * (visible sample count) to addr.
 */
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WritePSDepthCount;
      pc.DepthStallEnable        = true;
      pc.Address                 = addr;

      /* NOTE(review): extra CS stall on GFX9 GT4 — presumably a hardware
       * workaround; confirm against the PRM / original commit.
       */
      if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

/* Write the availability qword at addr via an MI store (command streamer). */
static void
emit_query_mi_availability(struct mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   mi_store(b, mi_mem64(addr), mi_imm(available));
}

/* Write the availability qword at addr via a PIPE_CONTROL post-sync immediate
 * write (used for query types whose values are also written by PIPE_CONTROL).
 */
static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
   genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType  = DAT_PPGTT;
      pc.PostSyncOperation       = WriteImmediateData;
      pc.Address                 = addr;
      pc.ImmediateData           = available;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL so clear them using the
       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
       * of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         /* Zero every qword after the availability one, then mark the slot
          * available last.
          */
         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      for (uint32_t i = 0; i < num_queries; i++) {
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            /* Zero both begin & end snapshots of this pass, then mark the
             * pass available.
             */
            mi_memset(b, khr_perf_query_data_address(pool, first_index + i, p, false),
                      0, 2 * pool->snapshot_size);
            emit_query_mi_availability(b,
                                       khr_perf_query_availability_address(pool, first_index + i, p),
                                       true);
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}

/* GPU-side reset: clear the availability of the given query range, using the
 * same write path (PIPE_CONTROL vs MI store) as the query type itself.
 */
void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      /* Each pass has its own availability qword to clear. */
      for (uint32_t i = 0; i < queryCount; i++) {
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            emit_query_mi_availability(
               &b,
               khr_perf_query_availability_address(pool, firstQuery + i, p),
               false);
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      struct mi_builder b;
      mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}

/* Host-side reset (VK_EXT_host_query_reset / Vulkan 1.2): clear availability
 * directly through the CPU mapping.
 */
void genX(ResetQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      if (pool->type == VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR) {
#if GFX_VER >= 8
         for (uint32_t p = 0; p < pool->n_passes; p++) {
            uint64_t *pass_slot = pool->bo->map +
               khr_perf_query_availability_offset(pool, firstQuery + i, p);
            *pass_slot = 0;
         }
#endif
      } else {
         uint64_t *slot = query_slot(pool, firstQuery + i);
         *slot = 0;
      }
   }
}

/* Maps each bit of VkQueryPipelineStatisticFlagBits (in bit order) to the
 * MMIO counter register that backs it.
 */
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

/* Snapshot one pipeline-statistics counter register into memory at addr. */
static void
emit_pipeline_stat(struct mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   mi_store(b, mi_mem64(addr), mi_reg64(vk_pipeline_stat_to_reg[stat]));
}

/* Snapshot the transform-feedback counters for the given stream:
 * primitives written at addr+0, primitives storage-needed at addr+16.
 */
static void
emit_xfb_query(struct mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   mi_store(b, mi_mem64(anv_address_add(addr, 0)),
                mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   mi_store(b, mi_mem64(anv_address_add(addr, 16)),
                mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}

/* Capture an INTEL perf query snapshot (begin or end) into the query slot by
 * walking the perf query field layout and emitting one MI_REPORT_PERF_COUNT
 * or register store per field.
 */
static void
emit_perf_intel_query(struct anv_cmd_buffer *cmd_buffer,
                      struct anv_query_pool *pool,
                      struct mi_builder *b,
                      struct anv_address query_addr,
                      bool end)
{
   const struct intel_perf_query_field_layout *layout =
      &cmd_buffer->device->physical->perf->query_layout;
   struct anv_address data_addr =
      anv_address_add(query_addr, intel_perf_query_data_offset(pool, end));
848 for (uint32_t f = 0; f < layout->n_fields; f++) { 849 const struct intel_perf_query_field *field = 850 &layout->fields[end ? f : (layout->n_fields - 1 - f)]; 851 852 switch (field->type) { 853 case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC: 854 anv_batch_emit(&cmd_buffer->batch, GENX(MI_REPORT_PERF_COUNT), rpc) { 855 rpc.MemoryAddress = anv_address_add(data_addr, field->location); 856 } 857 break; 858 859 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT: 860 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT: 861 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B: 862 case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C: { 863 struct anv_address addr = anv_address_add(data_addr, field->location); 864 struct mi_value src = field->size == 8 ? 865 mi_reg64(field->mmio_offset) : 866 mi_reg32(field->mmio_offset); 867 struct mi_value dst = field->size == 8 ? 868 mi_mem64(addr) : mi_mem32(addr); 869 mi_store(b, dst, src); 870 break; 871 } 872 873 default: 874 unreachable("Invalid query field"); 875 break; 876 } 877 } 878} 879 880void genX(CmdBeginQuery)( 881 VkCommandBuffer commandBuffer, 882 VkQueryPool queryPool, 883 uint32_t query, 884 VkQueryControlFlags flags) 885{ 886 genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0); 887} 888 889void genX(CmdBeginQueryIndexedEXT)( 890 VkCommandBuffer commandBuffer, 891 VkQueryPool queryPool, 892 uint32_t query, 893 VkQueryControlFlags flags, 894 uint32_t index) 895{ 896 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 897 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 898 struct anv_address query_addr = anv_query_address(pool, query); 899 900 struct mi_builder b; 901 mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch); 902 903 switch (pool->type) { 904 case VK_QUERY_TYPE_OCCLUSION: 905 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8)); 906 break; 907 908 case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 909 /* TODO: This might only be necessary for certain stats */ 910 anv_batch_emit(&cmd_buffer->batch, 
GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      /* Snapshot the begin value of every enabled statistic.  Each counter
       * occupies a begin/end pair of 64-bit slots after the availability
       * word, so begin values land at offset 8 + 16 * i.
       */
      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      if (!khr_perf_query_ensure_relocs(cmd_buffer))
         return;

      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;

      /* Self-modifying batch: the MI commands emitted further down carry
       * placeholder MemoryAddress values.  For each field we compute the
       * real destination (pool BO offset + per-field data offset, biased by
       * the runtime value of ANV_PERF_QUERY_OFFSET_REG) and store it over
       * the placeholder; mi_store_address() returns a token that is resolved
       * later with _mi_resolve_address_token().
       *
       * Both halves of the query are covered here: end == 0 records the
       * begin locations in reverse field order (matching the emission loop
       * below), end == 1 records the end locations in forward order
       * (matching genX(CmdEndQueryIndexedEXT)).
       */
      uint32_t reloc_idx = 0;
      for (uint32_t end = 0; end < 2; end++) {
         for (uint32_t r = 0; r < layout->n_fields; r++) {
            const struct intel_perf_query_field *field =
               &layout->fields[end ? r : (layout->n_fields - 1 - r)];
            struct mi_value reg_addr =
               mi_iadd(
                  &b,
                  mi_imm(intel_canonical_address(pool->bo->offset +
                                                 khr_perf_query_data_offset(pool, query, 0, end) +
                                                 field->location)),
                  mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
            cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);

            /* 64-bit counters read via MI_STORE_REGISTER_MEM need a second
             * patch location for the high dword.
             */
            if (field->type != INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC &&
                field->size == 8) {
               reg_addr =
                  mi_iadd(
                     &b,
                     mi_imm(intel_canonical_address(pool->bo->offset +
                                                    khr_perf_query_data_offset(pool, query, 0, end) +
                                                    field->location + 4)),
                     mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
               cmd_buffer->self_mod_locations[reloc_idx++] = mi_store_address(&b, reg_addr);
            }
         }
      }

      /* Last relocation: where genX(CmdEndQueryIndexedEXT) will write the
       * availability value.
       */
      struct mi_value availability_write_offset =
         mi_iadd(
            &b,
            mi_imm(
               intel_canonical_address(
                  pool->bo->offset +
                  khr_perf_query_availability_offset(pool, query, 0 /* pass */))),
            mi_reg64(ANV_PERF_QUERY_OFFSET_REG));
      cmd_buffer->self_mod_locations[reloc_idx++] =
         mi_store_address(&b, availability_write_offset);

      assert(reloc_idx == pdevice->n_perf_query_commands);

      /* Make sure the address stores above have landed before the patched
       * commands below execute.
       */
      mi_self_mod_barrier(&b);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      cmd_buffer->perf_query_pool = pool;

      /* Emit the begin snapshots in reverse field order, consuming the
       * end == 0 relocations recorded above.  perf_reloc_idx carries on
       * into genX(CmdEndQueryIndexedEXT), which consumes the rest.
       */
      cmd_buffer->perf_reloc_idx = 0;
      for (uint32_t r = 0; r < layout->n_fields; r++) {
         const struct intel_perf_query_field *field =
            &layout->fields[layout->n_fields - 1 - r];
         void *dws;

         switch (field->type) {
         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
            dws = anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_REPORT_PERF_COUNT_length),
                                  GENX(MI_REPORT_PERF_COUNT),
                                  .MemoryAddress = query_addr /* Will be overwritten */);
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
            break;

         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
            dws =
               anv_batch_emitn(&cmd_buffer->batch,
                               GENX(MI_STORE_REGISTER_MEM_length),
                               GENX(MI_STORE_REGISTER_MEM),
                               .RegisterAddress = field->mmio_offset,
                               .MemoryAddress = query_addr /* Will be overwritten */ );
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            if (field->size == 8) {
               /* Second SRM for the high dword of a 64-bit register. */
               dws =
                  anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_STORE_REGISTER_MEM_length),
                                  GENX(MI_STORE_REGISTER_MEM),
                                  .RegisterAddress = field->mmio_offset + 4,
                                  .MemoryAddress = query_addr /* Will be overwritten */ );
               _mi_resolve_address_token(&b,
                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                         dws +
                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            }
            break;

         default:
            unreachable("Invalid query field");
            break;
         }
      }
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, false);
      break;
   }

   default:
      unreachable("");
   }
}

/* vkCmdEndQuery: simply the non-indexed alias of the indexed entry point. */
void genX(CmdEndQuery)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t query,
    uint32_t index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer,
cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* End depth count lands at +16 (the begin value sits at +8), then the
       * slot is flagged available via a PIPE_CONTROL post-sync write.
       */
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      /* End values go at offset 16 + 16 * i, right after each begin value. */
      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

#if GFX_VER >= 8
   case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      cmd_buffer->perf_query_pool = pool;

      if (!khr_perf_query_ensure_relocs(cmd_buffer))
         return;

      const struct anv_physical_device *pdevice = cmd_buffer->device->physical;
      const struct intel_perf_query_field_layout *layout = &pdevice->perf->query_layout;

      /* Emit the end snapshots.  perf_reloc_idx continues from
       * genX(CmdBeginQueryIndexedEXT): the end-half patch locations were
       * recorded in forward field order, so iterate forward here.  Each MI
       * command below has its placeholder MemoryAddress patched through the
       * previously recorded self-modification token.
       */
      void *dws;
      for (uint32_t r = 0; r < layout->n_fields; r++) {
         const struct intel_perf_query_field *field = &layout->fields[r];

         switch (field->type) {
         case INTEL_PERF_QUERY_FIELD_TYPE_MI_RPC:
            dws = anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_REPORT_PERF_COUNT_length),
                                  GENX(MI_REPORT_PERF_COUNT),
                                  .MemoryAddress = query_addr /* Will be overwritten */);
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_REPORT_PERF_COUNT_MemoryAddress_start) / 8);
            break;

         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_PERFCNT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_RPSTAT:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_B:
         case INTEL_PERF_QUERY_FIELD_TYPE_SRM_OA_C:
            dws =
               anv_batch_emitn(&cmd_buffer->batch,
                               GENX(MI_STORE_REGISTER_MEM_length),
                               GENX(MI_STORE_REGISTER_MEM),
                               .RegisterAddress = field->mmio_offset,
                               .MemoryAddress = query_addr /* Will be overwritten */ );
            _mi_resolve_address_token(&b,
                                      cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                      dws +
                                      GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            if (field->size == 8) {
               /* High dword of a 64-bit register. */
               dws =
                  anv_batch_emitn(&cmd_buffer->batch,
                                  GENX(MI_STORE_REGISTER_MEM_length),
                                  GENX(MI_STORE_REGISTER_MEM),
                                  .RegisterAddress = field->mmio_offset + 4,
                                  .MemoryAddress = query_addr /* Will be overwritten */ );
               _mi_resolve_address_token(&b,
                                         cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                         dws +
                                         GENX(MI_STORE_REGISTER_MEM_MemoryAddress_start) / 8);
            }
            break;

         default:
            unreachable("Invalid query field");
            break;
         }
      }

      /* Mark the query available through the last recorded patch location,
       * which points at the availability slot.
       */
      dws =
         anv_batch_emitn(&cmd_buffer->batch,
                         GENX(MI_STORE_DATA_IMM_length),
                         GENX(MI_STORE_DATA_IMM),
                         .ImmediateData = true);
      _mi_resolve_address_token(&b,
                                cmd_buffer->self_mod_locations[cmd_buffer->perf_reloc_idx++],
                                dws +
                                GENX(MI_STORE_DATA_IMM_Address_start) / 8);

      assert(cmd_buffer->perf_reloc_idx == pdevice->n_perf_query_commands);
      break;
   }
#endif

   case VK_QUERY_TYPE_PERFORMANCE_QUERY_INTEL: {
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      uint32_t marker_offset = intel_perf_marker_offset();
      mi_store(&b, mi_mem64(anv_address_add(query_addr, marker_offset)),
               mi_imm(cmd_buffer->intel_perf_marker));
      emit_perf_intel_query(cmd_buffer, pool, &b, query_addr, true);
      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

/* MMIO offset of the command streamer timestamp register, read below via
 * mi_reg64().
 */
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp2KHR)(
    VkCommandBuffer commandBuffer,
    VkPipelineStageFlags2KHR stage,
    VkQueryPool queryPool,
    uint32_t query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);

   if (stage == VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT_KHR) {
      /* Top-of-pipe: copy the TIMESTAMP register with an MI command, no
       * pipeline flush needed.
       */
      mi_store(&b, mi_mem64(anv_address_add(query_addr, 8)),
               mi_reg64(TIMESTAMP));
   } else {
      /* Everything else is bottom-of-pipe */
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_POST_SYNC_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);

      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = anv_address_add(query_addr, 8);

         /* NOTE(review): gen9 GT4 appears to need a CS stall alongside this
          * post-sync write — condition kept exactly as upstream.
          */
         if (GFX_VER == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows that we only write the results to one of the queries
    * but we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the
    * first index, mark the other query indices as being already available
    * with result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#if GFX_VERx10 >= 75

/* MI_PREDICATE source/result registers used for the conditional stores
 * below.
 */
#define MI_PREDICATE_SRC0 0x2400
#define MI_PREDICATE_SRC1 0x2408
#define MI_PREDICATE_RESULT 0x2418

/**
 * Writes the results of a query to dst_addr if the value at poll_addr is
 * equal to the reference value.
 *
 * Loads SRC0 = *poll_addr and SRC1 = ref_value into MI_PREDICATE; the
 * following predicated store only executes when they compare equal.  The
 * destination slot width (32/64-bit) follows VK_QUERY_RESULT_64_BIT.
 */
static void
gpu_write_query_result_cond(struct anv_cmd_buffer *cmd_buffer,
                            struct mi_builder *b,
                            struct anv_address poll_addr,
                            struct anv_address dst_addr,
                            uint64_t ref_value,
                            VkQueryResultFlags flags,
                            uint32_t value_index,
                            struct mi_value query_result)
{
   mi_store(b, mi_reg64(MI_PREDICATE_SRC0), mi_mem64(poll_addr));
   mi_store(b, mi_reg64(MI_PREDICATE_SRC1), mi_imm(ref_value));
   anv_batch_emit(&cmd_buffer->batch, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation = LOAD_LOAD;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      mi_store_if(b, mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      mi_store_if(b, mi_mem32(res_addr), query_result);
   }
}

/* Unconditional variant: write the value_index-th result at 32- or 64-bit
 * stride depending on VK_QUERY_RESULT_64_BIT.
 */
static void
gpu_write_query_result(struct mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr
= anv_address_add(dst_addr, value_index * 8);
      mi_store(b, mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      mi_store(b, mi_mem32(res_addr), query_result);
   }
}

/* Result = *(addr + 8) - *(addr + 0): the end/begin 64-bit snapshot pair. */
static struct mi_value
compute_query_result(struct mi_builder *b, struct anv_address addr)
{
   return mi_isub(b, mi_mem64(anv_address_add(addr, 8)),
                  mi_mem64(anv_address_add(addr, 0)));
}

void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct mi_builder b;
   mi_builder_init(&b, &cmd_buffer->device->info, &cmd_buffer->batch);
   struct mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT,
                                "CopyQueryPoolResults");
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        * From the vulkan spec:
        *
        *    "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *    previous uses of vkCmdResetQueryPool in the same queue, without
        *    any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      anv_add_pending_pipe_bits(cmd_buffer,
                                ANV_PIPE_CS_STALL_BIT,
                                "CopyQueryPoolResults");
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      /* idx counts the result values written for this query; the optional
       * availability value is appended after them.
       */
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         /* Like in the case of vkGetQueryPoolResults, if the query is
          * unavailable and the VK_QUERY_RESULT_PARTIAL_BIT flag is set,
          * conservatively write 0 as the query result. If the
          * VK_QUERY_RESULT_PARTIAL_BIT isn't set, don't write any value.
          */
         gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                                     1 /* available */, flags, idx, result);
         if (flags & VK_QUERY_RESULT_PARTIAL_BIT) {
            gpu_write_query_result_cond(cmd_buffer, &b, query_addr, dest_addr,
                                        0 /* unavailable */, flags, idx, mi_imm(0));
         }
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.ver == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         /* Two result values per query, computed from the snapshot pairs at
          * +8 and +24.  NOTE(review): exact slot semantics inferred from the
          * offsets — confirm against the begin/end emission paths.
          */
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         /* Timestamps are copied directly, not end-begin deltas. */
         result = mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

#if GFX_VER >= 8
      case VK_QUERY_TYPE_PERFORMANCE_QUERY_KHR:
         unreachable("Copy KHR performance query results not implemented");
         break;
#endif

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         /* Availability is the first 64-bit slot of the query. */
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
/* Pre-HSW path: not implemented (see anv_finishme below). */
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer commandBuffer,
    VkQueryPool queryPool,
    uint32_t firstQuery,
    uint32_t queryCount,
    VkBuffer destBuffer,
    VkDeviceSize destOffset,
    VkDeviceSize destStride,
    VkQueryResultFlags flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif