genX_query.c revision 9f464c52
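/*
 * Query pool support for the Intel "anv" Vulkan driver: pool creation and
 * destruction, host- and command-buffer-side resets, begin/end for
 * occlusion, pipeline statistics and transform feedback queries, timestamp
 * writes, and CPU- and GPU-side result read-back.
 */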
/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <assert.h>
#include <stdbool.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>

#include "anv_private.h"

#include "genxml/gen_macros.h"
#include "genxml/genX_pack.h"

/* We reserve GPR 14 and 15 for conditional rendering */
#define GEN_MI_BUILDER_NUM_ALLOC_GPRS 14
#define __gen_get_batch_dwords anv_batch_emit_dwords
#define __gen_address_offset anv_address_add
#include "common/gen_mi_builder.h"

VkResult genX(CreateQueryPool)(
    VkDevice                                    _device,
    const VkQueryPoolCreateInfo*                pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkQueryPool*                                pQueryPool)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *pdevice = &device->instance->physicalDevice;
   struct anv_query_pool *pool;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO);

   /* Query pool slots are made up of some number of 64-bit values packed
    * tightly together. The first 64-bit value is always the "available" bit
    * which is 0 when the query is unavailable and 1 when it is available.
    * The 64-bit values that follow are determined by the type of query.
    */
   uint32_t uint64s_per_slot = 1;

   VkQueryPipelineStatisticFlags pipeline_statistics = 0;
   switch (pCreateInfo->queryType) {
   case VK_QUERY_TYPE_OCCLUSION:
      /* Occlusion queries have two values: begin and end. */
      uint64s_per_slot += 2;
      break;
   case VK_QUERY_TYPE_TIMESTAMP:
      /* Timestamps just have the one timestamp value */
      uint64s_per_slot += 1;
      break;
   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
      pipeline_statistics = pCreateInfo->pipelineStatistics;
      /* We're going to trust this field implicitly so we need to ensure that
       * no unhandled extension bits leak in.
       */
      pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK;

      /* Statistics queries have begin and end values for every statistic */
      uint64s_per_slot += 2 * util_bitcount(pipeline_statistics);
      break;
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      /* Transform feedback queries are 4 values: begin/end for primitives
       * written and begin/end for primitive storage needed.
       */
      uint64s_per_slot += 4;
      break;
   default:
      assert(!"Invalid query type");
   }
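   /* For reference, the slot layouts implied by the sizing above, one
    * 64-bit value per cell (a sketch derived from the read-back logic in
    * GetQueryPoolResults and the begin/end emission below):
    *
    *    occlusion:  [ avail | depth count begin | depth count end ]
    *    timestamp:  [ avail | timestamp ]
    *    statistics: [ avail | stat0 begin | stat0 end | stat1 begin | ... ]
    *    xfb:        [ avail | prims written begin | prims written end
    *                        | prims needed begin  | prims needed end ]
    */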
   pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8,
                    VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pool == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pool->type = pCreateInfo->queryType;
   pool->pipeline_statistics = pipeline_statistics;
   pool->stride = uint64s_per_slot * sizeof(uint64_t);
   pool->slots = pCreateInfo->queryCount;

   uint64_t size = pool->slots * pool->stride;
   result = anv_bo_init_new(&pool->bo, device, size);
   if (result != VK_SUCCESS)
      goto fail;

   if (pdevice->supports_48bit_addresses)
      pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

   if (pdevice->use_softpin)
      pool->bo.flags |= EXEC_OBJECT_PINNED;

   if (pdevice->has_exec_async)
      pool->bo.flags |= EXEC_OBJECT_ASYNC;

   anv_vma_alloc(device, &pool->bo);

   /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC
    * platforms, this does nothing. On non-LLC platforms, this means snooping
    * which comes at a slight cost. However, the buffers aren't big, won't be
    * written frequently, and trying to handle the flushing manually without
    * doing too much flushing is extremely painful.
    */
   anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED);

   pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0);

   *pQueryPool = anv_query_pool_to_handle(pool);

   return VK_SUCCESS;

 fail:
   vk_free2(&device->alloc, pAllocator, pool);

   return result;
}

void genX(DestroyQueryPool)(
    VkDevice                                    _device,
    VkQueryPool                                 _pool,
    const VkAllocationCallbacks*                pAllocator)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, _pool);

   if (!pool)
      return;

   anv_gem_munmap(pool->bo.map, pool->bo.size);
   anv_vma_free(device, &pool->bo);
   anv_gem_close(device, pool->bo.gem_handle);
   vk_free2(&device->alloc, pAllocator, pool);
}

static struct anv_address
anv_query_address(struct anv_query_pool *pool, uint32_t query)
{
   return (struct anv_address) {
      .bo = &pool->bo,
      .offset = query * pool->stride,
   };
}

static void
cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags,
                       uint32_t value_index, uint64_t result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      uint64_t *dst64 = dst_slot;
      dst64[value_index] = result;
   } else {
      uint32_t *dst32 = dst_slot;
      dst32[value_index] = result;
   }
}

static bool
query_is_available(uint64_t *slot)
{
   return *(volatile uint64_t *)slot;
}

static VkResult
wait_for_available(struct anv_device *device,
                   struct anv_query_pool *pool, uint64_t *slot)
{
   while (true) {
      if (query_is_available(slot))
         return VK_SUCCESS;

      int ret = anv_gem_busy(device, pool->bo.gem_handle);
      if (ret == 1) {
         /* The BO is still busy, keep waiting. */
         continue;
      } else if (ret == -1) {
         /* We don't know the real error. */
         return anv_device_set_lost(device, "gem wait failed: %m");
      } else {
         assert(ret == 0);
         /* The BO is no longer busy. */
         if (query_is_available(slot)) {
            return VK_SUCCESS;
         } else {
            VkResult status = anv_device_query_status(device);
            if (status != VK_SUCCESS)
               return status;

            /* If we haven't seen availability yet, then we never will. This
             * can only happen if we have a client error where they call
             * GetQueryPoolResults on a query that they haven't submitted to
             * the GPU yet. The spec allows us to do anything in this case,
             * but returning VK_SUCCESS doesn't seem right and we shouldn't
             * just keep spinning.
             */
            return VK_NOT_READY;
         }
      }
   }
}
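/* Note that polling the availability qword from the CPU like this is only
 * coherent with GPU writes because CreateQueryPool set the BO to
 * I915_CACHING_CACHED (snooped on non-LLC platforms); no manual flushing
 * is required here.
 */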
VkResult genX(GetQueryPoolResults)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    size_t                                      dataSize,
    void*                                       pData,
    VkDeviceSize                                stride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   assert(pool->type == VK_QUERY_TYPE_OCCLUSION ||
          pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS ||
          pool->type == VK_QUERY_TYPE_TIMESTAMP ||
          pool->type == VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT);

   if (anv_device_is_lost(device))
      return VK_ERROR_DEVICE_LOST;

   if (pData == NULL)
      return VK_SUCCESS;

   void *data_end = pData + dataSize;

   VkResult status = VK_SUCCESS;
   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;

      /* Availability is always at the start of the slot */
      bool available = slot[0];

      if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) {
         status = wait_for_available(device, pool, slot);
         if (status != VK_SUCCESS)
            return status;

         available = true;
      }

      /* From the Vulkan 1.0.42 spec:
       *
       *    "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are
       *    both not set then no result values are written to pData for
       *    queries that are in the unavailable state at the time of the
       *    call, and vkGetQueryPoolResults returns VK_NOT_READY. However,
       *    availability state is still written to pData for those queries
       *    if VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set."
       */
      bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT);

      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);
            if (write_results) {
               uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1];

               /* WaDividePSInvocationCountBy4:HSW,BDW */
               if ((device->info.gen == 8 || device->info.is_haswell) &&
                   (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT)
                  result >>= 2;

               cpu_write_query_result(pData, flags, idx, result);
            }
            idx++;
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]);
         idx++;
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[4] - slot[3]);
         idx++;
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         if (write_results)
            cpu_write_query_result(pData, flags, idx, slot[1]);
         idx++;
         break;

      default:
         unreachable("invalid pool type");
      }

      if (!write_results)
         status = VK_NOT_READY;

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT)
         cpu_write_query_result(pData, flags, idx, available);

      pData += stride;
      if (pData >= data_end)
         break;
   }

   return status;
}
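/* A worked example of the layout written above (illustrative only): for a
 * pipeline statistics pool with two statistics enabled and
 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT set, each query writes three
 * values at its stride offset into pData:
 *
 *    value 0: stat0 (end - begin)
 *    value 1: stat1 (end - begin)
 *    value 2: availability
 *
 * Each value is 32 bits unless VK_QUERY_RESULT_64_BIT is set (see
 * cpu_write_query_result above).
 */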
static void
emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer,
                    struct anv_address addr)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WritePSDepthCount;
      pc.DepthStallEnable = true;
      pc.Address = addr;

      if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
         pc.CommandStreamerStallEnable = true;
   }
}

static void
emit_query_mi_availability(struct gen_mi_builder *b,
                           struct anv_address addr,
                           bool available)
{
   gen_mi_store(b, gen_mi_mem64(addr), gen_mi_imm(available));
}

static void
emit_query_pc_availability(struct anv_cmd_buffer *cmd_buffer,
                           struct anv_address addr,
                           bool available)
{
   anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
      pc.DestinationAddressType = DAT_PPGTT;
      pc.PostSyncOperation = WriteImmediateData;
      pc.Address = addr;
      pc.ImmediateData = available;
   }
}

/**
 * Goes through a series of consecutive query indices in the given pool
 * setting all element values to 0 and emitting them as available.
 */
static void
emit_zero_queries(struct anv_cmd_buffer *cmd_buffer,
                  struct gen_mi_builder *b, struct anv_query_pool *pool,
                  uint32_t first_index, uint32_t num_queries)
{
   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      /* These queries are written with a PIPE_CONTROL so clear them using the
       * PIPE_CONTROL as well so we don't have to synchronize between 2 types
       * of operations.
       */
      assert((pool->stride % 8) == 0);
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);

         for (uint32_t qword = 1; qword < (pool->stride / 8); qword++) {
            emit_query_pc_availability(cmd_buffer,
                                       anv_address_add(slot_addr, qword * 8),
                                       false);
         }
         emit_query_pc_availability(cmd_buffer, slot_addr, true);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      for (uint32_t i = 0; i < num_queries; i++) {
         struct anv_address slot_addr =
            anv_query_address(pool, first_index + i);
         gen_mi_memset(b, anv_address_add(slot_addr, 8), 0, pool->stride - 8);
         emit_query_mi_availability(b, slot_addr, true);
      }
      break;

   default:
      unreachable("Unsupported query type");
   }
}
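/* Two mechanisms write query data in this file: PIPE_CONTROL post-sync
 * writes from the 3D pipe (occlusion and timestamp pools) and MI commands
 * from the command streamer (statistics and transform feedback pools).
 * emit_zero_queries clears each pool with the same mechanism that writes
 * it, so the clears are naturally ordered against the query writes without
 * extra synchronization between the two paths.
 */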
void genX(CmdResetQueryPool)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
   case VK_QUERY_TYPE_TIMESTAMP:
      for (uint32_t i = 0; i < queryCount; i++) {
         emit_query_pc_availability(cmd_buffer,
                                    anv_query_address(pool, firstQuery + i),
                                    false);
      }
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS:
   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT: {
      struct gen_mi_builder b;
      gen_mi_builder_init(&b, &cmd_buffer->batch);

      for (uint32_t i = 0; i < queryCount; i++)
         emit_query_mi_availability(&b, anv_query_address(pool, firstQuery + i), false);
      break;
   }

   default:
      unreachable("Unsupported query type");
   }
}

void genX(ResetQueryPoolEXT)(
    VkDevice                                    _device,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount)
{
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);

   for (uint32_t i = 0; i < queryCount; i++) {
      uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride;
      *slot = 0;
   }
}
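/* ResetQueryPoolEXT above is the VK_EXT_host_query_reset entry point.
 * Clearing just the availability qword from the CPU is enough: the result
 * qwords are only read back once availability is seen again (or when the
 * caller explicitly opts into partial results).
 */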
static const uint32_t vk_pipeline_stat_to_reg[] = {
   GENX(IA_VERTICES_COUNT_num),
   GENX(IA_PRIMITIVES_COUNT_num),
   GENX(VS_INVOCATION_COUNT_num),
   GENX(GS_INVOCATION_COUNT_num),
   GENX(GS_PRIMITIVES_COUNT_num),
   GENX(CL_INVOCATION_COUNT_num),
   GENX(CL_PRIMITIVES_COUNT_num),
   GENX(PS_INVOCATION_COUNT_num),
   GENX(HS_INVOCATION_COUNT_num),
   GENX(DS_INVOCATION_COUNT_num),
   GENX(CS_INVOCATION_COUNT_num),
};

static void
emit_pipeline_stat(struct gen_mi_builder *b, uint32_t stat,
                   struct anv_address addr)
{
   STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK ==
                 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1);

   assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg));
   gen_mi_store(b, gen_mi_mem64(addr),
                gen_mi_reg64(vk_pipeline_stat_to_reg[stat]));
}

static void
emit_xfb_query(struct gen_mi_builder *b, uint32_t stream,
               struct anv_address addr)
{
   assert(stream < MAX_XFB_STREAMS);

   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 0)),
                gen_mi_reg64(GENX(SO_NUM_PRIMS_WRITTEN0_num) + stream * 8));
   gen_mi_store(b, gen_mi_mem64(anv_address_add(addr, 16)),
                gen_mi_reg64(GENX(SO_PRIM_STORAGE_NEEDED0_num) + stream * 8));
}

void genX(CmdBeginQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags)
{
   genX(CmdBeginQueryIndexedEXT)(commandBuffer, queryPool, query, flags, 0);
}

void genX(CmdBeginQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    VkQueryControlFlags                         flags,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8));
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 8;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }
      emit_xfb_query(&b, index, anv_address_add(query_addr, 8));
      break;

   default:
      unreachable("");
   }
}
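/* The begin snapshots above land on the "begin" qwords of the slot layout
 * sketched near the top of this file (occlusion and xfb at qword 1,
 * statistic n at qword 2*n + 1). CmdEndQueryIndexedEXT below writes the
 * matching "end" qwords 8 bytes further on and then flips availability.
 */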
void genX(CmdEndQuery)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   genX(CmdEndQueryIndexedEXT)(commandBuffer, queryPool, query, 0);
}

void genX(CmdEndQueryIndexedEXT)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    query,
    uint32_t                                    index)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pool->type) {
   case VK_QUERY_TYPE_OCCLUSION:
      emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16));
      emit_query_pc_availability(cmd_buffer, query_addr, true);
      break;

   case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
      /* TODO: This might only be necessary for certain stats */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      uint32_t statistics = pool->pipeline_statistics;
      uint32_t offset = 16;
      while (statistics) {
         uint32_t stat = u_bit_scan(&statistics);
         emit_pipeline_stat(&b, stat, anv_address_add(query_addr, offset));
         offset += 16;
      }

      emit_query_mi_availability(&b, query_addr, true);
      break;
   }

   case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.CommandStreamerStallEnable = true;
         pc.StallAtPixelScoreboard = true;
      }

      emit_xfb_query(&b, index, anv_address_add(query_addr, 16));
      emit_query_mi_availability(&b, query_addr, true);
      break;

   default:
      unreachable("");
   }

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as being already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}
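/* A worked multiview example (illustrative only): with view_mask = 0b0110
 * the subpass has two active views, so the application must use two
 * consecutive query indices. The real counters accumulate in index
 * `query`; emit_zero_queries marks index `query + 1` available with all
 * values zeroed.
 */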
#define TIMESTAMP 0x2358

void genX(CmdWriteTimestamp)(
    VkCommandBuffer                             commandBuffer,
    VkPipelineStageFlagBits                     pipelineStage,
    VkQueryPool                                 queryPool,
    uint32_t                                    query)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   struct anv_address query_addr = anv_query_address(pool, query);

   assert(pool->type == VK_QUERY_TYPE_TIMESTAMP);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);

   switch (pipelineStage) {
   case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT:
      gen_mi_store(&b, gen_mi_mem64(anv_address_add(query_addr, 8)),
                   gen_mi_reg64(TIMESTAMP));
      break;

   default:
      /* Everything else is bottom-of-pipe */
      anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) {
         pc.DestinationAddressType = DAT_PPGTT;
         pc.PostSyncOperation = WriteTimestamp;
         pc.Address = anv_address_add(query_addr, 8);

         if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4)
            pc.CommandStreamerStallEnable = true;
      }
      break;
   }

   emit_query_pc_availability(cmd_buffer, query_addr, true);

   /* When multiview is active the spec requires that N consecutive query
    * indices are used, where N is the number of active views in the subpass.
    * The spec allows us to write the results to only one of the queries, but
    * we still need to manage result availability for all the query indices.
    * Since we only emit a single query for all active views in the first
    * index, mark the other query indices as being already available with
    * result 0.
    */
   if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) {
      const uint32_t num_queries =
         util_bitcount(cmd_buffer->state.subpass->view_mask);
      if (num_queries > 1)
         emit_zero_queries(cmd_buffer, &b, pool, query + 1, num_queries - 1);
   }
}

#if GEN_GEN > 7 || GEN_IS_HASWELL

static void
gpu_write_query_result(struct gen_mi_builder *b,
                       struct anv_address dst_addr,
                       VkQueryResultFlags flags,
                       uint32_t value_index,
                       struct gen_mi_value query_result)
{
   if (flags & VK_QUERY_RESULT_64_BIT) {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 8);
      gen_mi_store(b, gen_mi_mem64(res_addr), query_result);
   } else {
      struct anv_address res_addr = anv_address_add(dst_addr, value_index * 4);
      gen_mi_store(b, gen_mi_mem32(res_addr), query_result);
   }
}

static struct gen_mi_value
compute_query_result(struct gen_mi_builder *b, struct anv_address addr)
{
   return gen_mi_isub(b, gen_mi_mem64(anv_address_add(addr, 8)),
                      gen_mi_mem64(anv_address_add(addr, 0)));
}
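/* compute_query_result is the GPU-side analogue of the CPU subtraction in
 * GetQueryPoolResults: given the address of a begin/end qword pair it
 * returns end - begin, evaluated with MI_MATH on the command streamer.
 * This is also why the GPU copy path below is compiled out for Ivy Bridge
 * (gen7 without Haswell): MI_MATH is not available there.
 */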
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer);
   ANV_FROM_HANDLE(anv_query_pool, pool, queryPool);
   ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer);

   struct gen_mi_builder b;
   gen_mi_builder_init(&b, &cmd_buffer->batch);
   struct gen_mi_value result;

   /* If render target writes are ongoing, request a render target cache flush
    * to ensure proper ordering of the commands from the 3d pipe and the
    * command streamer.
    */
   if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_BUFFER_WRITES) {
      cmd_buffer->state.pending_pipe_bits |=
         ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT;
   }

   if ((flags & VK_QUERY_RESULT_WAIT_BIT) ||
       (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS) ||
       /* Occlusion & timestamp queries are written using a PIPE_CONTROL and
        * because we're about to copy values from MI commands, we need to
        * stall the command streamer to make sure the PIPE_CONTROL values have
        * landed, otherwise we could see inconsistent values & availability.
        *
        * From the vulkan spec:
        *
        *    "vkCmdCopyQueryPoolResults is guaranteed to see the effect of
        *    previous uses of vkCmdResetQueryPool in the same queue, without
        *    any additional synchronization."
        */
       pool->type == VK_QUERY_TYPE_OCCLUSION ||
       pool->type == VK_QUERY_TYPE_TIMESTAMP) {
      cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT;
      genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer);
   }

   struct anv_address dest_addr = anv_address_add(buffer->address, destOffset);
   for (uint32_t i = 0; i < queryCount; i++) {
      struct anv_address query_addr = anv_query_address(pool, firstQuery + i);
      uint32_t idx = 0;
      switch (pool->type) {
      case VK_QUERY_TYPE_OCCLUSION:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_PIPELINE_STATISTICS: {
         uint32_t statistics = pool->pipeline_statistics;
         while (statistics) {
            uint32_t stat = u_bit_scan(&statistics);

            result = compute_query_result(&b, anv_address_add(query_addr,
                                                              idx * 16 + 8));

            /* WaDividePSInvocationCountBy4:HSW,BDW */
            if ((cmd_buffer->device->info.gen == 8 ||
                 cmd_buffer->device->info.is_haswell) &&
                (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) {
               result = gen_mi_ushr32_imm(&b, result, 2);
            }

            gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         }
         assert(idx == util_bitcount(pool->pipeline_statistics));
         break;
      }

      case VK_QUERY_TYPE_TRANSFORM_FEEDBACK_STREAM_EXT:
         result = compute_query_result(&b, anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         result = compute_query_result(&b, anv_address_add(query_addr, 24));
         gpu_write_query_result(&b, dest_addr, flags, idx++, result);
         break;

      case VK_QUERY_TYPE_TIMESTAMP:
         result = gen_mi_mem64(anv_address_add(query_addr, 8));
         gpu_write_query_result(&b, dest_addr, flags, 0, result);
         break;

      default:
         unreachable("unhandled query type");
      }

      if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) {
         gpu_write_query_result(&b, dest_addr, flags, idx,
                                gen_mi_mem64(query_addr));
      }

      dest_addr = anv_address_add(dest_addr, destStride);
   }
}

#else
void genX(CmdCopyQueryPoolResults)(
    VkCommandBuffer                             commandBuffer,
    VkQueryPool                                 queryPool,
    uint32_t                                    firstQuery,
    uint32_t                                    queryCount,
    VkBuffer                                    destBuffer,
    VkDeviceSize                                destOffset,
    VkDeviceSize                                destStride,
    VkQueryResultFlags                          flags)
{
   anv_finishme("Queries not yet supported on Ivy Bridge");
}
#endif