genX_query.c revision 01e04c3f
1/* 2 * Copyright © 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include <assert.h> 25#include <stdbool.h> 26#include <string.h> 27#include <unistd.h> 28#include <fcntl.h> 29 30#include "anv_private.h" 31 32#include "genxml/gen_macros.h" 33#include "genxml/genX_pack.h" 34 35VkResult genX(CreateQueryPool)( 36 VkDevice _device, 37 const VkQueryPoolCreateInfo* pCreateInfo, 38 const VkAllocationCallbacks* pAllocator, 39 VkQueryPool* pQueryPool) 40{ 41 ANV_FROM_HANDLE(anv_device, device, _device); 42 const struct anv_physical_device *pdevice = &device->instance->physicalDevice; 43 struct anv_query_pool *pool; 44 VkResult result; 45 46 assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO); 47 48 /* Query pool slots are made up of some number of 64-bit values packed 49 * tightly together. The first 64-bit value is always the "available" bit 50 * which is 0 when the query is unavailable and 1 when it is available. 51 * The 64-bit values that follow are determined by the type of query. 52 */ 53 uint32_t uint64s_per_slot = 1; 54 55 VkQueryPipelineStatisticFlags pipeline_statistics = 0; 56 switch (pCreateInfo->queryType) { 57 case VK_QUERY_TYPE_OCCLUSION: 58 /* Occlusion queries have two values: begin and end. */ 59 uint64s_per_slot += 2; 60 break; 61 case VK_QUERY_TYPE_TIMESTAMP: 62 /* Timestamps just have the one timestamp value */ 63 uint64s_per_slot += 1; 64 break; 65 case VK_QUERY_TYPE_PIPELINE_STATISTICS: 66 pipeline_statistics = pCreateInfo->pipelineStatistics; 67 /* We're going to trust this field implicitly so we need to ensure that 68 * no unhandled extension bits leak in. 69 */ 70 pipeline_statistics &= ANV_PIPELINE_STATISTICS_MASK; 71 72 /* Statistics queries have a min and max for every statistic */ 73 uint64s_per_slot += 2 * util_bitcount(pipeline_statistics); 74 break; 75 default: 76 assert(!"Invalid query type"); 77 } 78 79 pool = vk_alloc2(&device->alloc, pAllocator, sizeof(*pool), 8, 80 VK_SYSTEM_ALLOCATION_SCOPE_OBJECT); 81 if (pool == NULL) 82 return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY); 83 84 pool->type = pCreateInfo->queryType; 85 pool->pipeline_statistics = pipeline_statistics; 86 pool->stride = uint64s_per_slot * sizeof(uint64_t); 87 pool->slots = pCreateInfo->queryCount; 88 89 uint64_t size = pool->slots * pool->stride; 90 result = anv_bo_init_new(&pool->bo, device, size); 91 if (result != VK_SUCCESS) 92 goto fail; 93 94 if (pdevice->supports_48bit_addresses) 95 pool->bo.flags |= EXEC_OBJECT_SUPPORTS_48B_ADDRESS; 96 97 if (pdevice->use_softpin) 98 pool->bo.flags |= EXEC_OBJECT_PINNED; 99 100 if (pdevice->has_exec_async) 101 pool->bo.flags |= EXEC_OBJECT_ASYNC; 102 103 anv_vma_alloc(device, &pool->bo); 104 105 /* For query pools, we set the caching mode to I915_CACHING_CACHED. On LLC 106 * platforms, this does nothing. On non-LLC platforms, this means snooping 107 * which comes at a slight cost. However, the buffers aren't big, won't be 108 * written frequently, and trying to handle the flushing manually without 109 * doing too much flushing is extremely painful. 110 */ 111 anv_gem_set_caching(device, pool->bo.gem_handle, I915_CACHING_CACHED); 112 113 pool->bo.map = anv_gem_mmap(device, pool->bo.gem_handle, 0, size, 0); 114 115 *pQueryPool = anv_query_pool_to_handle(pool); 116 117 return VK_SUCCESS; 118 119 fail: 120 vk_free2(&device->alloc, pAllocator, pool); 121 122 return result; 123} 124 125void genX(DestroyQueryPool)( 126 VkDevice _device, 127 VkQueryPool _pool, 128 const VkAllocationCallbacks* pAllocator) 129{ 130 ANV_FROM_HANDLE(anv_device, device, _device); 131 ANV_FROM_HANDLE(anv_query_pool, pool, _pool); 132 133 if (!pool) 134 return; 135 136 anv_gem_munmap(pool->bo.map, pool->bo.size); 137 anv_vma_free(device, &pool->bo); 138 anv_gem_close(device, pool->bo.gem_handle); 139 vk_free2(&device->alloc, pAllocator, pool); 140} 141 142static struct anv_address 143anv_query_address(struct anv_query_pool *pool, uint32_t query) 144{ 145 return (struct anv_address) { 146 .bo = &pool->bo, 147 .offset = query * pool->stride, 148 }; 149} 150 151static void 152cpu_write_query_result(void *dst_slot, VkQueryResultFlags flags, 153 uint32_t value_index, uint64_t result) 154{ 155 if (flags & VK_QUERY_RESULT_64_BIT) { 156 uint64_t *dst64 = dst_slot; 157 dst64[value_index] = result; 158 } else { 159 uint32_t *dst32 = dst_slot; 160 dst32[value_index] = result; 161 } 162} 163 164static bool 165query_is_available(uint64_t *slot) 166{ 167 return *(volatile uint64_t *)slot; 168} 169 170static VkResult 171wait_for_available(struct anv_device *device, 172 struct anv_query_pool *pool, uint64_t *slot) 173{ 174 while (true) { 175 if (query_is_available(slot)) 176 return VK_SUCCESS; 177 178 int ret = anv_gem_busy(device, pool->bo.gem_handle); 179 if (ret == 1) { 180 /* The BO is still busy, keep waiting. */ 181 continue; 182 } else if (ret == -1) { 183 /* We don't know the real error. */ 184 return anv_device_set_lost(device, "gem wait failed: %m"); 185 } else { 186 assert(ret == 0); 187 /* The BO is no longer busy. */ 188 if (query_is_available(slot)) { 189 return VK_SUCCESS; 190 } else { 191 VkResult status = anv_device_query_status(device); 192 if (status != VK_SUCCESS) 193 return status; 194 195 /* If we haven't seen availability yet, then we never will. This 196 * can only happen if we have a client error where they call 197 * GetQueryPoolResults on a query that they haven't submitted to 198 * the GPU yet. The spec allows us to do anything in this case, 199 * but returning VK_SUCCESS doesn't seem right and we shouldn't 200 * just keep spinning. 201 */ 202 return VK_NOT_READY; 203 } 204 } 205 } 206} 207 208VkResult genX(GetQueryPoolResults)( 209 VkDevice _device, 210 VkQueryPool queryPool, 211 uint32_t firstQuery, 212 uint32_t queryCount, 213 size_t dataSize, 214 void* pData, 215 VkDeviceSize stride, 216 VkQueryResultFlags flags) 217{ 218 ANV_FROM_HANDLE(anv_device, device, _device); 219 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 220 221 assert(pool->type == VK_QUERY_TYPE_OCCLUSION || 222 pool->type == VK_QUERY_TYPE_PIPELINE_STATISTICS || 223 pool->type == VK_QUERY_TYPE_TIMESTAMP); 224 225 if (anv_device_is_lost(device)) 226 return VK_ERROR_DEVICE_LOST; 227 228 if (pData == NULL) 229 return VK_SUCCESS; 230 231 void *data_end = pData + dataSize; 232 233 VkResult status = VK_SUCCESS; 234 for (uint32_t i = 0; i < queryCount; i++) { 235 uint64_t *slot = pool->bo.map + (firstQuery + i) * pool->stride; 236 237 /* Availability is always at the start of the slot */ 238 bool available = slot[0]; 239 240 if (!available && (flags & VK_QUERY_RESULT_WAIT_BIT)) { 241 status = wait_for_available(device, pool, slot); 242 if (status != VK_SUCCESS) 243 return status; 244 245 available = true; 246 } 247 248 /* From the Vulkan 1.0.42 spec: 249 * 250 * "If VK_QUERY_RESULT_WAIT_BIT and VK_QUERY_RESULT_PARTIAL_BIT are 251 * both not set then no result values are written to pData for 252 * queries that are in the unavailable state at the time of the call, 253 * and vkGetQueryPoolResults returns VK_NOT_READY. However, 254 * availability state is still written to pData for those queries if 255 * VK_QUERY_RESULT_WITH_AVAILABILITY_BIT is set." 256 */ 257 bool write_results = available || (flags & VK_QUERY_RESULT_PARTIAL_BIT); 258 259 uint32_t idx = 0; 260 switch (pool->type) { 261 case VK_QUERY_TYPE_OCCLUSION: 262 if (write_results) 263 cpu_write_query_result(pData, flags, idx, slot[2] - slot[1]); 264 idx++; 265 break; 266 267 case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 268 uint32_t statistics = pool->pipeline_statistics; 269 while (statistics) { 270 uint32_t stat = u_bit_scan(&statistics); 271 if (write_results) { 272 uint64_t result = slot[idx * 2 + 2] - slot[idx * 2 + 1]; 273 274 /* WaDividePSInvocationCountBy4:HSW,BDW */ 275 if ((device->info.gen == 8 || device->info.is_haswell) && 276 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) 277 result >>= 2; 278 279 cpu_write_query_result(pData, flags, idx, result); 280 } 281 idx++; 282 } 283 assert(idx == util_bitcount(pool->pipeline_statistics)); 284 break; 285 } 286 287 case VK_QUERY_TYPE_TIMESTAMP: 288 if (write_results) 289 cpu_write_query_result(pData, flags, idx, slot[1]); 290 idx++; 291 break; 292 293 default: 294 unreachable("invalid pool type"); 295 } 296 297 if (!write_results) 298 status = VK_NOT_READY; 299 300 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) 301 cpu_write_query_result(pData, flags, idx, available); 302 303 pData += stride; 304 if (pData >= data_end) 305 break; 306 } 307 308 return status; 309} 310 311static void 312emit_srm32(struct anv_batch *batch, struct anv_address addr, uint32_t reg) 313{ 314 anv_batch_emit(batch, GENX(MI_STORE_REGISTER_MEM), srm) { 315 srm.MemoryAddress = addr; 316 srm.RegisterAddress = reg; 317 } 318} 319 320static void 321emit_srm64(struct anv_batch *batch, struct anv_address addr, uint32_t reg) 322{ 323 emit_srm32(batch, anv_address_add(addr, 0), reg + 0); 324 emit_srm32(batch, anv_address_add(addr, 4), reg + 4); 325} 326 327static void 328emit_ps_depth_count(struct anv_cmd_buffer *cmd_buffer, 329 struct anv_address addr) 330{ 331 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 332 pc.DestinationAddressType = DAT_PPGTT; 333 pc.PostSyncOperation = WritePSDepthCount; 334 pc.DepthStallEnable = true; 335 pc.Address = addr; 336 337 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4) 338 pc.CommandStreamerStallEnable = true; 339 } 340} 341 342static void 343emit_query_availability(struct anv_cmd_buffer *cmd_buffer, 344 struct anv_address addr) 345{ 346 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 347 pc.DestinationAddressType = DAT_PPGTT; 348 pc.PostSyncOperation = WriteImmediateData; 349 pc.Address = addr; 350 pc.ImmediateData = 1; 351 } 352} 353 354/** 355 * Goes through a series of consecutive query indices in the given pool 356 * setting all element values to 0 and emitting them as available. 357 */ 358static void 359emit_zero_queries(struct anv_cmd_buffer *cmd_buffer, 360 struct anv_query_pool *pool, 361 uint32_t first_index, uint32_t num_queries) 362{ 363 for (uint32_t i = 0; i < num_queries; i++) { 364 struct anv_address slot_addr = 365 anv_query_address(pool, first_index + i); 366 genX(cmd_buffer_mi_memset)(cmd_buffer, anv_address_add(slot_addr, 8), 367 0, pool->stride - 8); 368 emit_query_availability(cmd_buffer, slot_addr); 369 } 370} 371 372void genX(CmdResetQueryPool)( 373 VkCommandBuffer commandBuffer, 374 VkQueryPool queryPool, 375 uint32_t firstQuery, 376 uint32_t queryCount) 377{ 378 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 379 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 380 381 for (uint32_t i = 0; i < queryCount; i++) { 382 anv_batch_emit(&cmd_buffer->batch, GENX(MI_STORE_DATA_IMM), sdm) { 383 sdm.Address = anv_query_address(pool, firstQuery + i); 384 sdm.ImmediateData = 0; 385 } 386 } 387} 388 389static const uint32_t vk_pipeline_stat_to_reg[] = { 390 GENX(IA_VERTICES_COUNT_num), 391 GENX(IA_PRIMITIVES_COUNT_num), 392 GENX(VS_INVOCATION_COUNT_num), 393 GENX(GS_INVOCATION_COUNT_num), 394 GENX(GS_PRIMITIVES_COUNT_num), 395 GENX(CL_INVOCATION_COUNT_num), 396 GENX(CL_PRIMITIVES_COUNT_num), 397 GENX(PS_INVOCATION_COUNT_num), 398 GENX(HS_INVOCATION_COUNT_num), 399 GENX(DS_INVOCATION_COUNT_num), 400 GENX(CS_INVOCATION_COUNT_num), 401}; 402 403static void 404emit_pipeline_stat(struct anv_cmd_buffer *cmd_buffer, uint32_t stat, 405 struct anv_address addr) 406{ 407 STATIC_ASSERT(ANV_PIPELINE_STATISTICS_MASK == 408 (1 << ARRAY_SIZE(vk_pipeline_stat_to_reg)) - 1); 409 410 assert(stat < ARRAY_SIZE(vk_pipeline_stat_to_reg)); 411 emit_srm64(&cmd_buffer->batch, addr, vk_pipeline_stat_to_reg[stat]); 412} 413 414void genX(CmdBeginQuery)( 415 VkCommandBuffer commandBuffer, 416 VkQueryPool queryPool, 417 uint32_t query, 418 VkQueryControlFlags flags) 419{ 420 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 421 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 422 struct anv_address query_addr = anv_query_address(pool, query); 423 424 switch (pool->type) { 425 case VK_QUERY_TYPE_OCCLUSION: 426 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 8)); 427 break; 428 429 case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 430 /* TODO: This might only be necessary for certain stats */ 431 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 432 pc.CommandStreamerStallEnable = true; 433 pc.StallAtPixelScoreboard = true; 434 } 435 436 uint32_t statistics = pool->pipeline_statistics; 437 uint32_t offset = 8; 438 while (statistics) { 439 uint32_t stat = u_bit_scan(&statistics); 440 emit_pipeline_stat(cmd_buffer, stat, 441 anv_address_add(query_addr, offset)); 442 offset += 16; 443 } 444 break; 445 } 446 447 default: 448 unreachable(""); 449 } 450} 451 452void genX(CmdEndQuery)( 453 VkCommandBuffer commandBuffer, 454 VkQueryPool queryPool, 455 uint32_t query) 456{ 457 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 458 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 459 struct anv_address query_addr = anv_query_address(pool, query); 460 461 switch (pool->type) { 462 case VK_QUERY_TYPE_OCCLUSION: 463 emit_ps_depth_count(cmd_buffer, anv_address_add(query_addr, 16)); 464 emit_query_availability(cmd_buffer, query_addr); 465 break; 466 467 case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 468 /* TODO: This might only be necessary for certain stats */ 469 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 470 pc.CommandStreamerStallEnable = true; 471 pc.StallAtPixelScoreboard = true; 472 } 473 474 uint32_t statistics = pool->pipeline_statistics; 475 uint32_t offset = 16; 476 while (statistics) { 477 uint32_t stat = u_bit_scan(&statistics); 478 emit_pipeline_stat(cmd_buffer, stat, 479 anv_address_add(query_addr, offset)); 480 offset += 16; 481 } 482 483 emit_query_availability(cmd_buffer, query_addr); 484 break; 485 } 486 487 default: 488 unreachable(""); 489 } 490 491 /* When multiview is active the spec requires that N consecutive query 492 * indices are used, where N is the number of active views in the subpass. 493 * The spec allows that we only write the results to one of the queries 494 * but we still need to manage result availability for all the query indices. 495 * Since we only emit a single query for all active views in the 496 * first index, mark the other query indices as being already available 497 * with result 0. 498 */ 499 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) { 500 const uint32_t num_queries = 501 util_bitcount(cmd_buffer->state.subpass->view_mask); 502 if (num_queries > 1) 503 emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1); 504 } 505} 506 507#define TIMESTAMP 0x2358 508 509void genX(CmdWriteTimestamp)( 510 VkCommandBuffer commandBuffer, 511 VkPipelineStageFlagBits pipelineStage, 512 VkQueryPool queryPool, 513 uint32_t query) 514{ 515 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 516 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 517 struct anv_address query_addr = anv_query_address(pool, query); 518 519 assert(pool->type == VK_QUERY_TYPE_TIMESTAMP); 520 521 switch (pipelineStage) { 522 case VK_PIPELINE_STAGE_TOP_OF_PIPE_BIT: 523 emit_srm64(&cmd_buffer->batch, anv_address_add(query_addr, 8), TIMESTAMP); 524 break; 525 526 default: 527 /* Everything else is bottom-of-pipe */ 528 anv_batch_emit(&cmd_buffer->batch, GENX(PIPE_CONTROL), pc) { 529 pc.DestinationAddressType = DAT_PPGTT; 530 pc.PostSyncOperation = WriteTimestamp; 531 pc.Address = anv_address_add(query_addr, 8); 532 533 if (GEN_GEN == 9 && cmd_buffer->device->info.gt == 4) 534 pc.CommandStreamerStallEnable = true; 535 } 536 break; 537 } 538 539 emit_query_availability(cmd_buffer, query_addr); 540 541 /* When multiview is active the spec requires that N consecutive query 542 * indices are used, where N is the number of active views in the subpass. 543 * The spec allows that we only write the results to one of the queries 544 * but we still need to manage result availability for all the query indices. 545 * Since we only emit a single query for all active views in the 546 * first index, mark the other query indices as being already available 547 * with result 0. 548 */ 549 if (cmd_buffer->state.subpass && cmd_buffer->state.subpass->view_mask) { 550 const uint32_t num_queries = 551 util_bitcount(cmd_buffer->state.subpass->view_mask); 552 if (num_queries > 1) 553 emit_zero_queries(cmd_buffer, pool, query + 1, num_queries - 1); 554 } 555} 556 557#if GEN_GEN > 7 || GEN_IS_HASWELL 558 559static uint32_t 560mi_alu(uint32_t opcode, uint32_t operand1, uint32_t operand2) 561{ 562 struct GENX(MI_MATH_ALU_INSTRUCTION) instr = { 563 .ALUOpcode = opcode, 564 .Operand1 = operand1, 565 .Operand2 = operand2, 566 }; 567 568 uint32_t dw; 569 GENX(MI_MATH_ALU_INSTRUCTION_pack)(NULL, &dw, &instr); 570 571 return dw; 572} 573 574#define CS_GPR(n) (0x2600 + (n) * 8) 575 576static void 577emit_load_alu_reg_u64(struct anv_batch *batch, uint32_t reg, 578 struct anv_address addr) 579{ 580 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { 581 lrm.RegisterAddress = reg; 582 lrm.MemoryAddress = anv_address_add(addr, 0); 583 } 584 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) { 585 lrm.RegisterAddress = reg + 4; 586 lrm.MemoryAddress = anv_address_add(addr, 4); 587 } 588} 589 590static void 591emit_load_alu_reg_imm32(struct anv_batch *batch, uint32_t reg, uint32_t imm) 592{ 593 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_IMM), lri) { 594 lri.RegisterOffset = reg; 595 lri.DataDWord = imm; 596 } 597} 598 599static void 600emit_load_alu_reg_imm64(struct anv_batch *batch, uint32_t reg, uint64_t imm) 601{ 602 emit_load_alu_reg_imm32(batch, reg, (uint32_t)imm); 603 emit_load_alu_reg_imm32(batch, reg + 4, (uint32_t)(imm >> 32)); 604} 605 606static void 607emit_load_alu_reg_reg32(struct anv_batch *batch, uint32_t src, uint32_t dst) 608{ 609 anv_batch_emit(batch, GENX(MI_LOAD_REGISTER_REG), lrr) { 610 lrr.SourceRegisterAddress = src; 611 lrr.DestinationRegisterAddress = dst; 612 } 613} 614 615/* 616 * GPR0 = GPR0 & ((1ull << n) - 1); 617 */ 618static void 619keep_gpr0_lower_n_bits(struct anv_batch *batch, uint32_t n) 620{ 621 assert(n < 64); 622 emit_load_alu_reg_imm64(batch, CS_GPR(1), (1ull << n) - 1); 623 624 uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH)); 625 if (!dw) { 626 anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY); 627 return; 628 } 629 630 dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0); 631 dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG1); 632 dw[3] = mi_alu(MI_ALU_AND, 0, 0); 633 dw[4] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU); 634} 635 636/* 637 * GPR0 = GPR0 << 30; 638 */ 639static void 640shl_gpr0_by_30_bits(struct anv_batch *batch) 641{ 642 /* First we mask 34 bits of GPR0 to prevent overflow */ 643 keep_gpr0_lower_n_bits(batch, 34); 644 645 const uint32_t outer_count = 5; 646 const uint32_t inner_count = 6; 647 STATIC_ASSERT(outer_count * inner_count == 30); 648 const uint32_t cmd_len = 1 + inner_count * 4; 649 650 /* We'll emit 5 commands, each shifting GPR0 left by 6 bits, for a total of 651 * 30 left shifts. 652 */ 653 for (int o = 0; o < outer_count; o++) { 654 /* Submit one MI_MATH to shift left by 6 bits */ 655 uint32_t *dw = anv_batch_emitn(batch, cmd_len, GENX(MI_MATH)); 656 if (!dw) { 657 anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY); 658 return; 659 } 660 661 dw++; 662 for (int i = 0; i < inner_count; i++, dw += 4) { 663 dw[0] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG0); 664 dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0); 665 dw[2] = mi_alu(MI_ALU_ADD, 0, 0); 666 dw[3] = mi_alu(MI_ALU_STORE, MI_ALU_REG0, MI_ALU_ACCU); 667 } 668 } 669} 670 671/* 672 * GPR0 = GPR0 >> 2; 673 * 674 * Note that the upper 30 bits of GPR are lost! 675 */ 676static void 677shr_gpr0_by_2_bits(struct anv_batch *batch) 678{ 679 shl_gpr0_by_30_bits(batch); 680 emit_load_alu_reg_reg32(batch, CS_GPR(0) + 4, CS_GPR(0)); 681 emit_load_alu_reg_imm32(batch, CS_GPR(0) + 4, 0); 682} 683 684static void 685gpu_write_query_result(struct anv_batch *batch, 686 struct anv_address dst_addr, 687 VkQueryResultFlags flags, 688 uint32_t value_index, uint32_t reg) 689{ 690 if (flags & VK_QUERY_RESULT_64_BIT) { 691 emit_srm64(batch, anv_address_add(dst_addr, value_index * 8), reg); 692 } else { 693 emit_srm32(batch, anv_address_add(dst_addr, value_index * 4), reg); 694 } 695} 696 697static void 698compute_query_result(struct anv_batch *batch, uint32_t dst_reg, 699 struct anv_address addr) 700{ 701 emit_load_alu_reg_u64(batch, CS_GPR(0), anv_address_add(addr, 0)); 702 emit_load_alu_reg_u64(batch, CS_GPR(1), anv_address_add(addr, 8)); 703 704 /* FIXME: We need to clamp the result for 32 bit. */ 705 706 uint32_t *dw = anv_batch_emitn(batch, 5, GENX(MI_MATH)); 707 if (!dw) { 708 anv_batch_set_error(batch, VK_ERROR_OUT_OF_HOST_MEMORY); 709 return; 710 } 711 712 dw[1] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCA, MI_ALU_REG1); 713 dw[2] = mi_alu(MI_ALU_LOAD, MI_ALU_SRCB, MI_ALU_REG0); 714 dw[3] = mi_alu(MI_ALU_SUB, 0, 0); 715 dw[4] = mi_alu(MI_ALU_STORE, dst_reg, MI_ALU_ACCU); 716} 717 718void genX(CmdCopyQueryPoolResults)( 719 VkCommandBuffer commandBuffer, 720 VkQueryPool queryPool, 721 uint32_t firstQuery, 722 uint32_t queryCount, 723 VkBuffer destBuffer, 724 VkDeviceSize destOffset, 725 VkDeviceSize destStride, 726 VkQueryResultFlags flags) 727{ 728 ANV_FROM_HANDLE(anv_cmd_buffer, cmd_buffer, commandBuffer); 729 ANV_FROM_HANDLE(anv_query_pool, pool, queryPool); 730 ANV_FROM_HANDLE(anv_buffer, buffer, destBuffer); 731 732 /* If render target writes are ongoing, request a render target cache flush 733 * to ensure proper ordering of the commands from the 3d pipe and the 734 * command streamer. 735 */ 736 if (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_RENDER_TARGET_WRITES) { 737 cmd_buffer->state.pending_pipe_bits |= 738 ANV_PIPE_RENDER_TARGET_CACHE_FLUSH_BIT; 739 } 740 741 if ((flags & VK_QUERY_RESULT_WAIT_BIT) || 742 (cmd_buffer->state.pending_pipe_bits & ANV_PIPE_FLUSH_BITS)) { 743 cmd_buffer->state.pending_pipe_bits |= ANV_PIPE_CS_STALL_BIT; 744 genX(cmd_buffer_apply_pipe_flushes)(cmd_buffer); 745 } 746 747 struct anv_address dest_addr = anv_address_add(buffer->address, destOffset); 748 for (uint32_t i = 0; i < queryCount; i++) { 749 struct anv_address query_addr = anv_query_address(pool, firstQuery + i); 750 uint32_t idx = 0; 751 switch (pool->type) { 752 case VK_QUERY_TYPE_OCCLUSION: 753 compute_query_result(&cmd_buffer->batch, MI_ALU_REG2, 754 anv_address_add(query_addr, 8)); 755 gpu_write_query_result(&cmd_buffer->batch, dest_addr, 756 flags, idx++, CS_GPR(2)); 757 break; 758 759 case VK_QUERY_TYPE_PIPELINE_STATISTICS: { 760 uint32_t statistics = pool->pipeline_statistics; 761 while (statistics) { 762 uint32_t stat = u_bit_scan(&statistics); 763 764 compute_query_result(&cmd_buffer->batch, MI_ALU_REG0, 765 anv_address_add(query_addr, idx * 16 + 8)); 766 767 /* WaDividePSInvocationCountBy4:HSW,BDW */ 768 if ((cmd_buffer->device->info.gen == 8 || 769 cmd_buffer->device->info.is_haswell) && 770 (1 << stat) == VK_QUERY_PIPELINE_STATISTIC_FRAGMENT_SHADER_INVOCATIONS_BIT) { 771 shr_gpr0_by_2_bits(&cmd_buffer->batch); 772 } 773 774 gpu_write_query_result(&cmd_buffer->batch, dest_addr, 775 flags, idx++, CS_GPR(0)); 776 } 777 assert(idx == util_bitcount(pool->pipeline_statistics)); 778 break; 779 } 780 781 case VK_QUERY_TYPE_TIMESTAMP: 782 emit_load_alu_reg_u64(&cmd_buffer->batch, 783 CS_GPR(2), anv_address_add(query_addr, 8)); 784 gpu_write_query_result(&cmd_buffer->batch, dest_addr, 785 flags, 0, CS_GPR(2)); 786 break; 787 788 default: 789 unreachable("unhandled query type"); 790 } 791 792 if (flags & VK_QUERY_RESULT_WITH_AVAILABILITY_BIT) { 793 emit_load_alu_reg_u64(&cmd_buffer->batch, CS_GPR(0), query_addr); 794 gpu_write_query_result(&cmd_buffer->batch, dest_addr, 795 flags, idx, CS_GPR(0)); 796 } 797 798 dest_addr = anv_address_add(dest_addr, destStride); 799 } 800} 801 802#else 803void genX(CmdCopyQueryPoolResults)( 804 VkCommandBuffer commandBuffer, 805 VkQueryPool queryPool, 806 uint32_t firstQuery, 807 uint32_t queryCount, 808 VkBuffer destBuffer, 809 VkDeviceSize destOffset, 810 VkDeviceSize destStride, 811 VkQueryResultFlags flags) 812{ 813 anv_finishme("Queries not yet supported on Ivy Bridge"); 814} 815#endif 816