1/* 2 * Copyright © 2008 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * 26 */ 27 28/** @file brw_queryobj.c 29 * 30 * Support for query objects (GL_ARB_occlusion_query, GL_ARB_timer_query, 31 * GL_EXT_transform_feedback, and friends). 32 * 33 * The hardware provides a PIPE_CONTROL command that can report the number of 34 * fragments that passed the depth test, or the hardware timer. They are 35 * appropriately synced with the stage of the pipeline for our extensions' 36 * needs. 37 */ 38#include "main/imports.h" 39 40#include "brw_context.h" 41#include "brw_defines.h" 42#include "brw_state.h" 43#include "intel_batchbuffer.h" 44 45/* As best we know currently, the Gen HW timestamps are 36bits across 46 * all platforms, which we need to account for when calculating a 47 * delta to measure elapsed time. 48 * 49 * The timestamps read via glGetTimestamp() / brw_get_timestamp() sometimes 50 * only have 32bits due to a kernel bug and so in that case we make sure to 51 * treat all raw timestamps as 32bits so they overflow consistently and remain 52 * comparable. (Note: the timestamps being passed here are not from the kernel 53 * so we don't need to be taking the upper 32bits in this buggy kernel case we 54 * are just clipping to 32bits here for consistency.) 55 */ 56uint64_t 57brw_raw_timestamp_delta(struct brw_context *brw, uint64_t time0, uint64_t time1) 58{ 59 if (brw->screen->hw_has_timestamp == 2) { 60 /* Kernel clips timestamps to 32bits in this case, so we also clip 61 * PIPE_CONTROL timestamps for consistency. 62 */ 63 return (uint32_t)time1 - (uint32_t)time0; 64 } else { 65 if (time0 > time1) { 66 return (1ULL << 36) + time1 - time0; 67 } else { 68 return time1 - time0; 69 } 70 } 71} 72 73/** 74 * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. 75 */ 76void 77brw_write_timestamp(struct brw_context *brw, struct brw_bo *query_bo, int idx) 78{ 79 const struct gen_device_info *devinfo = &brw->screen->devinfo; 80 81 if (devinfo->gen == 6) { 82 /* Emit Sandybridge workaround flush: */ 83 brw_emit_pipe_control_flush(brw, 84 PIPE_CONTROL_CS_STALL | 85 PIPE_CONTROL_STALL_AT_SCOREBOARD); 86 } 87 88 uint32_t flags = PIPE_CONTROL_WRITE_TIMESTAMP; 89 90 if (devinfo->gen == 9 && devinfo->gt == 4) 91 flags |= PIPE_CONTROL_CS_STALL; 92 93 brw_emit_pipe_control_write(brw, flags, 94 query_bo, idx * sizeof(uint64_t), 0); 95} 96 97/** 98 * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer. 99 */ 100void 101brw_write_depth_count(struct brw_context *brw, struct brw_bo *query_bo, int idx) 102{ 103 const struct gen_device_info *devinfo = &brw->screen->devinfo; 104 uint32_t flags = PIPE_CONTROL_WRITE_DEPTH_COUNT | PIPE_CONTROL_DEPTH_STALL; 105 106 if (devinfo->gen == 9 && devinfo->gt == 4) 107 flags |= PIPE_CONTROL_CS_STALL; 108 109 if (devinfo->gen >= 10) { 110 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable bit set 111 * prior to programming a PIPE_CONTROL with Write PS Depth Count Post sync 112 * operation." 113 */ 114 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL); 115 } 116 117 brw_emit_pipe_control_write(brw, flags, 118 query_bo, idx * sizeof(uint64_t), 0); 119} 120 121/** 122 * Wait on the query object's BO and calculate the final result. 123 */ 124static void 125brw_queryobj_get_results(struct gl_context *ctx, 126 struct brw_query_object *query) 127{ 128 struct brw_context *brw = brw_context(ctx); 129 UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo; 130 131 int i; 132 uint64_t *results; 133 134 assert(devinfo->gen < 6); 135 136 if (query->bo == NULL) 137 return; 138 139 /* If the application has requested the query result, but this batch is 140 * still contributing to it, flush it now so the results will be present 141 * when mapped. 142 */ 143 if (brw_batch_references(&brw->batch, query->bo)) 144 intel_batchbuffer_flush(brw); 145 146 if (unlikely(brw->perf_debug)) { 147 if (brw_bo_busy(query->bo)) { 148 perf_debug("Stalling on the GPU waiting for a query object.\n"); 149 } 150 } 151 152 results = brw_bo_map(brw, query->bo, MAP_READ); 153 switch (query->Base.Target) { 154 case GL_TIME_ELAPSED_EXT: 155 /* The query BO contains the starting and ending timestamps. 156 * Subtract the two and convert to nanoseconds. 157 */ 158 query->Base.Result = brw_raw_timestamp_delta(brw, results[0], results[1]); 159 query->Base.Result = gen_device_info_timebase_scale(devinfo, query->Base.Result); 160 break; 161 162 case GL_TIMESTAMP: 163 /* The query BO contains a single timestamp value in results[0]. */ 164 query->Base.Result = gen_device_info_timebase_scale(devinfo, results[0]); 165 166 /* Ensure the scaled timestamp overflows according to 167 * GL_QUERY_COUNTER_BITS 168 */ 169 query->Base.Result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1; 170 break; 171 172 case GL_SAMPLES_PASSED_ARB: 173 /* Loop over pairs of values from the BO, which are the PS_DEPTH_COUNT 174 * value at the start and end of the batchbuffer. Subtract them to 175 * get the number of fragments which passed the depth test in each 176 * individual batch, and add those differences up to get the number 177 * of fragments for the entire query. 178 * 179 * Note that query->Base.Result may already be non-zero. We may have 180 * run out of space in the query's BO and allocated a new one. If so, 181 * this function was already called to accumulate the results so far. 182 */ 183 for (i = 0; i < query->last_index; i++) { 184 query->Base.Result += results[i * 2 + 1] - results[i * 2]; 185 } 186 break; 187 188 case GL_ANY_SAMPLES_PASSED: 189 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 190 /* If the starting and ending PS_DEPTH_COUNT from any of the batches 191 * differ, then some fragments passed the depth test. 192 */ 193 for (i = 0; i < query->last_index; i++) { 194 if (results[i * 2 + 1] != results[i * 2]) { 195 query->Base.Result = GL_TRUE; 196 break; 197 } 198 } 199 break; 200 201 default: 202 unreachable("Unrecognized query target in brw_queryobj_get_results()"); 203 } 204 brw_bo_unmap(query->bo); 205 206 /* Now that we've processed the data stored in the query's buffer object, 207 * we can release it. 208 */ 209 brw_bo_unreference(query->bo); 210 query->bo = NULL; 211} 212 213/** 214 * The NewQueryObject() driver hook. 215 * 216 * Allocates and initializes a new query object. 217 */ 218static struct gl_query_object * 219brw_new_query_object(struct gl_context *ctx, GLuint id) 220{ 221 struct brw_query_object *query; 222 223 query = calloc(1, sizeof(struct brw_query_object)); 224 225 query->Base.Id = id; 226 query->Base.Result = 0; 227 query->Base.Active = false; 228 query->Base.Ready = true; 229 230 return &query->Base; 231} 232 233/** 234 * The DeleteQuery() driver hook. 235 */ 236static void 237brw_delete_query(struct gl_context *ctx, struct gl_query_object *q) 238{ 239 struct brw_query_object *query = (struct brw_query_object *)q; 240 241 brw_bo_unreference(query->bo); 242 free(query); 243} 244 245/** 246 * Gen4-5 driver hook for glBeginQuery(). 247 * 248 * Initializes driver structures and emits any GPU commands required to begin 249 * recording data for the query. 250 */ 251static void 252brw_begin_query(struct gl_context *ctx, struct gl_query_object *q) 253{ 254 struct brw_context *brw = brw_context(ctx); 255 struct brw_query_object *query = (struct brw_query_object *)q; 256 UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo; 257 258 assert(devinfo->gen < 6); 259 260 switch (query->Base.Target) { 261 case GL_TIME_ELAPSED_EXT: 262 /* For timestamp queries, we record the starting time right away so that 263 * we measure the full time between BeginQuery and EndQuery. There's 264 * some debate about whether this is the right thing to do. Our decision 265 * is based on the following text from the ARB_timer_query extension: 266 * 267 * "(5) Should the extension measure total time elapsed between the full 268 * completion of the BeginQuery and EndQuery commands, or just time 269 * spent in the graphics library? 270 * 271 * RESOLVED: This extension will measure the total time elapsed 272 * between the full completion of these commands. Future extensions 273 * may implement a query to determine time elapsed at different stages 274 * of the graphics pipeline." 275 * 276 * We write a starting timestamp now (at index 0). At EndQuery() time, 277 * we'll write a second timestamp (at index 1), and subtract the two to 278 * obtain the time elapsed. Notably, this includes time elapsed while 279 * the system was doing other work, such as running other applications. 280 */ 281 brw_bo_unreference(query->bo); 282 query->bo = 283 brw_bo_alloc(brw->bufmgr, "timer query", 4096, BRW_MEMZONE_OTHER); 284 brw_write_timestamp(brw, query->bo, 0); 285 break; 286 287 case GL_ANY_SAMPLES_PASSED: 288 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 289 case GL_SAMPLES_PASSED_ARB: 290 /* For occlusion queries, we delay taking an initial sample until the 291 * first drawing occurs in this batch. See the reasoning in the comments 292 * for brw_emit_query_begin() below. 293 * 294 * Since we're starting a new query, we need to be sure to throw away 295 * any previous occlusion query results. 296 */ 297 brw_bo_unreference(query->bo); 298 query->bo = NULL; 299 query->last_index = -1; 300 301 brw->query.obj = query; 302 303 /* Depth statistics on Gen4 require strange workarounds, so we try to 304 * avoid them when necessary. They're required for occlusion queries, 305 * so turn them on now. 306 */ 307 brw->stats_wm++; 308 brw->ctx.NewDriverState |= BRW_NEW_STATS_WM; 309 break; 310 311 default: 312 unreachable("Unrecognized query target in brw_begin_query()"); 313 } 314} 315 316/** 317 * Gen4-5 driver hook for glEndQuery(). 318 * 319 * Emits GPU commands to record a final query value, ending any data capturing. 320 * However, the final result isn't necessarily available until the GPU processes 321 * those commands. brw_queryobj_get_results() processes the captured data to 322 * produce the final result. 323 */ 324static void 325brw_end_query(struct gl_context *ctx, struct gl_query_object *q) 326{ 327 struct brw_context *brw = brw_context(ctx); 328 struct brw_query_object *query = (struct brw_query_object *)q; 329 UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo; 330 331 assert(devinfo->gen < 6); 332 333 switch (query->Base.Target) { 334 case GL_TIME_ELAPSED_EXT: 335 /* Write the final timestamp. */ 336 brw_write_timestamp(brw, query->bo, 1); 337 break; 338 339 case GL_ANY_SAMPLES_PASSED: 340 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 341 case GL_SAMPLES_PASSED_ARB: 342 343 /* No query->bo means that EndQuery was called after BeginQuery with no 344 * intervening drawing. Rather than doing nothing at all here in this 345 * case, we emit the query_begin and query_end state to the 346 * hardware. This is to guarantee that waiting on the result of this 347 * empty state will cause all previous queries to complete at all, as 348 * required by the specification: 349 * 350 * It must always be true that if any query object 351 * returns a result available of TRUE, all queries of the 352 * same type issued prior to that query must also return 353 * TRUE. [Open GL 4.3 (Core Profile) Section 4.2.1] 354 */ 355 if (!query->bo) { 356 brw_emit_query_begin(brw); 357 } 358 359 assert(query->bo); 360 361 brw_emit_query_end(brw); 362 363 brw->query.obj = NULL; 364 365 brw->stats_wm--; 366 brw->ctx.NewDriverState |= BRW_NEW_STATS_WM; 367 break; 368 369 default: 370 unreachable("Unrecognized query target in brw_end_query()"); 371 } 372} 373 374/** 375 * The Gen4-5 WaitQuery() driver hook. 376 * 377 * Wait for a query result to become available and return it. This is the 378 * backing for glGetQueryObjectiv() with the GL_QUERY_RESULT pname. 379 */ 380static void brw_wait_query(struct gl_context *ctx, struct gl_query_object *q) 381{ 382 struct brw_query_object *query = (struct brw_query_object *)q; 383 UNUSED const struct gen_device_info *devinfo = 384 &brw_context(ctx)->screen->devinfo; 385 386 assert(devinfo->gen < 6); 387 388 brw_queryobj_get_results(ctx, query); 389 query->Base.Ready = true; 390} 391 392/** 393 * The Gen4-5 CheckQuery() driver hook. 394 * 395 * Checks whether a query result is ready yet. If not, flushes. 396 * This is the backing for glGetQueryObjectiv()'s QUERY_RESULT_AVAILABLE pname. 397 */ 398static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q) 399{ 400 struct brw_context *brw = brw_context(ctx); 401 struct brw_query_object *query = (struct brw_query_object *)q; 402 UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo; 403 404 assert(devinfo->gen < 6); 405 406 /* From the GL_ARB_occlusion_query spec: 407 * 408 * "Instead of allowing for an infinite loop, performing a 409 * QUERY_RESULT_AVAILABLE_ARB will perform a flush if the result is 410 * not ready yet on the first time it is queried. This ensures that 411 * the async query will return true in finite time. 412 */ 413 if (query->bo && brw_batch_references(&brw->batch, query->bo)) 414 intel_batchbuffer_flush(brw); 415 416 if (query->bo == NULL || !brw_bo_busy(query->bo)) { 417 brw_queryobj_get_results(ctx, query); 418 query->Base.Ready = true; 419 } 420} 421 422/** 423 * Ensure there query's BO has enough space to store a new pair of values. 424 * 425 * If not, gather the existing BO's results and create a new buffer of the 426 * same size. 427 */ 428static void 429ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query) 430{ 431 struct brw_context *brw = brw_context(ctx); 432 UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo; 433 434 assert(devinfo->gen < 6); 435 436 if (!query->bo || query->last_index * 2 + 1 >= 4096 / sizeof(uint64_t)) { 437 438 if (query->bo != NULL) { 439 /* The old query BO did not have enough space, so we allocated a new 440 * one. Gather the results so far (adding up the differences) and 441 * release the old BO. 442 */ 443 brw_queryobj_get_results(ctx, query); 444 } 445 446 query->bo = brw_bo_alloc(brw->bufmgr, "query", 4096, BRW_MEMZONE_OTHER); 447 query->last_index = 0; 448 } 449} 450 451/** 452 * Record the PS_DEPTH_COUNT value (for occlusion queries) just before 453 * primitive drawing. 454 * 455 * In a pre-hardware context world, the single PS_DEPTH_COUNT register is 456 * shared among all applications using the GPU. However, our query value 457 * needs to only include fragments generated by our application/GL context. 458 * 459 * To accommodate this, we record PS_DEPTH_COUNT at the start and end of 460 * each batchbuffer (technically, the first primitive drawn and flush time). 461 * Subtracting each pair of values calculates the change in PS_DEPTH_COUNT 462 * caused by a batchbuffer. Since there is no preemption inside batches, 463 * this is guaranteed to only measure the effects of our current application. 464 * 465 * Adding each of these differences (in case drawing is done over many batches) 466 * produces the final expected value. 467 * 468 * In a world with hardware contexts, PS_DEPTH_COUNT is saved and restored 469 * as part of the context state, so this is unnecessary, and skipped. 470 */ 471void 472brw_emit_query_begin(struct brw_context *brw) 473{ 474 struct gl_context *ctx = &brw->ctx; 475 struct brw_query_object *query = brw->query.obj; 476 477 /* Skip if we're not doing any queries, or we've already recorded the 478 * initial query value for this batchbuffer. 479 */ 480 if (!query || brw->query.begin_emitted) 481 return; 482 483 ensure_bo_has_space(ctx, query); 484 485 brw_write_depth_count(brw, query->bo, query->last_index * 2); 486 487 brw->query.begin_emitted = true; 488} 489 490/** 491 * Called at batchbuffer flush to get an ending PS_DEPTH_COUNT 492 * (for non-hardware context platforms). 493 * 494 * See the explanation in brw_emit_query_begin(). 495 */ 496void 497brw_emit_query_end(struct brw_context *brw) 498{ 499 struct brw_query_object *query = brw->query.obj; 500 501 if (!brw->query.begin_emitted) 502 return; 503 504 brw_write_depth_count(brw, query->bo, query->last_index * 2 + 1); 505 506 brw->query.begin_emitted = false; 507 query->last_index++; 508} 509 510/** 511 * Driver hook for glQueryCounter(). 512 * 513 * This handles GL_TIMESTAMP queries, which perform a pipelined read of the 514 * current GPU time. This is unlike GL_TIME_ELAPSED, which measures the 515 * time while the query is active. 516 */ 517void 518brw_query_counter(struct gl_context *ctx, struct gl_query_object *q) 519{ 520 struct brw_context *brw = brw_context(ctx); 521 struct brw_query_object *query = (struct brw_query_object *) q; 522 523 assert(q->Target == GL_TIMESTAMP); 524 525 brw_bo_unreference(query->bo); 526 query->bo = 527 brw_bo_alloc(brw->bufmgr, "timestamp query", 4096, BRW_MEMZONE_OTHER); 528 brw_write_timestamp(brw, query->bo, 0); 529 530 query->flushed = false; 531} 532 533/** 534 * Read the TIMESTAMP register immediately (in a non-pipelined fashion). 535 * 536 * This is used to implement the GetTimestamp() driver hook. 537 */ 538static uint64_t 539brw_get_timestamp(struct gl_context *ctx) 540{ 541 struct brw_context *brw = brw_context(ctx); 542 const struct gen_device_info *devinfo = &brw->screen->devinfo; 543 uint64_t result = 0; 544 545 switch (brw->screen->hw_has_timestamp) { 546 case 3: /* New kernel, always full 36bit accuracy */ 547 brw_reg_read(brw->bufmgr, TIMESTAMP | 1, &result); 548 break; 549 case 2: /* 64bit kernel, result is left-shifted by 32bits, losing 4bits */ 550 brw_reg_read(brw->bufmgr, TIMESTAMP, &result); 551 result = result >> 32; 552 break; 553 case 1: /* 32bit kernel, result is 36bit wide but may be inaccurate! */ 554 brw_reg_read(brw->bufmgr, TIMESTAMP, &result); 555 break; 556 } 557 558 /* Scale to nanosecond units */ 559 result = gen_device_info_timebase_scale(devinfo, result); 560 561 /* Ensure the scaled timestamp overflows according to 562 * GL_QUERY_COUNTER_BITS. Technically this isn't required if 563 * querying GL_TIMESTAMP via glGetInteger but it seems best to keep 564 * QueryObject and GetInteger timestamps consistent. 565 */ 566 result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1; 567 return result; 568} 569 570/** 571 * Is this type of query written by PIPE_CONTROL? 572 */ 573bool 574brw_is_query_pipelined(struct brw_query_object *query) 575{ 576 switch (query->Base.Target) { 577 case GL_TIMESTAMP: 578 case GL_TIME_ELAPSED: 579 case GL_ANY_SAMPLES_PASSED: 580 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 581 case GL_SAMPLES_PASSED_ARB: 582 return true; 583 584 case GL_PRIMITIVES_GENERATED: 585 case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: 586 case GL_TRANSFORM_FEEDBACK_STREAM_OVERFLOW_ARB: 587 case GL_TRANSFORM_FEEDBACK_OVERFLOW_ARB: 588 case GL_VERTICES_SUBMITTED_ARB: 589 case GL_PRIMITIVES_SUBMITTED_ARB: 590 case GL_VERTEX_SHADER_INVOCATIONS_ARB: 591 case GL_GEOMETRY_SHADER_INVOCATIONS: 592 case GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB: 593 case GL_FRAGMENT_SHADER_INVOCATIONS_ARB: 594 case GL_CLIPPING_INPUT_PRIMITIVES_ARB: 595 case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB: 596 case GL_COMPUTE_SHADER_INVOCATIONS_ARB: 597 case GL_TESS_CONTROL_SHADER_PATCHES_ARB: 598 case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB: 599 return false; 600 601 default: 602 unreachable("Unrecognized query target in is_query_pipelined()"); 603 } 604} 605 606/* Initialize query object functions used on all generations. */ 607void brw_init_common_queryobj_functions(struct dd_function_table *functions) 608{ 609 functions->NewQueryObject = brw_new_query_object; 610 functions->DeleteQuery = brw_delete_query; 611 functions->GetTimestamp = brw_get_timestamp; 612} 613 614/* Initialize Gen4/5-specific query object functions. */ 615void gen4_init_queryobj_functions(struct dd_function_table *functions) 616{ 617 functions->BeginQuery = brw_begin_query; 618 functions->EndQuery = brw_end_query; 619 functions->CheckQuery = brw_check_query; 620 functions->WaitQuery = brw_wait_query; 621 functions->QueryCounter = brw_query_counter; 622} 623