1/* 2 * Copyright © 2008 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 * Authors: 24 * Eric Anholt <eric@anholt.net> 25 * 26 */ 27 28/** @file brw_queryobj.c 29 * 30 * Support for query objects (GL_ARB_occlusion_query, GL_ARB_timer_query, 31 * GL_EXT_transform_feedback, and friends). 32 * 33 * The hardware provides a PIPE_CONTROL command that can report the number of 34 * fragments that passed the depth test, or the hardware timer. They are 35 * appropriately synced with the stage of the pipeline for our extensions' 36 * needs. 37 */ 38#include "main/queryobj.h" 39 40#include "brw_context.h" 41#include "brw_defines.h" 42#include "brw_state.h" 43#include "brw_batch.h" 44 45/* As best we know currently, the Gen HW timestamps are 36bits across 46 * all platforms, which we need to account for when calculating a 47 * delta to measure elapsed time. 48 * 49 * The timestamps read via glGetTimestamp() / brw_get_timestamp() sometimes 50 * only have 32bits due to a kernel bug and so in that case we make sure to 51 * treat all raw timestamps as 32bits so they overflow consistently and remain 52 * comparable. (Note: the timestamps being passed here are not from the kernel 53 * so we don't need to be taking the upper 32bits in this buggy kernel case we 54 * are just clipping to 32bits here for consistency.) 55 */ 56uint64_t 57brw_raw_timestamp_delta(struct brw_context *brw, uint64_t time0, uint64_t time1) 58{ 59 if (brw->screen->hw_has_timestamp == 2) { 60 /* Kernel clips timestamps to 32bits in this case, so we also clip 61 * PIPE_CONTROL timestamps for consistency. 62 */ 63 return (uint32_t)time1 - (uint32_t)time0; 64 } else { 65 if (time0 > time1) { 66 return (1ULL << 36) + time1 - time0; 67 } else { 68 return time1 - time0; 69 } 70 } 71} 72 73/** 74 * Emit PIPE_CONTROLs to write the current GPU timestamp into a buffer. 75 */ 76void 77brw_write_timestamp(struct brw_context *brw, struct brw_bo *query_bo, int idx) 78{ 79 const struct intel_device_info *devinfo = &brw->screen->devinfo; 80 81 if (devinfo->ver == 6) { 82 /* Emit Sandybridge workaround flush: */ 83 brw_emit_pipe_control_flush(brw, 84 PIPE_CONTROL_CS_STALL | 85 PIPE_CONTROL_STALL_AT_SCOREBOARD); 86 } 87 88 uint32_t flags = PIPE_CONTROL_WRITE_TIMESTAMP; 89 90 if (devinfo->ver == 9 && devinfo->gt == 4) 91 flags |= PIPE_CONTROL_CS_STALL; 92 93 brw_emit_pipe_control_write(brw, flags, 94 query_bo, idx * sizeof(uint64_t), 0); 95} 96 97/** 98 * Emit PIPE_CONTROLs to write the PS_DEPTH_COUNT register into a buffer. 99 */ 100void 101brw_write_depth_count(struct brw_context *brw, struct brw_bo *query_bo, int idx) 102{ 103 const struct intel_device_info *devinfo = &brw->screen->devinfo; 104 uint32_t flags = PIPE_CONTROL_WRITE_DEPTH_COUNT | PIPE_CONTROL_DEPTH_STALL; 105 106 if (devinfo->ver == 9 && devinfo->gt == 4) 107 flags |= PIPE_CONTROL_CS_STALL; 108 109 if (devinfo->ver >= 10) { 110 /* "Driver must program PIPE_CONTROL with only Depth Stall Enable bit set 111 * prior to programming a PIPE_CONTROL with Write PS Depth Count Post sync 112 * operation." 113 */ 114 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL); 115 } 116 117 brw_emit_pipe_control_write(brw, flags, 118 query_bo, idx * sizeof(uint64_t), 0); 119} 120 121/** 122 * Wait on the query object's BO and calculate the final result. 123 */ 124static void 125brw_queryobj_get_results(struct gl_context *ctx, 126 struct brw_query_object *query) 127{ 128 struct brw_context *brw = brw_context(ctx); 129 UNUSED const struct intel_device_info *devinfo = &brw->screen->devinfo; 130 131 int i; 132 uint64_t *results; 133 134 assert(devinfo->ver < 6); 135 136 if (query->bo == NULL) 137 return; 138 139 /* If the application has requested the query result, but this batch is 140 * still contributing to it, flush it now so the results will be present 141 * when mapped. 142 */ 143 if (brw_batch_references(&brw->batch, query->bo)) 144 brw_batch_flush(brw); 145 146 if (unlikely(brw->perf_debug)) { 147 if (brw_bo_busy(query->bo)) { 148 perf_debug("Stalling on the GPU waiting for a query object.\n"); 149 } 150 } 151 152 results = brw_bo_map(brw, query->bo, MAP_READ); 153 switch (query->Base.Target) { 154 case GL_TIME_ELAPSED_EXT: 155 /* The query BO contains the starting and ending timestamps. 156 * Subtract the two and convert to nanoseconds. 157 */ 158 query->Base.Result = brw_raw_timestamp_delta(brw, results[0], results[1]); 159 query->Base.Result = intel_device_info_timebase_scale(devinfo, query->Base.Result); 160 break; 161 162 case GL_TIMESTAMP: 163 /* The query BO contains a single timestamp value in results[0]. */ 164 query->Base.Result = intel_device_info_timebase_scale(devinfo, results[0]); 165 166 /* Ensure the scaled timestamp overflows according to 167 * GL_QUERY_COUNTER_BITS 168 */ 169 query->Base.Result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1; 170 break; 171 172 case GL_SAMPLES_PASSED_ARB: 173 /* Loop over pairs of values from the BO, which are the PS_DEPTH_COUNT 174 * value at the start and end of the batchbuffer. Subtract them to 175 * get the number of fragments which passed the depth test in each 176 * individual batch, and add those differences up to get the number 177 * of fragments for the entire query. 178 * 179 * Note that query->Base.Result may already be non-zero. We may have 180 * run out of space in the query's BO and allocated a new one. If so, 181 * this function was already called to accumulate the results so far. 182 */ 183 for (i = 0; i < query->last_index; i++) { 184 query->Base.Result += results[i * 2 + 1] - results[i * 2]; 185 } 186 break; 187 188 case GL_ANY_SAMPLES_PASSED: 189 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 190 /* If the starting and ending PS_DEPTH_COUNT from any of the batches 191 * differ, then some fragments passed the depth test. 192 */ 193 for (i = 0; i < query->last_index; i++) { 194 if (results[i * 2 + 1] != results[i * 2]) { 195 query->Base.Result = GL_TRUE; 196 break; 197 } 198 } 199 break; 200 201 default: 202 unreachable("Unrecognized query target in brw_queryobj_get_results()"); 203 } 204 brw_bo_unmap(query->bo); 205 206 /* Now that we've processed the data stored in the query's buffer object, 207 * we can release it. 208 */ 209 brw_bo_unreference(query->bo); 210 query->bo = NULL; 211} 212 213/** 214 * The NewQueryObject() driver hook. 215 * 216 * Allocates and initializes a new query object. 217 */ 218static struct gl_query_object * 219brw_new_query_object(struct gl_context *ctx, GLuint id) 220{ 221 struct brw_query_object *query; 222 223 query = calloc(1, sizeof(struct brw_query_object)); 224 225 query->Base.Id = id; 226 query->Base.Result = 0; 227 query->Base.Active = false; 228 query->Base.Ready = true; 229 230 return &query->Base; 231} 232 233/** 234 * The DeleteQuery() driver hook. 235 */ 236static void 237brw_delete_query(struct gl_context *ctx, struct gl_query_object *q) 238{ 239 struct brw_query_object *query = (struct brw_query_object *)q; 240 241 brw_bo_unreference(query->bo); 242 _mesa_delete_query(ctx, q); 243} 244 245/** 246 * Gfx4-5 driver hook for glBeginQuery(). 247 * 248 * Initializes driver structures and emits any GPU commands required to begin 249 * recording data for the query. 250 */ 251static void 252brw_begin_query(struct gl_context *ctx, struct gl_query_object *q) 253{ 254 struct brw_context *brw = brw_context(ctx); 255 struct brw_query_object *query = (struct brw_query_object *)q; 256 UNUSED const struct intel_device_info *devinfo = &brw->screen->devinfo; 257 258 assert(devinfo->ver < 6); 259 260 switch (query->Base.Target) { 261 case GL_TIME_ELAPSED_EXT: 262 /* For timestamp queries, we record the starting time right away so that 263 * we measure the full time between BeginQuery and EndQuery. There's 264 * some debate about whether this is the right thing to do. Our decision 265 * is based on the following text from the ARB_timer_query extension: 266 * 267 * "(5) Should the extension measure total time elapsed between the full 268 * completion of the BeginQuery and EndQuery commands, or just time 269 * spent in the graphics library? 270 * 271 * RESOLVED: This extension will measure the total time elapsed 272 * between the full completion of these commands. Future extensions 273 * may implement a query to determine time elapsed at different stages 274 * of the graphics pipeline." 275 * 276 * We write a starting timestamp now (at index 0). At EndQuery() time, 277 * we'll write a second timestamp (at index 1), and subtract the two to 278 * obtain the time elapsed. Notably, this includes time elapsed while 279 * the system was doing other work, such as running other applications. 280 */ 281 brw_bo_unreference(query->bo); 282 query->bo = 283 brw_bo_alloc(brw->bufmgr, "timer query", 4096, BRW_MEMZONE_OTHER); 284 brw_write_timestamp(brw, query->bo, 0); 285 break; 286 287 case GL_ANY_SAMPLES_PASSED: 288 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 289 case GL_SAMPLES_PASSED_ARB: 290 /* For occlusion queries, we delay taking an initial sample until the 291 * first drawing occurs in this batch. See the reasoning in the comments 292 * for brw_emit_query_begin() below. 293 * 294 * Since we're starting a new query, we need to be sure to throw away 295 * any previous occlusion query results. 296 */ 297 brw_bo_unreference(query->bo); 298 query->bo = NULL; 299 query->last_index = -1; 300 301 brw->query.obj = query; 302 303 /* Depth statistics on Gfx4 require strange workarounds, so we try to 304 * avoid them when necessary. They're required for occlusion queries, 305 * so turn them on now. 306 */ 307 brw->stats_wm++; 308 brw->ctx.NewDriverState |= BRW_NEW_STATS_WM; 309 break; 310 311 default: 312 unreachable("Unrecognized query target in brw_begin_query()"); 313 } 314} 315 316/** 317 * Gfx4-5 driver hook for glEndQuery(). 318 * 319 * Emits GPU commands to record a final query value, ending any data capturing. 320 * However, the final result isn't necessarily available until the GPU processes 321 * those commands. brw_queryobj_get_results() processes the captured data to 322 * produce the final result. 323 */ 324static void 325brw_end_query(struct gl_context *ctx, struct gl_query_object *q) 326{ 327 struct brw_context *brw = brw_context(ctx); 328 struct brw_query_object *query = (struct brw_query_object *)q; 329 UNUSED const struct intel_device_info *devinfo = &brw->screen->devinfo; 330 331 assert(devinfo->ver < 6); 332 333 switch (query->Base.Target) { 334 case GL_TIME_ELAPSED_EXT: 335 /* Write the final timestamp. */ 336 brw_write_timestamp(brw, query->bo, 1); 337 break; 338 339 case GL_ANY_SAMPLES_PASSED: 340 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 341 case GL_SAMPLES_PASSED_ARB: 342 343 /* No query->bo means that EndQuery was called after BeginQuery with no 344 * intervening drawing. Rather than doing nothing at all here in this 345 * case, we emit the query_begin and query_end state to the 346 * hardware. This is to guarantee that waiting on the result of this 347 * empty state will cause all previous queries to complete at all, as 348 * required by the OpenGL 4.3 (Core Profile) spec, section 4.2.1: 349 * 350 * "It must always be true that if any query object returns 351 * a result available of TRUE, all queries of the same type 352 * issued prior to that query must also return TRUE." 353 */ 354 if (!query->bo) { 355 brw_emit_query_begin(brw); 356 } 357 358 assert(query->bo); 359 360 brw_emit_query_end(brw); 361 362 brw->query.obj = NULL; 363 364 brw->stats_wm--; 365 brw->ctx.NewDriverState |= BRW_NEW_STATS_WM; 366 break; 367 368 default: 369 unreachable("Unrecognized query target in brw_end_query()"); 370 } 371} 372 373/** 374 * The Gfx4-5 WaitQuery() driver hook. 375 * 376 * Wait for a query result to become available and return it. This is the 377 * backing for glGetQueryObjectiv() with the GL_QUERY_RESULT pname. 378 */ 379static void brw_wait_query(struct gl_context *ctx, struct gl_query_object *q) 380{ 381 struct brw_query_object *query = (struct brw_query_object *)q; 382 UNUSED const struct intel_device_info *devinfo = 383 &brw_context(ctx)->screen->devinfo; 384 385 assert(devinfo->ver < 6); 386 387 brw_queryobj_get_results(ctx, query); 388 query->Base.Ready = true; 389} 390 391/** 392 * The Gfx4-5 CheckQuery() driver hook. 393 * 394 * Checks whether a query result is ready yet. If not, flushes. 395 * This is the backing for glGetQueryObjectiv()'s QUERY_RESULT_AVAILABLE pname. 396 */ 397static void brw_check_query(struct gl_context *ctx, struct gl_query_object *q) 398{ 399 struct brw_context *brw = brw_context(ctx); 400 struct brw_query_object *query = (struct brw_query_object *)q; 401 UNUSED const struct intel_device_info *devinfo = &brw->screen->devinfo; 402 403 assert(devinfo->ver < 6); 404 405 /* From the GL_ARB_occlusion_query spec: 406 * 407 * "Instead of allowing for an infinite loop, performing a 408 * QUERY_RESULT_AVAILABLE_ARB will perform a flush if the result is 409 * not ready yet on the first time it is queried. This ensures that 410 * the async query will return true in finite time. 411 */ 412 if (query->bo && brw_batch_references(&brw->batch, query->bo)) 413 brw_batch_flush(brw); 414 415 if (query->bo == NULL || !brw_bo_busy(query->bo)) { 416 brw_queryobj_get_results(ctx, query); 417 query->Base.Ready = true; 418 } 419} 420 421/** 422 * Ensure there query's BO has enough space to store a new pair of values. 423 * 424 * If not, gather the existing BO's results and create a new buffer of the 425 * same size. 426 */ 427static void 428ensure_bo_has_space(struct gl_context *ctx, struct brw_query_object *query) 429{ 430 struct brw_context *brw = brw_context(ctx); 431 UNUSED const struct intel_device_info *devinfo = &brw->screen->devinfo; 432 433 assert(devinfo->ver < 6); 434 435 if (!query->bo || query->last_index * 2 + 1 >= 4096 / sizeof(uint64_t)) { 436 437 if (query->bo != NULL) { 438 /* The old query BO did not have enough space, so we allocated a new 439 * one. Gather the results so far (adding up the differences) and 440 * release the old BO. 441 */ 442 brw_queryobj_get_results(ctx, query); 443 } 444 445 query->bo = brw_bo_alloc(brw->bufmgr, "query", 4096, BRW_MEMZONE_OTHER); 446 query->last_index = 0; 447 } 448} 449 450/** 451 * Record the PS_DEPTH_COUNT value (for occlusion queries) just before 452 * primitive drawing. 453 * 454 * In a pre-hardware context world, the single PS_DEPTH_COUNT register is 455 * shared among all applications using the GPU. However, our query value 456 * needs to only include fragments generated by our application/GL context. 457 * 458 * To accommodate this, we record PS_DEPTH_COUNT at the start and end of 459 * each batchbuffer (technically, the first primitive drawn and flush time). 460 * Subtracting each pair of values calculates the change in PS_DEPTH_COUNT 461 * caused by a batchbuffer. Since there is no preemption inside batches, 462 * this is guaranteed to only measure the effects of our current application. 463 * 464 * Adding each of these differences (in case drawing is done over many batches) 465 * produces the final expected value. 466 * 467 * In a world with hardware contexts, PS_DEPTH_COUNT is saved and restored 468 * as part of the context state, so this is unnecessary, and skipped. 469 */ 470void 471brw_emit_query_begin(struct brw_context *brw) 472{ 473 struct gl_context *ctx = &brw->ctx; 474 struct brw_query_object *query = brw->query.obj; 475 476 /* Skip if we're not doing any queries, or we've already recorded the 477 * initial query value for this batchbuffer. 478 */ 479 if (!query || brw->query.begin_emitted) 480 return; 481 482 ensure_bo_has_space(ctx, query); 483 484 brw_write_depth_count(brw, query->bo, query->last_index * 2); 485 486 brw->query.begin_emitted = true; 487} 488 489/** 490 * Called at batchbuffer flush to get an ending PS_DEPTH_COUNT 491 * (for non-hardware context platforms). 492 * 493 * See the explanation in brw_emit_query_begin(). 494 */ 495void 496brw_emit_query_end(struct brw_context *brw) 497{ 498 struct brw_query_object *query = brw->query.obj; 499 500 if (!brw->query.begin_emitted) 501 return; 502 503 brw_write_depth_count(brw, query->bo, query->last_index * 2 + 1); 504 505 brw->query.begin_emitted = false; 506 query->last_index++; 507} 508 509/** 510 * Driver hook for glQueryCounter(). 511 * 512 * This handles GL_TIMESTAMP queries, which perform a pipelined read of the 513 * current GPU time. This is unlike GL_TIME_ELAPSED, which measures the 514 * time while the query is active. 515 */ 516void 517brw_query_counter(struct gl_context *ctx, struct gl_query_object *q) 518{ 519 struct brw_context *brw = brw_context(ctx); 520 struct brw_query_object *query = (struct brw_query_object *) q; 521 522 assert(q->Target == GL_TIMESTAMP); 523 524 brw_bo_unreference(query->bo); 525 query->bo = 526 brw_bo_alloc(brw->bufmgr, "timestamp query", 4096, BRW_MEMZONE_OTHER); 527 brw_write_timestamp(brw, query->bo, 0); 528 529 query->flushed = false; 530} 531 532/** 533 * Read the TIMESTAMP register immediately (in a non-pipelined fashion). 534 * 535 * This is used to implement the GetTimestamp() driver hook. 536 */ 537static uint64_t 538brw_get_timestamp(struct gl_context *ctx) 539{ 540 struct brw_context *brw = brw_context(ctx); 541 const struct intel_device_info *devinfo = &brw->screen->devinfo; 542 uint64_t result = 0; 543 544 switch (brw->screen->hw_has_timestamp) { 545 case 3: /* New kernel, always full 36bit accuracy */ 546 brw_reg_read(brw->bufmgr, TIMESTAMP | 1, &result); 547 break; 548 case 2: /* 64bit kernel, result is left-shifted by 32bits, losing 4bits */ 549 brw_reg_read(brw->bufmgr, TIMESTAMP, &result); 550 result = result >> 32; 551 break; 552 case 1: /* 32bit kernel, result is 36bit wide but may be inaccurate! */ 553 brw_reg_read(brw->bufmgr, TIMESTAMP, &result); 554 break; 555 } 556 557 /* Scale to nanosecond units */ 558 result = intel_device_info_timebase_scale(devinfo, result); 559 560 /* Ensure the scaled timestamp overflows according to 561 * GL_QUERY_COUNTER_BITS. Technically this isn't required if 562 * querying GL_TIMESTAMP via glGetInteger but it seems best to keep 563 * QueryObject and GetInteger timestamps consistent. 564 */ 565 result &= (1ull << ctx->Const.QueryCounterBits.Timestamp) - 1; 566 return result; 567} 568 569/** 570 * Is this type of query written by PIPE_CONTROL? 571 */ 572bool 573brw_is_query_pipelined(struct brw_query_object *query) 574{ 575 switch (query->Base.Target) { 576 case GL_TIMESTAMP: 577 case GL_TIME_ELAPSED: 578 case GL_ANY_SAMPLES_PASSED: 579 case GL_ANY_SAMPLES_PASSED_CONSERVATIVE: 580 case GL_SAMPLES_PASSED_ARB: 581 return true; 582 583 case GL_PRIMITIVES_GENERATED: 584 case GL_TRANSFORM_FEEDBACK_PRIMITIVES_WRITTEN: 585 case GL_TRANSFORM_FEEDBACK_STREAM_OVERFLOW_ARB: 586 case GL_TRANSFORM_FEEDBACK_OVERFLOW_ARB: 587 case GL_VERTICES_SUBMITTED_ARB: 588 case GL_PRIMITIVES_SUBMITTED_ARB: 589 case GL_VERTEX_SHADER_INVOCATIONS_ARB: 590 case GL_GEOMETRY_SHADER_INVOCATIONS: 591 case GL_GEOMETRY_SHADER_PRIMITIVES_EMITTED_ARB: 592 case GL_FRAGMENT_SHADER_INVOCATIONS_ARB: 593 case GL_CLIPPING_INPUT_PRIMITIVES_ARB: 594 case GL_CLIPPING_OUTPUT_PRIMITIVES_ARB: 595 case GL_COMPUTE_SHADER_INVOCATIONS_ARB: 596 case GL_TESS_CONTROL_SHADER_PATCHES_ARB: 597 case GL_TESS_EVALUATION_SHADER_INVOCATIONS_ARB: 598 return false; 599 600 default: 601 unreachable("Unrecognized query target in is_query_pipelined()"); 602 } 603} 604 605/* Initialize query object functions used on all generations. */ 606void brw_init_common_queryobj_functions(struct dd_function_table *functions) 607{ 608 functions->NewQueryObject = brw_new_query_object; 609 functions->DeleteQuery = brw_delete_query; 610 functions->GetTimestamp = brw_get_timestamp; 611} 612 613/* Initialize Gfx4/5-specific query object functions. */ 614void gfx4_init_queryobj_functions(struct dd_function_table *functions) 615{ 616 functions->BeginQuery = brw_begin_query; 617 functions->EndQuery = brw_end_query; 618 functions->CheckQuery = brw_check_query; 619 functions->WaitQuery = brw_wait_query; 620 functions->QueryCounter = brw_query_counter; 621} 622