/*
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "brw_batch.h"
#include "brw_buffer_objects.h"
#include "brw_bufmgr.h"
#include "brw_buffers.h"
#include "brw_fbo.h"
#include "brw_context.h"
#include "brw_defines.h"
#include "brw_state.h"
#include "common/intel_decoder.h"
#include "common/intel_gem.h"

#include "util/hash_table.h"

#include <xf86drm.h>
#include "drm-uapi/i915_drm.h"

#define FILE_DEBUG_FLAG DEBUG_BUFMGR

/**
 * Target sizes of the batch and state buffers. We create the initial
 * buffers at these sizes, and flush when they're nearly full. If we
 * underestimate how close we are to the end, and suddenly need more space
 * in the middle of a draw, we can grow the buffers, and finish the draw.
 * At that point, we'll be over our target size, so the next operation
 * should flush. Each time we flush the batch, we recreate both buffers
 * at the original target size, so it doesn't grow without bound.
 */
#define BATCH_SZ (20 * 1024)
#define STATE_SZ (16 * 1024)

static void
brw_batch_reset(struct brw_context *brw);
static void
brw_new_batch(struct brw_context *brw);

static unsigned
num_fences(struct brw_batch *batch)
{
   return util_dynarray_num_elements(&batch->exec_fences,
                                     struct drm_i915_gem_exec_fence);
}

static void
dump_validation_list(struct brw_batch *batch)
{
   fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);

   for (int i = 0; i < batch->exec_count; i++) {
      uint64_t flags = batch->validation_list[i].flags;
      assert(batch->validation_list[i].handle ==
             batch->exec_bos[i]->gem_handle);
      fprintf(stderr, "[%2d]: %2d %-14s %p %s%-7s @ 0x%"PRIx64"%s (%"PRIu64"B)\n",
              i,
              batch->validation_list[i].handle,
              batch->exec_bos[i]->name,
              batch->exec_bos[i],
              (flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) ? "(48b" : "(32b",
              (flags & EXEC_OBJECT_WRITE) ? " write)" : ")",
              (uint64_t)batch->validation_list[i].offset,
              (flags & EXEC_OBJECT_PINNED) ? " (pinned)" : "",
              batch->exec_bos[i]->size);
   }
}

static struct intel_batch_decode_bo
decode_get_bo(void *v_brw, bool ppgtt, uint64_t address)
{
   struct brw_context *brw = v_brw;
   struct brw_batch *batch = &brw->batch;

   for (int i = 0; i < batch->exec_count; i++) {
      struct brw_bo *bo = batch->exec_bos[i];
      /* The decoder zeroes out the top 16 bits, so we need to as well */
      uint64_t bo_address = bo->gtt_offset & (~0ull >> 16);

      if (address >= bo_address && address < bo_address + bo->size) {
         return (struct intel_batch_decode_bo) {
            .addr = bo_address,
            .size = bo->size,
            .map = brw_bo_map(brw, bo, MAP_READ),
         };
      }
   }

   return (struct intel_batch_decode_bo) { };
}

static unsigned
decode_get_state_size(void *v_brw, uint64_t address, uint64_t base_address)
{
   struct brw_context *brw = v_brw;
   struct brw_batch *batch = &brw->batch;
   unsigned size = (uintptr_t)
      _mesa_hash_table_u64_search(batch->state_batch_sizes,
                                  address - base_address);
   return size;
}

static void
init_reloc_list(struct brw_reloc_list *rlist, int count)
{
   rlist->reloc_count = 0;
   rlist->reloc_array_size = count;
   rlist->relocs = malloc(rlist->reloc_array_size *
                          sizeof(struct drm_i915_gem_relocation_entry));
}

void
brw_batch_init(struct brw_context *brw)
{
   struct brw_screen *screen = brw->screen;
   struct brw_batch *batch = &brw->batch;
   const struct intel_device_info *devinfo = &screen->devinfo;

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      /* The shadow doesn't get relocs written so state decode fails. */
      batch->use_shadow_copy = false;
   } else
      batch->use_shadow_copy = !devinfo->has_llc;

   init_reloc_list(&batch->batch_relocs, 250);
   init_reloc_list(&batch->state_relocs, 250);

   batch->batch.map = NULL;
   batch->state.map = NULL;
   batch->exec_count = 0;
   batch->exec_array_size = 100;
   batch->exec_bos =
      malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
   batch->validation_list =
      malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
   batch->contains_fence_signal = false;

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      batch->state_batch_sizes =
         _mesa_hash_table_u64_create(NULL);

      const unsigned decode_flags =
         INTEL_BATCH_DECODE_FULL |
         (INTEL_DEBUG(DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) |
         INTEL_BATCH_DECODE_OFFSETS |
         INTEL_BATCH_DECODE_FLOATS;

      intel_batch_decode_ctx_init(&batch->decoder, devinfo, stderr,
                                  decode_flags, NULL, decode_get_bo,
                                  decode_get_state_size, brw);
      batch->decoder.max_vbo_decoded_lines = 100;
   }

   batch->use_batch_first =
      screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;

   /* PIPE_CONTROL needs a w/a but only on gfx6 */
   batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
   if (devinfo->ver == 6)
      batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;

   brw_batch_reset(brw);
}

/* Force a single read of a value that may be updated concurrently (such as
 * bo->index, which another batch sharing the BO may change under us).
 */
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))

static unsigned
add_exec_bo(struct brw_batch *batch, struct brw_bo *bo)
{
   assert(bo->bufmgr == batch->batch.bo->bufmgr);

   unsigned index = READ_ONCE(bo->index);

   if (index < batch->exec_count && batch->exec_bos[index] == bo)
      return index;

   /* May have been shared between multiple active batches */
   for (index = 0; index < batch->exec_count; index++) {
      if (batch->exec_bos[index] == bo)
         return index;
   }

   brw_bo_reference(bo);

   if (batch->exec_count == batch->exec_array_size) {
      batch->exec_array_size *= 2;
      batch->exec_bos =
         realloc(batch->exec_bos,
                 batch->exec_array_size * sizeof(batch->exec_bos[0]));
      batch->validation_list =
         realloc(batch->validation_list,
                 batch->exec_array_size * sizeof(batch->validation_list[0]));
   }

   batch->validation_list[batch->exec_count] =
      (struct drm_i915_gem_exec_object2) {
         .handle = bo->gem_handle,
         .offset = bo->gtt_offset,
         .flags = bo->kflags,
      };

   bo->index = batch->exec_count;
   batch->exec_bos[batch->exec_count] = bo;
   batch->aperture_space += bo->size;

   return batch->exec_count++;
}

static void
recreate_growing_buffer(struct brw_context *brw,
                        struct brw_growing_bo *grow,
                        const char *name, unsigned size,
                        enum brw_memory_zone memzone)
{
   struct brw_screen *screen = brw->screen;
   struct brw_batch *batch = &brw->batch;
   struct brw_bufmgr *bufmgr = screen->bufmgr;

   /* We can't grow buffers when using softpin, so just overallocate them. */
   if (brw_using_softpin(bufmgr))
      size *= 2;

   grow->bo = brw_bo_alloc(bufmgr, name, size, memzone);
   grow->bo->kflags |= can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
   grow->partial_bo = NULL;
   grow->partial_bo_map = NULL;
   grow->partial_bytes = 0;
   grow->memzone = memzone;

   if (batch->use_shadow_copy)
      grow->map = realloc(grow->map, grow->bo->size);
   else
      grow->map = brw_bo_map(brw, grow->bo, MAP_READ | MAP_WRITE);
}

static void
brw_batch_reset(struct brw_context *brw)
{
   struct brw_batch *batch = &brw->batch;

   if (batch->last_bo != NULL) {
      brw_bo_unreference(batch->last_bo);
      batch->last_bo = NULL;
   }
   batch->last_bo = batch->batch.bo;

   recreate_growing_buffer(brw, &batch->batch, "batchbuffer", BATCH_SZ,
                           BRW_MEMZONE_OTHER);
   batch->map_next = batch->batch.map;

   recreate_growing_buffer(brw, &batch->state, "statebuffer", STATE_SZ,
                           BRW_MEMZONE_DYNAMIC);

   /* Avoid making 0 a valid state offset - otherwise the decoder will try
    * and decode data when we use offset 0 as a null pointer.
    */
   batch->state_used = 1;

   add_exec_bo(batch, batch->batch.bo);
   assert(batch->batch.bo->index == 0);

   batch->needs_sol_reset = false;
   batch->state_base_address_emitted = false;

   if (batch->state_batch_sizes)
      _mesa_hash_table_u64_clear(batch->state_batch_sizes);

   /* Always add workaround_bo which contains a driver identifier to be
    * recorded in error states.
    */
   struct brw_bo *identifier_bo = brw->workaround_bo;
   if (identifier_bo)
      add_exec_bo(batch, identifier_bo);

   if (batch->contains_fence_signal)
      batch->contains_fence_signal = false;
}

static void
brw_batch_reset_and_clear_render_cache(struct brw_context *brw)
{
   brw_batch_reset(brw);
   brw_cache_sets_clear(brw);
}

void
brw_batch_save_state(struct brw_context *brw)
{
   brw->batch.saved.map_next = brw->batch.map_next;
   brw->batch.saved.batch_reloc_count = brw->batch.batch_relocs.reloc_count;
   brw->batch.saved.state_reloc_count = brw->batch.state_relocs.reloc_count;
   brw->batch.saved.exec_count = brw->batch.exec_count;
}

bool
brw_batch_saved_state_is_empty(struct brw_context *brw)
{
   struct brw_batch *batch = &brw->batch;
   return (batch->saved.map_next == batch->batch.map);
}

void
brw_batch_reset_to_saved(struct brw_context *brw)
{
   for (int i = brw->batch.saved.exec_count;
        i < brw->batch.exec_count; i++) {
      brw_bo_unreference(brw->batch.exec_bos[i]);
   }
   brw->batch.batch_relocs.reloc_count = brw->batch.saved.batch_reloc_count;
   brw->batch.state_relocs.reloc_count = brw->batch.saved.state_reloc_count;
   brw->batch.exec_count = brw->batch.saved.exec_count;

   brw->batch.map_next = brw->batch.saved.map_next;
   if (USED_BATCH(brw->batch) == 0)
      brw_new_batch(brw);
}

void
brw_batch_free(struct brw_batch *batch)
{
   if (batch->use_shadow_copy) {
      free(batch->batch.map);
      free(batch->state.map);
   }

   for (int i = 0; i < batch->exec_count; i++) {
      brw_bo_unreference(batch->exec_bos[i]);
   }
   free(batch->batch_relocs.relocs);
   free(batch->state_relocs.relocs);
   free(batch->exec_bos);
   free(batch->validation_list);

   brw_bo_unreference(batch->last_bo);
   brw_bo_unreference(batch->batch.bo);
   brw_bo_unreference(batch->state.bo);
   if (batch->state_batch_sizes) {
      _mesa_hash_table_u64_destroy(batch->state_batch_sizes);
      intel_batch_decode_ctx_finish(&batch->decoder);
   }
}

/**
 * Finish copying the old batch/state buffer's contents to the new one
 * after we tried to "grow" the buffer in an earlier operation.
 */
static void
finish_growing_bos(struct brw_growing_bo *grow)
{
   struct brw_bo *old_bo = grow->partial_bo;
   if (!old_bo)
      return;

   memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);

   grow->partial_bo = NULL;
   grow->partial_bo_map = NULL;
   grow->partial_bytes = 0;

   brw_bo_unreference(old_bo);
}

static void
replace_bo_in_reloc_list(struct brw_reloc_list *rlist,
                         uint32_t old_handle, uint32_t new_handle)
{
   for (int i = 0; i < rlist->reloc_count; i++) {
      if (rlist->relocs[i].target_handle == old_handle)
         rlist->relocs[i].target_handle = new_handle;
   }
}

/**
 * Grow either the batch or state buffer to a new larger size.
 *
 * We can't actually grow buffers, so we allocate a new one, copy over
 * the existing contents, and update our lists to refer to the new one.
 *
 * Note that this is only temporary - each new batch recreates the buffers
 * at their original target size (BATCH_SZ or STATE_SZ).
 */
static void
grow_buffer(struct brw_context *brw,
            struct brw_growing_bo *grow,
            unsigned existing_bytes,
            unsigned new_size)
{
   struct brw_batch *batch = &brw->batch;
   struct brw_bufmgr *bufmgr = brw->bufmgr;
   struct brw_bo *bo = grow->bo;

   /* We can't grow buffers that are softpinned, as the growing mechanism
    * involves putting a larger buffer at the same gtt_offset...and we've
    * only allocated the smaller amount of VMA. Without relocations, this
    * simply won't work. This should never happen, however.
    */
   assert(!(bo->kflags & EXEC_OBJECT_PINNED));

   perf_debug("Growing %s - ran out of space\n", bo->name);

   if (grow->partial_bo) {
      /* We've already grown once, and now we need to do it again.
       * Finish our last grow operation so we can start a new one.
       * This should basically never happen.
       */
      perf_debug("Had to grow multiple times");
      finish_growing_bos(grow);
   }

   struct brw_bo *new_bo =
      brw_bo_alloc(bufmgr, bo->name, new_size, grow->memzone);

   /* Copy existing data to the new larger buffer */
   grow->partial_bo_map = grow->map;

   if (batch->use_shadow_copy) {
      /* We can't safely use realloc, as it may move the existing buffer,
       * breaking existing pointers the caller may still be using. Just
       * malloc a new copy and memcpy it like the normal BO path.
       *
       * Use new_bo->size rather than new_size because the bufmgr may have
       * rounded up the size, and we want the shadow size to match.
       */
      grow->map = malloc(new_bo->size);
   } else {
      grow->map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE);
   }

   /* Try to put the new BO at the same GTT offset as the old BO (which
    * we're throwing away, so it doesn't need to be there).
    *
    * This guarantees that our relocations continue to work: values we've
    * already written into the buffer, values we're going to write into the
    * buffer, and the validation/relocation lists all will match.
    *
    * Also preserve kflags for EXEC_OBJECT_CAPTURE.
    */
   new_bo->gtt_offset = bo->gtt_offset;
   new_bo->index = bo->index;
   new_bo->kflags = bo->kflags;

   /* Batch/state buffers are per-context, and if we've run out of space,
    * we must have actually used them before, so...they will be in the list.
    */
   assert(bo->index < batch->exec_count);
   assert(batch->exec_bos[bo->index] == bo);

   /* Update the validation list to use the new BO. */
   batch->validation_list[bo->index].handle = new_bo->gem_handle;

   if (!batch->use_batch_first) {
      /* We're not using I915_EXEC_HANDLE_LUT, which means we need to go
       * update the relocation list entries to point at the new BO as well.
       * (With newer kernels, the "handle" is an offset into the validation
       * list, which remains unchanged, so we can skip this.)
       */
      replace_bo_in_reloc_list(&batch->batch_relocs,
                               bo->gem_handle, new_bo->gem_handle);
      replace_bo_in_reloc_list(&batch->state_relocs,
                               bo->gem_handle, new_bo->gem_handle);
   }

   /* Exchange the two BOs...without breaking pointers to the old BO.
    *
    * Consider this scenario:
    *
    * 1. Somebody calls brw_state_batch() to get a region of memory, and
    *    then creates a brw_address pointing to brw->batch.state.bo.
    * 2. They then call brw_state_batch() a second time, which happens to
    *    grow and replace the state buffer. They then try to emit a
    *    relocation to their first section of memory.
    *
    * If we replace the brw->batch.state.bo pointer at step 2, we would
    * break the address created in step 1. They'd have a pointer to the
    * old destroyed BO. Emitting a relocation would add this dead BO to
    * the validation list...causing /both/ statebuffers to be in the list,
    * and all kinds of disasters.
    *
    * This is not a contrived case - BLORP vertex data upload hits this.
    *
    * There are worse scenarios too. Fences for GL sync objects reference
    * brw->batch.batch.bo. If we replaced the batch pointer when growing,
    * we'd need to chase down every fence and update it to point to the
    * new BO. Otherwise, it would refer to a "batch" that never actually
    * gets submitted, and would fail to trigger.
    *
    * To work around both of these issues, we transmutate the buffers in
    * place, making the existing struct brw_bo represent the new buffer,
    * and "new_bo" represent the old BO. This is highly unusual, but it
    * seems like a necessary evil.
    *
    * We also defer the memcpy of the existing batch's contents. Callers
    * may make multiple brw_state_batch calls, and retain pointers to the
    * old BO's map. We'll perform the memcpy in finish_growing_bos() when
    * we finally submit the batch, at which point we've finished uploading
    * state, and nobody should have any old references anymore.
    *
    * To do that, we keep a reference to the old BO in grow->partial_bo,
    * and store the number of bytes to copy in grow->partial_bytes. We
    * can monkey with the refcounts directly without atomics because these
    * are per-context BOs and they can only be touched by this thread.
    */
   assert(new_bo->refcount == 1);
   new_bo->refcount = bo->refcount;
   bo->refcount = 1;

   assert(list_is_empty(&bo->exports));
   assert(list_is_empty(&new_bo->exports));

   struct brw_bo tmp;
   memcpy(&tmp, bo, sizeof(struct brw_bo));
   memcpy(bo, new_bo, sizeof(struct brw_bo));
   memcpy(new_bo, &tmp, sizeof(struct brw_bo));

   list_inithead(&bo->exports);
   list_inithead(&new_bo->exports);

   grow->partial_bo = new_bo; /* the one reference of the OLD bo */
   grow->partial_bytes = existing_bytes;
}

void
brw_batch_require_space(struct brw_context *brw, GLuint sz)
{
   struct brw_batch *batch = &brw->batch;

   const unsigned batch_used = USED_BATCH(*batch) * 4;
   if (batch_used + sz >= BATCH_SZ && !batch->no_wrap) {
      brw_batch_flush(brw);
   } else if (batch_used + sz >= batch->batch.bo->size) {
      const unsigned new_size =
         MIN2(batch->batch.bo->size + batch->batch.bo->size / 2,
              MAX_BATCH_SIZE);
      grow_buffer(brw, &batch->batch, batch_used, new_size);
      batch->map_next = (void *) batch->batch.map + batch_used;
      assert(batch_used + sz < batch->batch.bo->size);
   }
}

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Unreference any BOs held by the previous batch, and reset counts. */
   for (int i = 0; i < brw->batch.exec_count; i++) {
      brw_bo_unreference(brw->batch.exec_bos[i]);
      brw->batch.exec_bos[i] = NULL;
   }
   brw->batch.batch_relocs.reloc_count = 0;
   brw->batch.state_relocs.reloc_count = 0;
   brw->batch.exec_count = 0;
   brw->batch.aperture_space = 0;

   brw_bo_unreference(brw->batch.state.bo);

   /* Create a new batchbuffer and reset the associated state: */
   brw_batch_reset_and_clear_render_cache(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch. Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == 0) {
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
      brw_upload_invariant_state(brw);
   }

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->ib.index_size = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds. We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG(DEBUG_SHADER_TIME))
      brw_collect_and_report_shader_time(brw);

   brw_batch_maybe_noop(brw);
}

/**
 * Called from brw_batch_flush before emitting MI_BATCH_BUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   brw->batch.no_wrap = true;

   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   /* Work around L3 state leaking into contexts that set MI_RESTORE_INHIBIT
    * and therefore assume that the L3 cache is configured according to the
    * hardware defaults. On Kernel 4.16+, we no longer need to do this.
    */
   if (devinfo->ver >= 7 &&
       !(brw->screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION))
      gfx7_restore_default_l3_config(brw);

   if (devinfo->is_haswell) {
      /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
       * 3DSTATE_CC_STATE_POINTERS > "Note":
       *
       *    "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
       *     3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
       *
       * From the example in the docs, it seems to expect a regular pipe control
       * flush here as well. We may have done it already, but meh.
       *
       * See also WaAvoidRCZCounterRollover.
       */
      brw_emit_mi_flush(brw);
      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
      OUT_BATCH(brw->cc.state_offset | 1);
      ADVANCE_BATCH();
      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                       PIPE_CONTROL_CS_STALL);
   }

   /* Do not restore push constant packets during context restore. */
   if (devinfo->ver >= 7)
      gfx7_emit_isp_disable(brw);

   /* Emit MI_BATCH_BUFFER_END to finish our batch. Note that execbuf2
    * requires our batch size to be QWord aligned, so we pad it out if
    * necessary by emitting an extra MI_NOOP after the end.
    */
   brw_batch_require_space(brw, 8);
   *brw->batch.map_next++ = MI_BATCH_BUFFER_END;
   if (USED_BATCH(brw->batch) & 1) {
      *brw->batch.map_next++ = MI_NOOP;
   }

   brw->batch.no_wrap = false;
}

static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame. At this point the
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling) {
            brw_bo_wait_rendering(brw->throttle_batch[1]);
         }
         brw_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      drmCommandNone(brw->screen->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

static int
execbuffer(int fd,
           struct brw_batch *batch,
           uint32_t ctx_id,
           int used,
           int in_fence,
           int *out_fence,
           int flags)
{
   struct drm_i915_gem_execbuffer2 execbuf = {
      .buffers_ptr = (uintptr_t) batch->validation_list,
      .buffer_count = batch->exec_count,
      .batch_start_offset = 0,
      .batch_len = used,
      .flags = flags,
      .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
   };

   unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;

   if (in_fence != -1) {
      execbuf.rsvd2 = in_fence;
      execbuf.flags |= I915_EXEC_FENCE_IN;
   }

   if (out_fence != NULL) {
      cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
      *out_fence = -1;
      execbuf.flags |= I915_EXEC_FENCE_OUT;
   }

   if (num_fences(batch)) {
      execbuf.flags |= I915_EXEC_FENCE_ARRAY;
      execbuf.num_cliprects = num_fences(batch);
      execbuf.cliprects_ptr =
         (uintptr_t)util_dynarray_begin(&batch->exec_fences);
   }

   int ret = drmIoctl(fd, cmd, &execbuf);
   if (ret != 0)
      ret = -errno;

   for (int i = 0; i < batch->exec_count; i++) {
      struct brw_bo *bo = batch->exec_bos[i];

      bo->idle = false;
      bo->index = -1;

      /* Update brw_bo::gtt_offset */
      if (batch->validation_list[i].offset != bo->gtt_offset) {
         DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
             bo->gem_handle, bo->gtt_offset,
             (uint64_t)batch->validation_list[i].offset);
         assert(!(bo->kflags & EXEC_OBJECT_PINNED));
         bo->gtt_offset = batch->validation_list[i].offset;
      }
   }

   if (ret == 0 && out_fence != NULL)
      *out_fence = execbuf.rsvd2 >> 32;

   return ret;
}

static int
submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
{
   struct brw_batch *batch = &brw->batch;
   int ret = 0;

   if (batch->use_shadow_copy) {
      void *bo_map = brw_bo_map(brw, batch->batch.bo, MAP_WRITE);
      memcpy(bo_map, batch->batch.map, 4 * USED_BATCH(*batch));

      bo_map = brw_bo_map(brw, batch->state.bo, MAP_WRITE);
      memcpy(bo_map, batch->state.map, batch->state_used);
   }

   brw_bo_unmap(batch->batch.bo);
   brw_bo_unmap(batch->state.bo);

   if (!brw->screen->devinfo.no_hw) {
      /* The requirements for using I915_EXEC_NO_RELOC are:
       *
       *   The addresses written in the objects must match the corresponding
       *   reloc.gtt_offset which in turn must match the corresponding
       *   execobject.offset.
       *
       *   Any render targets written to in the batch must be flagged with
       *   EXEC_OBJECT_WRITE.
       *
       *   To avoid stalling, execobject.offset should match the current
       *   address of that object within the active context.
       */
      int flags = I915_EXEC_NO_RELOC | I915_EXEC_RENDER;

      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      /* Set statebuffer relocations */
      const unsigned state_index = batch->state.bo->index;
      if (state_index < batch->exec_count &&
          batch->exec_bos[state_index] == batch->state.bo) {
         struct drm_i915_gem_exec_object2 *entry =
            &batch->validation_list[state_index];
         assert(entry->handle == batch->state.bo->gem_handle);
         entry->relocation_count = batch->state_relocs.reloc_count;
         entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
      }

      /* Set batchbuffer relocations */
      struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
      assert(entry->handle == batch->batch.bo->gem_handle);
      entry->relocation_count = batch->batch_relocs.reloc_count;
      entry->relocs_ptr = (uintptr_t) batch->batch_relocs.relocs;

      if (batch->use_batch_first) {
         flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
      } else {
         /* Move the batch to the end of the validation list */
         struct drm_i915_gem_exec_object2 tmp;
         struct brw_bo *tmp_bo;
         const unsigned index = batch->exec_count - 1;

         tmp = *entry;
         *entry = batch->validation_list[index];
         batch->validation_list[index] = tmp;

         tmp_bo = batch->exec_bos[0];
         batch->exec_bos[0] = batch->exec_bos[index];
         batch->exec_bos[index] = tmp_bo;
      }

      ret = execbuffer(brw->screen->fd, batch, brw->hw_ctx,
                       4 * USED_BATCH(*batch),
                       in_fence_fd, out_fence_fd, flags);

      throttle(brw);
   }

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      intel_print_batch(&batch->decoder, batch->batch.map,
                        4 * USED_BATCH(*batch),
                        batch->batch.bo->gtt_offset, false);
   }

   if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
      brw_check_for_reset(brw);

   if (ret != 0) {
      fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
              strerror(-ret));
      abort();
   }

   return ret;
}

/**
 * The in_fence_fd is ignored if -1. Otherwise this function takes ownership
 * of the fd.
 *
 * The out_fence_fd is ignored if NULL. Otherwise, the caller takes ownership
 * of the returned fd.
 */
int
_brw_batch_flush_fence(struct brw_context *brw,
                       int in_fence_fd, int *out_fence_fd,
                       const char *file, int line)
{
   int ret;

   if (USED_BATCH(brw->batch) == 0 && !brw->batch.contains_fence_signal)
      return 0;

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->batch.no_wrap);

   brw_finish_batch(brw);
   brw_upload_finish(&brw->upload);

   finish_growing_bos(&brw->batch.batch);
   finish_growing_bos(&brw->batch.state);

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.batch.bo;
      brw_bo_reference(brw->throttle_batch[0]);
   }

   if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_SUBMIT)) {
      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
      int bytes_for_state = brw->batch.state_used;
      fprintf(stderr, "%19s:%-3d: Batchbuffer flush with %5db (%0.1f%%) (pkt),"
              " %5db (%0.1f%%) (state), %4d BOs (%0.1fMb aperture),"
              " %4d batch relocs, %4d state relocs\n", file, line,
              bytes_for_commands, 100.0f * bytes_for_commands / BATCH_SZ,
              bytes_for_state, 100.0f * bytes_for_state / STATE_SZ,
              brw->batch.exec_count,
              (float) (brw->batch.aperture_space / (1024 * 1024)),
              brw->batch.batch_relocs.reloc_count,
              brw->batch.state_relocs.reloc_count);

      dump_validation_list(&brw->batch);
   }

   ret = submit_batch(brw, in_fence_fd, out_fence_fd);

   if (INTEL_DEBUG(DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      brw_bo_wait_rendering(brw->batch.batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}

void
brw_batch_maybe_noop(struct brw_context *brw)
{
   if (!brw->frontend_noop || USED_BATCH(brw->batch) != 0)
      return;

   BEGIN_BATCH(1);
   OUT_BATCH(MI_BATCH_BUFFER_END);
   ADVANCE_BATCH();
}

bool
brw_batch_references(struct brw_batch *batch, struct brw_bo *bo)
{
   unsigned index = READ_ONCE(bo->index);
   if (index < batch->exec_count && batch->exec_bos[index] == bo)
      return true;

   for (int i = 0; i < batch->exec_count; i++) {
      if (batch->exec_bos[i] == bo)
         return true;
   }
   return false;
}

/* This is the only way buffers get added to the validate list.
 */
static uint64_t
emit_reloc(struct brw_batch *batch,
           struct brw_reloc_list *rlist, uint32_t offset,
           struct brw_bo *target, int32_t target_offset,
           unsigned int reloc_flags)
{
   assert(target != NULL);

   if (target->kflags & EXEC_OBJECT_PINNED) {
      brw_use_pinned_bo(batch, target, reloc_flags & RELOC_WRITE);
      return intel_canonical_address(target->gtt_offset + target_offset);
   }

   unsigned int index = add_exec_bo(batch, target);
   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];

   if (rlist->reloc_count == rlist->reloc_array_size) {
      rlist->reloc_array_size *= 2;
      rlist->relocs = realloc(rlist->relocs,
                              rlist->reloc_array_size *
                              sizeof(struct drm_i915_gem_relocation_entry));
   }

   if (reloc_flags & RELOC_32BIT) {
      /* Restrict this buffer to the low 32 bits of the address space.
       *
       * Altering the validation list flags restricts it for this batch,
       * but we also alter the BO's kflags to restrict it permanently
       * (until the BO is destroyed and put back in the cache). Buffers
       * may stay bound across batches, and we want to keep it constrained.
       */
      target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
      entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

      /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
      reloc_flags &= ~RELOC_32BIT;
   }

   if (reloc_flags)
      entry->flags |= reloc_flags & batch->valid_reloc_flags;

   rlist->relocs[rlist->reloc_count++] =
      (struct drm_i915_gem_relocation_entry) {
         .offset = offset,
         .delta = target_offset,
         .target_handle = batch->use_batch_first ? index : target->gem_handle,
         .presumed_offset = entry->offset,
      };

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel.
    */
   return entry->offset + target_offset;
}

void
brw_use_pinned_bo(struct brw_batch *batch, struct brw_bo *bo,
                  unsigned writable_flag)
{
   assert(bo->kflags & EXEC_OBJECT_PINNED);
   assert((writable_flag & ~EXEC_OBJECT_WRITE) == 0);

   unsigned int index = add_exec_bo(batch, bo);
   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
   assert(entry->offset == bo->gtt_offset);

   if (writable_flag)
      entry->flags |= EXEC_OBJECT_WRITE;
}

uint64_t
brw_batch_reloc(struct brw_batch *batch, uint32_t batch_offset,
                struct brw_bo *target, uint32_t target_offset,
                unsigned int reloc_flags)
{
   assert(batch_offset <= batch->batch.bo->size - sizeof(uint32_t));

   return emit_reloc(batch, &batch->batch_relocs, batch_offset,
                     target, target_offset, reloc_flags);
}

uint64_t
brw_state_reloc(struct brw_batch *batch, uint32_t state_offset,
                struct brw_bo *target, uint32_t target_offset,
                unsigned int reloc_flags)
{
   assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));

   return emit_reloc(batch, &batch->state_relocs, state_offset,
                     target, target_offset, reloc_flags);
}

/**
 * Reserve some space in the statebuffer, or flush.
 *
 * This is used to estimate when we're near the end of the batch,
 * so we can flush early.
 */
void
brw_require_statebuffer_space(struct brw_context *brw, int size)
{
   if (brw->batch.state_used + size >= STATE_SZ)
      brw_batch_flush(brw);
}

/**
 * Allocates a block of space in the statebuffer for indirect state.
 */
void *
brw_state_batch(struct brw_context *brw,
                int size,
                int alignment,
                uint32_t *out_offset)
{
   struct brw_batch *batch = &brw->batch;

   assert(size < batch->state.bo->size);

   uint32_t offset = ALIGN(batch->state_used, alignment);

   if (offset + size >= STATE_SZ && !batch->no_wrap) {
      brw_batch_flush(brw);
      offset = ALIGN(batch->state_used, alignment);
   } else if (offset + size >= batch->state.bo->size) {
      const unsigned new_size =
         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
              MAX_STATE_SIZE);
      grow_buffer(brw, &batch->state, batch->state_used, new_size);
      assert(offset + size < batch->state.bo->size);
   }

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      _mesa_hash_table_u64_insert(batch->state_batch_sizes,
                                  offset, (void *) (uintptr_t) size);
   }

   batch->state_used = offset + size;

   *out_offset = offset;
   return batch->state.map + (offset >> 2);
}

void
brw_batch_data(struct brw_context *brw,
               const void *data, GLuint bytes)
{
   assert((bytes & 3) == 0);
   brw_batch_require_space(brw, bytes);
   memcpy(brw->batch.map_next, data, bytes);
   brw->batch.map_next += bytes >> 2;
}

static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        struct brw_bo *bo,
                        uint32_t offset,
                        int size)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gfx7+. */
   assert(devinfo->ver >= 7);

   if (devinfo->ver >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GFX7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, 0, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GFX7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, 0, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      struct brw_bo *bo,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, 1);
}

void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        struct brw_bo *bo,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, 2);
}

/*
 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem32(struct brw_context *brw,
                         struct brw_bo *bo, uint32_t reg, uint32_t offset)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->ver >= 6);

   if (devinfo->ver >= 8) {
      BEGIN_BATCH(4);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, RELOC_WRITE, offset);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
      ADVANCE_BATCH();
   }
}

/*
 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
 */
void
brw_store_register_mem64(struct brw_context *brw,
                         struct brw_bo *bo, uint32_t reg, uint32_t offset)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->ver >= 6);

   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
    * read a full 64-bit register, we need to do two of them.
    */
   if (devinfo->ver >= 8) {
      BEGIN_BATCH(8);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg);
      OUT_RELOC64(bo, RELOC_WRITE, offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC64(bo, RELOC_WRITE, offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(6);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg);
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
      OUT_BATCH(reg + sizeof(uint32_t));
      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + sizeof(uint32_t));
      ADVANCE_BATCH();
   }
}

/*
 * Write a 32-bit register using immediate data.
 */
void
brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   assert(brw->screen->devinfo.ver >= 6);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write a 64-bit register using immediate data.
 */
void
brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
{
   assert(brw->screen->devinfo.ver >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
   OUT_BATCH(reg);
   OUT_BATCH(imm & 0xffffffff);
   OUT_BATCH(reg + 4);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}

/*
 * Copies a 32-bit register.
 */
void
brw_load_register_reg(struct brw_context *brw, uint32_t dest, uint32_t src)
{
   assert(brw->screen->devinfo.verx10 >= 75);

   BEGIN_BATCH(3);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   ADVANCE_BATCH();
}

/*
 * Copies a 64-bit register.
 */
void
brw_load_register_reg64(struct brw_context *brw, uint32_t dest, uint32_t src)
{
   assert(brw->screen->devinfo.verx10 >= 75);

   BEGIN_BATCH(6);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src);
   OUT_BATCH(dest);
   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
   OUT_BATCH(src + sizeof(uint32_t));
   OUT_BATCH(dest + sizeof(uint32_t));
   ADVANCE_BATCH();
}

/*
 * Write 32-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
                     uint32_t offset, uint32_t imm)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->ver >= 6);

   BEGIN_BATCH(4);
   OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
   if (devinfo->ver >= 8)
      OUT_RELOC64(bo, RELOC_WRITE, offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, RELOC_WRITE, offset);
   }
   OUT_BATCH(imm);
   ADVANCE_BATCH();
}

/*
 * Write 64-bits of immediate data to a GPU memory buffer.
 */
void
brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
                     uint32_t offset, uint64_t imm)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   assert(devinfo->ver >= 6);

   BEGIN_BATCH(5);
   OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
   if (devinfo->ver >= 8)
      OUT_RELOC64(bo, RELOC_WRITE, offset);
   else {
      OUT_BATCH(0); /* MBZ */
      OUT_RELOC(bo, RELOC_WRITE, offset);
   }
   OUT_BATCH(imm & 0xffffffffu);
   OUT_BATCH(imm >> 32);
   ADVANCE_BATCH();
}