1/*
2 * Copyright 2006 VMware, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the
7 * "Software"), to deal in the Software without restriction, including
8 * without limitation the rights to use, copy, modify, merge, publish,
9 * distribute, sublicense, and/or sell copies of the Software, and to
10 * permit persons to whom the Software is furnished to do so, subject to
11 * the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the
14 * next paragraph) shall be included in all copies or substantial portions
15 * of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
18 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
20 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
21 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
22 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
23 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25
26#include "brw_batch.h"
27#include "brw_buffer_objects.h"
28#include "brw_bufmgr.h"
29#include "brw_buffers.h"
30#include "brw_fbo.h"
31#include "brw_context.h"
32#include "brw_defines.h"
33#include "brw_state.h"
34#include "common/intel_decoder.h"
35#include "common/intel_gem.h"
36
37#include "util/hash_table.h"
38
39#include <xf86drm.h>
40#include "drm-uapi/i915_drm.h"
41
42#define FILE_DEBUG_FLAG DEBUG_BUFMGR
43
44/**
45 * Target sizes of the batch and state buffers.  We create the initial
46 * buffers at these sizes, and flush when they're nearly full.  If we
47 * underestimate how close we are to the end, and suddenly need more space
48 * in the middle of a draw, we can grow the buffers, and finish the draw.
49 * At that point, we'll be over our target size, so the next operation
50 * should flush.  Each time we flush the batch, we recreate both buffers
51 * at the original target size, so it doesn't grow without bound.
52 */
53#define BATCH_SZ (20 * 1024)
54#define STATE_SZ (16 * 1024)
55
/* Forward declarations: both are used before their definitions below. */
static void
brw_batch_reset(struct brw_context *brw);
static void
brw_new_batch(struct brw_context *brw);
60
61static unsigned
62num_fences(struct brw_batch *batch)
63{
64   return util_dynarray_num_elements(&batch->exec_fences,
65                                     struct drm_i915_gem_exec_fence);
66}
67
68
69static void
70dump_validation_list(struct brw_batch *batch)
71{
72   fprintf(stderr, "Validation list (length %d):\n", batch->exec_count);
73
74   for (int i = 0; i < batch->exec_count; i++) {
75      uint64_t flags = batch->validation_list[i].flags;
76      assert(batch->validation_list[i].handle ==
77             batch->exec_bos[i]->gem_handle);
78      fprintf(stderr, "[%2d]: %2d %-14s %p %s%-7s @ 0x%"PRIx64"%s (%"PRIu64"B)\n",
79              i,
80              batch->validation_list[i].handle,
81              batch->exec_bos[i]->name,
82              batch->exec_bos[i],
83              (flags & EXEC_OBJECT_SUPPORTS_48B_ADDRESS) ? "(48b" : "(32b",
84              (flags & EXEC_OBJECT_WRITE) ? " write)" : ")",
85              (uint64_t)batch->validation_list[i].offset,
86              (flags & EXEC_OBJECT_PINNED) ? " (pinned)" : "",
87              batch->exec_bos[i]->size);
88   }
89}
90
91static struct intel_batch_decode_bo
92decode_get_bo(void *v_brw, bool ppgtt, uint64_t address)
93{
94   struct brw_context *brw = v_brw;
95   struct brw_batch *batch = &brw->batch;
96
97   for (int i = 0; i < batch->exec_count; i++) {
98      struct brw_bo *bo = batch->exec_bos[i];
99      /* The decoder zeroes out the top 16 bits, so we need to as well */
100      uint64_t bo_address = bo->gtt_offset & (~0ull >> 16);
101
102      if (address >= bo_address && address < bo_address + bo->size) {
103         return (struct intel_batch_decode_bo) {
104            .addr = bo_address,
105            .size = bo->size,
106            .map = brw_bo_map(brw, bo, MAP_READ),
107         };
108      }
109   }
110
111   return (struct intel_batch_decode_bo) { };
112}
113
114static unsigned
115decode_get_state_size(void *v_brw, uint64_t address, uint64_t base_address)
116{
117   struct brw_context *brw = v_brw;
118   struct brw_batch *batch = &brw->batch;
119   unsigned size = (uintptr_t)
120      _mesa_hash_table_u64_search(batch->state_batch_sizes,
121                                  address - base_address);
122   return size;
123}
124
125static void
126init_reloc_list(struct brw_reloc_list *rlist, int count)
127{
128   rlist->reloc_count = 0;
129   rlist->reloc_array_size = count;
130   rlist->relocs = malloc(rlist->reloc_array_size *
131                          sizeof(struct drm_i915_gem_relocation_entry));
132}
133
/**
 * One-time initialization of the context's batch state: allocates the
 * relocation and validation lists, optionally sets up the batch decoder
 * (INTEL_DEBUG=bat), and creates the initial batch/state buffers via
 * brw_batch_reset().
 */
void
brw_batch_init(struct brw_context *brw)
{
   struct brw_screen *screen = brw->screen;
   struct brw_batch *batch = &brw->batch;
   const struct intel_device_info *devinfo = &screen->devinfo;

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      /* The shadow doesn't get relocs written so state decode fails. */
      batch->use_shadow_copy = false;
   } else
      batch->use_shadow_copy = !devinfo->has_llc;

   init_reloc_list(&batch->batch_relocs, 250);
   init_reloc_list(&batch->state_relocs, 250);

   batch->batch.map = NULL;
   batch->state.map = NULL;
   batch->exec_count = 0;
   batch->exec_array_size = 100;
   batch->exec_bos =
      malloc(batch->exec_array_size * sizeof(batch->exec_bos[0]));
   batch->validation_list =
      malloc(batch->exec_array_size * sizeof(batch->validation_list[0]));
   batch->contains_fence_signal = false;

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      /* Maps state offsets to their sizes so the decoder knows how much
       * data each state atom contains (see decode_get_state_size).
       */
      batch->state_batch_sizes =
         _mesa_hash_table_u64_create(NULL);

      const unsigned decode_flags =
         INTEL_BATCH_DECODE_FULL |
         (INTEL_DEBUG(DEBUG_COLOR) ? INTEL_BATCH_DECODE_IN_COLOR : 0) |
         INTEL_BATCH_DECODE_OFFSETS |
         INTEL_BATCH_DECODE_FLOATS;

      intel_batch_decode_ctx_init(&batch->decoder, devinfo, stderr,
                                  decode_flags, NULL, decode_get_bo,
                                  decode_get_state_size, brw);
      batch->decoder.max_vbo_decoded_lines = 100;
   }

   batch->use_batch_first =
      screen->kernel_features & KERNEL_ALLOWS_EXEC_BATCH_FIRST;

   /* PIPE_CONTROL needs a w/a but only on gfx6 */
   batch->valid_reloc_flags = EXEC_OBJECT_WRITE;
   if (devinfo->ver == 6)
      batch->valid_reloc_flags |= EXEC_OBJECT_NEEDS_GTT;

   brw_batch_reset(brw);
}
186
/* Force a single load through a volatile lvalue, preventing the compiler
 * from caching or re-reading the value (used for bo->index, which may be
 * concurrently rewritten when a BO is shared between batches).
 */
#define READ_ONCE(x) (*(volatile __typeof__(x) *)&(x))
188
/**
 * Ensure @bo is on the batch's validation list and return its index.
 *
 * Takes a reference on newly-added BOs (dropped at submit/reset time) and
 * doubles the exec_bos/validation_list arrays when they fill up.
 */
static unsigned
add_exec_bo(struct brw_batch *batch, struct brw_bo *bo)
{
   assert(bo->bufmgr == batch->batch.bo->bufmgr);

   /* bo->index is only a hint — it may be stale if the BO was last used by
    * a different batch — so validate it before trusting it.
    */
   unsigned index = READ_ONCE(bo->index);

   if (index < batch->exec_count && batch->exec_bos[index] == bo)
      return index;

   /* May have been shared between multiple active batches */
   for (index = 0; index < batch->exec_count; index++) {
      if (batch->exec_bos[index] == bo)
         return index;
   }

   brw_bo_reference(bo);

   if (batch->exec_count == batch->exec_array_size) {
      batch->exec_array_size *= 2;
      batch->exec_bos =
         realloc(batch->exec_bos,
                 batch->exec_array_size * sizeof(batch->exec_bos[0]));
      batch->validation_list =
         realloc(batch->validation_list,
                 batch->exec_array_size * sizeof(batch->validation_list[0]));
   }

   /* Seed the kernel's view of this BO: current offset and kernel flags. */
   batch->validation_list[batch->exec_count] =
      (struct drm_i915_gem_exec_object2) {
         .handle = bo->gem_handle,
         .offset = bo->gtt_offset,
         .flags = bo->kflags,
      };

   bo->index = batch->exec_count;
   batch->exec_bos[batch->exec_count] = bo;
   batch->aperture_space += bo->size;

   return batch->exec_count++;
}
230
231static void
232recreate_growing_buffer(struct brw_context *brw,
233                        struct brw_growing_bo *grow,
234                        const char *name, unsigned size,
235                        enum brw_memory_zone memzone)
236{
237   struct brw_screen *screen = brw->screen;
238   struct brw_batch *batch = &brw->batch;
239   struct brw_bufmgr *bufmgr = screen->bufmgr;
240
241   /* We can't grow buffers when using softpin, so just overallocate them. */
242   if (brw_using_softpin(bufmgr))
243      size *= 2;
244
245   grow->bo = brw_bo_alloc(bufmgr, name, size, memzone);
246   grow->bo->kflags |= can_do_exec_capture(screen) ? EXEC_OBJECT_CAPTURE : 0;
247   grow->partial_bo = NULL;
248   grow->partial_bo_map = NULL;
249   grow->partial_bytes = 0;
250   grow->memzone = memzone;
251
252   if (batch->use_shadow_copy)
253      grow->map = realloc(grow->map, grow->bo->size);
254   else
255      grow->map = brw_bo_map(brw, grow->bo, MAP_READ | MAP_WRITE);
256}
257
258static void
259brw_batch_reset(struct brw_context *brw)
260{
261   struct brw_batch *batch = &brw->batch;
262
263   if (batch->last_bo != NULL) {
264      brw_bo_unreference(batch->last_bo);
265      batch->last_bo = NULL;
266   }
267   batch->last_bo = batch->batch.bo;
268
269   recreate_growing_buffer(brw, &batch->batch, "batchbuffer", BATCH_SZ,
270                           BRW_MEMZONE_OTHER);
271   batch->map_next = batch->batch.map;
272
273   recreate_growing_buffer(brw, &batch->state, "statebuffer", STATE_SZ,
274                           BRW_MEMZONE_DYNAMIC);
275
276   /* Avoid making 0 a valid state offset - otherwise the decoder will try
277    * and decode data when we use offset 0 as a null pointer.
278    */
279   batch->state_used = 1;
280
281   add_exec_bo(batch, batch->batch.bo);
282   assert(batch->batch.bo->index == 0);
283
284   batch->needs_sol_reset = false;
285   batch->state_base_address_emitted = false;
286
287   if (batch->state_batch_sizes)
288      _mesa_hash_table_u64_clear(batch->state_batch_sizes);
289
290   /* Always add workaround_bo which contains a driver identifier to be
291    * recorded in error states.
292    */
293   struct brw_bo *identifier_bo = brw->workaround_bo;
294   if (identifier_bo)
295      add_exec_bo(batch, identifier_bo);
296
297   if (batch->contains_fence_signal)
298      batch->contains_fence_signal = false;
299}
300
/**
 * Reset the batch and also clear the per-batch render/depth cache tracking
 * sets (via brw_cache_sets_clear), which are only meaningful within a
 * single batch's lifetime.
 */
static void
brw_batch_reset_and_clear_render_cache(struct brw_context *brw)
{
   brw_batch_reset(brw);
   brw_cache_sets_clear(brw);
}
307
308void
309brw_batch_save_state(struct brw_context *brw)
310{
311   brw->batch.saved.map_next = brw->batch.map_next;
312   brw->batch.saved.batch_reloc_count = brw->batch.batch_relocs.reloc_count;
313   brw->batch.saved.state_reloc_count = brw->batch.state_relocs.reloc_count;
314   brw->batch.saved.exec_count = brw->batch.exec_count;
315}
316
317bool
318brw_batch_saved_state_is_empty(struct brw_context *brw)
319{
320   struct brw_batch *batch = &brw->batch;
321   return (batch->saved.map_next == batch->batch.map);
322}
323
324void
325brw_batch_reset_to_saved(struct brw_context *brw)
326{
327   for (int i = brw->batch.saved.exec_count;
328        i < brw->batch.exec_count; i++) {
329      brw_bo_unreference(brw->batch.exec_bos[i]);
330   }
331   brw->batch.batch_relocs.reloc_count = brw->batch.saved.batch_reloc_count;
332   brw->batch.state_relocs.reloc_count = brw->batch.saved.state_reloc_count;
333   brw->batch.exec_count = brw->batch.saved.exec_count;
334
335   brw->batch.map_next = brw->batch.saved.map_next;
336   if (USED_BATCH(brw->batch) == 0)
337      brw_new_batch(brw);
338}
339
340void
341brw_batch_free(struct brw_batch *batch)
342{
343   if (batch->use_shadow_copy) {
344      free(batch->batch.map);
345      free(batch->state.map);
346   }
347
348   for (int i = 0; i < batch->exec_count; i++) {
349      brw_bo_unreference(batch->exec_bos[i]);
350   }
351   free(batch->batch_relocs.relocs);
352   free(batch->state_relocs.relocs);
353   free(batch->exec_bos);
354   free(batch->validation_list);
355
356   brw_bo_unreference(batch->last_bo);
357   brw_bo_unreference(batch->batch.bo);
358   brw_bo_unreference(batch->state.bo);
359   if (batch->state_batch_sizes) {
360      _mesa_hash_table_u64_destroy(batch->state_batch_sizes);
361      intel_batch_decode_ctx_finish(&batch->decoder);
362   }
363}
364
365/**
366 * Finish copying the old batch/state buffer's contents to the new one
367 * after we tried to "grow" the buffer in an earlier operation.
368 */
369static void
370finish_growing_bos(struct brw_growing_bo *grow)
371{
372   struct brw_bo *old_bo = grow->partial_bo;
373   if (!old_bo)
374      return;
375
376   memcpy(grow->map, grow->partial_bo_map, grow->partial_bytes);
377
378   grow->partial_bo = NULL;
379   grow->partial_bo_map = NULL;
380   grow->partial_bytes = 0;
381
382   brw_bo_unreference(old_bo);
383}
384
385static void
386replace_bo_in_reloc_list(struct brw_reloc_list *rlist,
387                         uint32_t old_handle, uint32_t new_handle)
388{
389   for (int i = 0; i < rlist->reloc_count; i++) {
390      if (rlist->relocs[i].target_handle == old_handle)
391         rlist->relocs[i].target_handle = new_handle;
392   }
393}
394
395/**
396 * Grow either the batch or state buffer to a new larger size.
397 *
398 * We can't actually grow buffers, so we allocate a new one, copy over
399 * the existing contents, and update our lists to refer to the new one.
400 *
401 * Note that this is only temporary - each new batch recreates the buffers
402 * at their original target size (BATCH_SZ or STATE_SZ).
403 */
static void
grow_buffer(struct brw_context *brw,
            struct brw_growing_bo *grow,
            unsigned existing_bytes,
            unsigned new_size)
{
   struct brw_batch *batch = &brw->batch;
   struct brw_bufmgr *bufmgr = brw->bufmgr;
   struct brw_bo *bo = grow->bo;

   /* We can't grow buffers that are softpinned, as the growing mechanism
    * involves putting a larger buffer at the same gtt_offset...and we've
    * only allocated the smaller amount of VMA.  Without relocations, this
    * simply won't work.  This should never happen, however.
    */
   assert(!(bo->kflags & EXEC_OBJECT_PINNED));

   perf_debug("Growing %s - ran out of space\n", bo->name);

   if (grow->partial_bo) {
      /* We've already grown once, and now we need to do it again.
       * Finish our last grow operation so we can start a new one.
       * This should basically never happen.
       */
      perf_debug("Had to grow multiple times");
      finish_growing_bos(grow);
   }

   struct brw_bo *new_bo =
      brw_bo_alloc(bufmgr, bo->name, new_size, grow->memzone);

   /* Copy existing data to the new larger buffer */
   grow->partial_bo_map = grow->map;

   if (batch->use_shadow_copy) {
      /* We can't safely use realloc, as it may move the existing buffer,
       * breaking existing pointers the caller may still be using.  Just
       * malloc a new copy and memcpy it like the normal BO path.
       *
       * Use new_bo->size rather than new_size because the bufmgr may have
       * rounded up the size, and we want the shadow size to match.
       */
      grow->map = malloc(new_bo->size);
   } else {
      grow->map = brw_bo_map(brw, new_bo, MAP_READ | MAP_WRITE);
   }

   /* Try to put the new BO at the same GTT offset as the old BO (which
    * we're throwing away, so it doesn't need to be there).
    *
    * This guarantees that our relocations continue to work: values we've
    * already written into the buffer, values we're going to write into the
    * buffer, and the validation/relocation lists all will match.
    *
    * Also preserve kflags for EXEC_OBJECT_CAPTURE.
    */
   new_bo->gtt_offset = bo->gtt_offset;
   new_bo->index = bo->index;
   new_bo->kflags = bo->kflags;

   /* Batch/state buffers are per-context, and if we've run out of space,
    * we must have actually used them before, so...they will be in the list.
    */
   assert(bo->index < batch->exec_count);
   assert(batch->exec_bos[bo->index] == bo);

   /* Update the validation list to use the new BO. */
   batch->validation_list[bo->index].handle = new_bo->gem_handle;

   if (!batch->use_batch_first) {
      /* We're not using I915_EXEC_HANDLE_LUT, which means we need to go
       * update the relocation list entries to point at the new BO as well.
       * (With newer kernels, the "handle" is an offset into the validation
       * list, which remains unchanged, so we can skip this.)
       */
      replace_bo_in_reloc_list(&batch->batch_relocs,
                               bo->gem_handle, new_bo->gem_handle);
      replace_bo_in_reloc_list(&batch->state_relocs,
                               bo->gem_handle, new_bo->gem_handle);
   }

   /* Exchange the two BOs...without breaking pointers to the old BO.
    *
    * Consider this scenario:
    *
    * 1. Somebody calls brw_state_batch() to get a region of memory, and
    *    and then creates a brw_address pointing to brw->batch.state.bo.
    * 2. They then call brw_state_batch() a second time, which happens to
    *    grow and replace the state buffer.  They then try to emit a
    *    relocation to their first section of memory.
    *
    * If we replace the brw->batch.state.bo pointer at step 2, we would
    * break the address created in step 1.  They'd have a pointer to the
    * old destroyed BO.  Emitting a relocation would add this dead BO to
    * the validation list...causing /both/ statebuffers to be in the list,
    * and all kinds of disasters.
    *
    * This is not a contrived case - BLORP vertex data upload hits this.
    *
    * There are worse scenarios too.  Fences for GL sync objects reference
    * brw->batch.batch.bo.  If we replaced the batch pointer when growing,
    * we'd need to chase down every fence and update it to point to the
    * new BO.  Otherwise, it would refer to a "batch" that never actually
    * gets submitted, and would fail to trigger.
    *
    * To work around both of these issues, we transmutate the buffers in
    * place, making the existing struct brw_bo represent the new buffer,
    * and "new_bo" represent the old BO.  This is highly unusual, but it
    * seems like a necessary evil.
    *
    * We also defer the memcpy of the existing batch's contents.  Callers
    * may make multiple brw_state_batch calls, and retain pointers to the
    * old BO's map.  We'll perform the memcpy in finish_growing_bo() when
    * we finally submit the batch, at which point we've finished uploading
    * state, and nobody should have any old references anymore.
    *
    * To do that, we keep a reference to the old BO in grow->partial_bo,
    * and store the number of bytes to copy in grow->partial_bytes.  We
    * can monkey with the refcounts directly without atomics because these
    * are per-context BOs and they can only be touched by this thread.
    */
   assert(new_bo->refcount == 1);
   new_bo->refcount = bo->refcount;
   bo->refcount = 1;

   /* Swapping the structs below would corrupt non-empty export lists,
    * since the list nodes' prev/next pointers would still point into the
    * other struct's embedded list head.
    */
   assert(list_is_empty(&bo->exports));
   assert(list_is_empty(&new_bo->exports));

   struct brw_bo tmp;
   memcpy(&tmp, bo, sizeof(struct brw_bo));
   memcpy(bo, new_bo, sizeof(struct brw_bo));
   memcpy(new_bo, &tmp, sizeof(struct brw_bo));

   list_inithead(&bo->exports);
   list_inithead(&new_bo->exports);

   grow->partial_bo = new_bo; /* the one reference of the OLD bo */
   grow->partial_bytes = existing_bytes;
}
543
/**
 * Ensure the batch has room for @sz more bytes of commands.
 *
 * Past the target size (BATCH_SZ) we prefer to flush (unless wrapping is
 * forbidden via batch->no_wrap), so buffers return to their target size;
 * only when flushing isn't possible do we grow the buffer in place.
 */
void
brw_batch_require_space(struct brw_context *brw, GLuint sz)
{
   struct brw_batch *batch = &brw->batch;

   const unsigned batch_used = USED_BATCH(*batch) * 4;
   if (batch_used + sz >= BATCH_SZ && !batch->no_wrap) {
      brw_batch_flush(brw);
   } else if (batch_used + sz >= batch->batch.bo->size) {
      /* Grow by 1.5x, clamped to the maximum batch size. */
      const unsigned new_size =
         MIN2(batch->batch.bo->size + batch->batch.bo->size / 2,
              MAX_BATCH_SIZE);
      grow_buffer(brw, &batch->batch, batch_used, new_size);
      batch->map_next = (void *) batch->batch.map + batch_used;
      assert(batch_used + sz < batch->batch.bo->size);
   }
}
561
562/**
563 * Called when starting a new batch buffer.
564 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Unreference any BOs held by the previous batch, and reset counts. */
   for (int i = 0; i < brw->batch.exec_count; i++) {
      brw_bo_unreference(brw->batch.exec_bos[i]);
      brw->batch.exec_bos[i] = NULL;
   }
   brw->batch.batch_relocs.reloc_count = 0;
   brw->batch.state_relocs.reloc_count = 0;
   brw->batch.exec_count = 0;
   brw->batch.aperture_space = 0;

   /* The state BO's reference is dropped here; the batch BO's is kept
    * alive via batch->last_bo inside brw_batch_reset().
    */
   brw_bo_unreference(brw->batch.state.bo);

   /* Create a new batchbuffer and reset the associated state: */
   brw_batch_reset_and_clear_render_cache(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == 0) {
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;
      brw_upload_invariant_state(brw);
   }

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   /* Force re-emission of the index buffer on the next draw. */
   brw->ib.index_size = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG(DEBUG_SHADER_TIME))
      brw_collect_and_report_shader_time(brw);

   brw_batch_maybe_noop(brw);
}
608
609/**
610 * Called from brw_batch_flush before emitting MI_BATCHBUFFER_END and
611 * sending it off.
612 *
613 * This function can emit state (say, to preserve registers that aren't saved
614 * between batches).
615 */
static void
brw_finish_batch(struct brw_context *brw)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;

   /* Everything emitted below must fit in this batch; a flush here would
    * lose the workaround state we're about to emit.
    */
   brw->batch.no_wrap = true;

   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   /* Work around L3 state leaks into contexts set MI_RESTORE_INHIBIT which
    * assume that the L3 cache is configured according to the hardware
    * defaults.  On Kernel 4.16+, we no longer need to do this.
    */
   if (devinfo->ver >= 7 &&
       !(brw->screen->kernel_features & KERNEL_ALLOWS_CONTEXT_ISOLATION))
      gfx7_restore_default_l3_config(brw);

   if (devinfo->is_haswell) {
      /* From the Haswell PRM, Volume 2b, Command Reference: Instructions,
       * 3DSTATE_CC_STATE_POINTERS > "Note":
       *
       * "SW must program 3DSTATE_CC_STATE_POINTERS command at the end of every
       *  3D batch buffer followed by a PIPE_CONTROL with RC flush and CS stall."
       *
       * From the example in the docs, it seems to expect a regular pipe control
       * flush here as well. We may have done it already, but meh.
       *
       * See also WaAvoidRCZCounterRollover.
       */
      brw_emit_mi_flush(brw);
      BEGIN_BATCH(2);
      OUT_BATCH(_3DSTATE_CC_STATE_POINTERS << 16 | (2 - 2));
      OUT_BATCH(brw->cc.state_offset | 1);
      ADVANCE_BATCH();
      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_RENDER_TARGET_FLUSH |
                                       PIPE_CONTROL_CS_STALL);
   }

   /* Do not restore push constant packets during context restore. */
   if (devinfo->ver >= 7)
      gfx7_emit_isp_disable(brw);

   /* Emit MI_BATCH_BUFFER_END to finish our batch.  Note that execbuf2
    * requires our batch size to be QWord aligned, so we pad it out if
    * necessary by emitting an extra MI_NOOP after the end.
    */
   brw_batch_require_space(brw, 8);
   *brw->batch.map_next++ = MI_BATCH_BUFFER_END;
   if (USED_BATCH(brw->batch) & 1) {
      *brw->batch.map_next++ = MI_NOOP;
   }

   brw->batch.no_wrap = false;
}
673
/**
 * Apply frame-pacing throttles after a batch submission.
 *
 * throttle_batch[0] is the first batch emitted since the last swap;
 * throttle_batch[1] is the corresponding batch from the previous frame,
 * which we wait on to bound how far the GPU can run ahead.
 */
static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point for
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling) {
            brw_bo_wait_rendering(brw->throttle_batch[1]);
         }
         brw_bo_unreference(brw->throttle_batch[1]);
      }
      /* Shift: this frame's first batch becomes the one the next frame
       * waits on.
       */
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      drmCommandNone(brw->screen->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}
711
/**
 * Thin wrapper for the DRM_IOCTL_I915_GEM_EXECBUFFER2(_WR) ioctl.
 *
 * Submits @used bytes of the batch with the current validation list,
 * optionally waiting on @in_fence (if != -1) and/or creating an output
 * fence in *@out_fence (if non-NULL).  After submission, each BO's
 * gtt_offset is updated to where the kernel actually placed it and its
 * cached validation-list index is invalidated.
 *
 * Returns 0 on success or -errno on failure.
 */
static int
execbuffer(int fd,
           struct brw_batch *batch,
           uint32_t ctx_id,
           int used,
           int in_fence,
           int *out_fence,
           int flags)
{
   struct drm_i915_gem_execbuffer2 execbuf = {
      .buffers_ptr = (uintptr_t) batch->validation_list,
      .buffer_count = batch->exec_count,
      .batch_start_offset = 0,
      .batch_len = used,
      .flags = flags,
      .rsvd1 = ctx_id, /* rsvd1 is actually the context ID */
   };

   unsigned long cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2;

   if (in_fence != -1) {
      execbuf.rsvd2 = in_fence;
      execbuf.flags |= I915_EXEC_FENCE_IN;
   }

   if (out_fence != NULL) {
      /* The _WR variant is required for the kernel to write rsvd2 back. */
      cmd = DRM_IOCTL_I915_GEM_EXECBUFFER2_WR;
      *out_fence = -1;
      execbuf.flags |= I915_EXEC_FENCE_OUT;
   }

   if (num_fences(batch)) {
      /* With I915_EXEC_FENCE_ARRAY, the cliprects fields are reused to
       * pass the fence array.
       */
      execbuf.flags |= I915_EXEC_FENCE_ARRAY;
      execbuf.num_cliprects = num_fences(batch);
      execbuf.cliprects_ptr =
         (uintptr_t)util_dynarray_begin(&batch->exec_fences);
   }


   int ret = drmIoctl(fd, cmd, &execbuf);
   if (ret != 0)
      ret = -errno;

   for (int i = 0; i < batch->exec_count; i++) {
      struct brw_bo *bo = batch->exec_bos[i];

      bo->idle = false;
      bo->index = -1;

      /* Update brw_bo::gtt_offset */
      if (batch->validation_list[i].offset != bo->gtt_offset) {
         DBG("BO %d migrated: 0x%" PRIx64 " -> 0x%" PRIx64 "\n",
             bo->gem_handle, bo->gtt_offset,
             (uint64_t)batch->validation_list[i].offset);
         assert(!(bo->kflags & EXEC_OBJECT_PINNED));
         bo->gtt_offset = batch->validation_list[i].offset;
      }
   }

   /* The output fence fd is returned in the upper 32 bits of rsvd2. */
   if (ret == 0 && out_fence != NULL)
      *out_fence = execbuf.rsvd2 >> 32;

   return ret;
}
776
/**
 * Finalize the CPU-side batch contents and submit them to the kernel.
 *
 * Copies shadow buffers into their BOs (non-LLC path), attaches the
 * relocation lists to the batch/state validation entries, arranges the
 * batch BO's position in the validation list per kernel requirements,
 * and calls execbuffer().  Aborts the process on submission failure.
 */
static int
submit_batch(struct brw_context *brw, int in_fence_fd, int *out_fence_fd)
{
   struct brw_batch *batch = &brw->batch;
   int ret = 0;

   if (batch->use_shadow_copy) {
      /* Flush the CPU-side shadow copies into the real BOs. */
      void *bo_map = brw_bo_map(brw, batch->batch.bo, MAP_WRITE);
      memcpy(bo_map, batch->batch.map, 4 * USED_BATCH(*batch));

      bo_map = brw_bo_map(brw, batch->state.bo, MAP_WRITE);
      memcpy(bo_map, batch->state.map, batch->state_used);
   }

   brw_bo_unmap(batch->batch.bo);
   brw_bo_unmap(batch->state.bo);

   if (!brw->screen->devinfo.no_hw) {
      /* The requirement for using I915_EXEC_NO_RELOC are:
       *
       *   The addresses written in the objects must match the corresponding
       *   reloc.gtt_offset which in turn must match the corresponding
       *   execobject.offset.
       *
       *   Any render targets written to in the batch must be flagged with
       *   EXEC_OBJECT_WRITE.
       *
       *   To avoid stalling, execobject.offset should match the current
       *   address of that object within the active context.
       */
      int flags = I915_EXEC_NO_RELOC | I915_EXEC_RENDER;

      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      /* Set statebuffer relocations */
      const unsigned state_index = batch->state.bo->index;
      if (state_index < batch->exec_count &&
          batch->exec_bos[state_index] == batch->state.bo) {
         struct drm_i915_gem_exec_object2 *entry =
            &batch->validation_list[state_index];
         assert(entry->handle == batch->state.bo->gem_handle);
         entry->relocation_count = batch->state_relocs.reloc_count;
         entry->relocs_ptr = (uintptr_t) batch->state_relocs.relocs;
      }

      /* Set batchbuffer relocations */
      struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[0];
      assert(entry->handle == batch->batch.bo->gem_handle);
      entry->relocation_count = batch->batch_relocs.reloc_count;
      entry->relocs_ptr = (uintptr_t) batch->batch_relocs.relocs;

      if (batch->use_batch_first) {
         flags |= I915_EXEC_BATCH_FIRST | I915_EXEC_HANDLE_LUT;
      } else {
         /* Move the batch to the end of the validation list */
         struct drm_i915_gem_exec_object2 tmp;
         struct brw_bo *tmp_bo;
         const unsigned index = batch->exec_count - 1;

         tmp = *entry;
         *entry = batch->validation_list[index];
         batch->validation_list[index] = tmp;

         tmp_bo = batch->exec_bos[0];
         batch->exec_bos[0] = batch->exec_bos[index];
         batch->exec_bos[index] = tmp_bo;
      }

      ret = execbuffer(brw->screen->fd, batch, brw->hw_ctx,
                       4 * USED_BATCH(*batch),
                       in_fence_fd, out_fence_fd, flags);

      throttle(brw);
   }

   if (INTEL_DEBUG(DEBUG_BATCH)) {
      intel_print_batch(&batch->decoder, batch->batch.map,
                        4 * USED_BATCH(*batch),
                        batch->batch.bo->gtt_offset, false);
   }

   if (brw->ctx.Const.ResetStrategy == GL_LOSE_CONTEXT_ON_RESET_ARB)
      brw_check_for_reset(brw);

   /* Submission failure is unrecoverable for this driver - bail loudly. */
   if (ret != 0) {
      fprintf(stderr, "i965: Failed to submit batchbuffer: %s\n",
              strerror(-ret));
      abort();
   }

   return ret;
}
870
871/**
872 * The in_fence_fd is ignored if -1.  Otherwise this function takes ownership
873 * of the fd.
874 *
875 * The out_fence_fd is ignored if NULL. Otherwise, the caller takes ownership
876 * of the returned fd.
877 */
878int
879_brw_batch_flush_fence(struct brw_context *brw,
880                               int in_fence_fd, int *out_fence_fd,
881                               const char *file, int line)
882{
883   int ret;
884
885   if (USED_BATCH(brw->batch) == 0 && !brw->batch.contains_fence_signal)
886      return 0;
887
888   /* Check that we didn't just wrap our batchbuffer at a bad time. */
889   assert(!brw->batch.no_wrap);
890
891   brw_finish_batch(brw);
892   brw_upload_finish(&brw->upload);
893
894   finish_growing_bos(&brw->batch.batch);
895   finish_growing_bos(&brw->batch.state);
896
897   if (brw->throttle_batch[0] == NULL) {
898      brw->throttle_batch[0] = brw->batch.batch.bo;
899      brw_bo_reference(brw->throttle_batch[0]);
900   }
901
902   if (INTEL_DEBUG(DEBUG_BATCH | DEBUG_SUBMIT)) {
903      int bytes_for_commands = 4 * USED_BATCH(brw->batch);
904      int bytes_for_state = brw->batch.state_used;
905      fprintf(stderr, "%19s:%-3d: Batchbuffer flush with %5db (%0.1f%%) (pkt),"
906              " %5db (%0.1f%%) (state), %4d BOs (%0.1fMb aperture),"
907              " %4d batch relocs, %4d state relocs\n", file, line,
908              bytes_for_commands, 100.0f * bytes_for_commands / BATCH_SZ,
909              bytes_for_state, 100.0f * bytes_for_state / STATE_SZ,
910              brw->batch.exec_count,
911              (float) (brw->batch.aperture_space / (1024 * 1024)),
912              brw->batch.batch_relocs.reloc_count,
913              brw->batch.state_relocs.reloc_count);
914
915      dump_validation_list(&brw->batch);
916   }
917
918   ret = submit_batch(brw, in_fence_fd, out_fence_fd);
919
920   if (INTEL_DEBUG(DEBUG_SYNC)) {
921      fprintf(stderr, "waiting for idle\n");
922      brw_bo_wait_rendering(brw->batch.batch.bo);
923   }
924
925   /* Start a new batch buffer. */
926   brw_new_batch(brw);
927
928   return ret;
929}
930
931void
932brw_batch_maybe_noop(struct brw_context *brw)
933{
934   if (!brw->frontend_noop || USED_BATCH(brw->batch) != 0)
935      return;
936
937   BEGIN_BATCH(1);
938   OUT_BATCH(MI_BATCH_BUFFER_END);
939   ADVANCE_BATCH();
940}
941
942bool
943brw_batch_references(struct brw_batch *batch, struct brw_bo *bo)
944{
945   unsigned index = READ_ONCE(bo->index);
946   if (index < batch->exec_count && batch->exec_bos[index] == bo)
947      return true;
948
949   for (int i = 0; i < batch->exec_count; i++) {
950      if (batch->exec_bos[i] == bo)
951         return true;
952   }
953   return false;
954}
955
/*  This is the only way buffers get added to the validate list.
 *
 *  Records a relocation in \p rlist for the dword at \p offset, pointing at
 *  \p target + \p target_offset, and ensures \p target is on the batch's
 *  validation list.  Returns the address to write at \p offset — the
 *  target's last-known ("presumed") offset, so the kernel can skip the
 *  relocation if the buffer hasn't moved.
 */
static uint64_t
emit_reloc(struct brw_batch *batch,
           struct brw_reloc_list *rlist, uint32_t offset,
           struct brw_bo *target, int32_t target_offset,
           unsigned int reloc_flags)
{
   assert(target != NULL);

   /* Softpinned BOs live at a fixed GTT address: no relocation entry is
    * needed, just validation-list membership and the canonical address.
    */
   if (target->kflags & EXEC_OBJECT_PINNED) {
      brw_use_pinned_bo(batch, target, reloc_flags & RELOC_WRITE);
      return intel_canonical_address(target->gtt_offset + target_offset);
   }

   unsigned int index = add_exec_bo(batch, target);
   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];

   /* Grow the relocation array by doubling when full. */
   if (rlist->reloc_count == rlist->reloc_array_size) {
      rlist->reloc_array_size *= 2;
      rlist->relocs = realloc(rlist->relocs,
                              rlist->reloc_array_size *
                              sizeof(struct drm_i915_gem_relocation_entry));
   }

   if (reloc_flags & RELOC_32BIT) {
      /* Restrict this buffer to the low 32 bits of the address space.
       *
       * Altering the validation list flags restricts it for this batch,
       * but we also alter the BO's kflags to restrict it permanently
       * (until the BO is destroyed and put back in the cache).  Buffers
       * may stay bound across batches, and we want keep it constrained.
       */
      target->kflags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;
      entry->flags &= ~EXEC_OBJECT_SUPPORTS_48B_ADDRESS;

      /* RELOC_32BIT is not an EXEC_OBJECT_* flag, so get rid of it. */
      reloc_flags &= ~RELOC_32BIT;
   }

   /* Remaining flags map directly onto EXEC_OBJECT_* bits, masked by what
    * this kernel supports.
    */
   if (reloc_flags)
      entry->flags |= reloc_flags & batch->valid_reloc_flags;

   /* With I915_EXEC_HANDLE_LUT (use_batch_first), target_handle is an index
    * into the validation list rather than a GEM handle.
    */
   rlist->relocs[rlist->reloc_count++] =
      (struct drm_i915_gem_relocation_entry) {
         .offset = offset,
         .delta = target_offset,
         .target_handle = batch->use_batch_first ? index : target->gem_handle,
         .presumed_offset = entry->offset,
      };

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel
    */
   return entry->offset + target_offset;
}
1013
1014void
1015brw_use_pinned_bo(struct brw_batch *batch, struct brw_bo *bo,
1016                  unsigned writable_flag)
1017{
1018   assert(bo->kflags & EXEC_OBJECT_PINNED);
1019   assert((writable_flag & ~EXEC_OBJECT_WRITE) == 0);
1020
1021   unsigned int index = add_exec_bo(batch, bo);
1022   struct drm_i915_gem_exec_object2 *entry = &batch->validation_list[index];
1023   assert(entry->offset == bo->gtt_offset);
1024
1025   if (writable_flag)
1026      entry->flags |= EXEC_OBJECT_WRITE;
1027}
1028
1029uint64_t
1030brw_batch_reloc(struct brw_batch *batch, uint32_t batch_offset,
1031                struct brw_bo *target, uint32_t target_offset,
1032                unsigned int reloc_flags)
1033{
1034   assert(batch_offset <= batch->batch.bo->size - sizeof(uint32_t));
1035
1036   return emit_reloc(batch, &batch->batch_relocs, batch_offset,
1037                     target, target_offset, reloc_flags);
1038}
1039
1040uint64_t
1041brw_state_reloc(struct brw_batch *batch, uint32_t state_offset,
1042                struct brw_bo *target, uint32_t target_offset,
1043                unsigned int reloc_flags)
1044{
1045   assert(state_offset <= batch->state.bo->size - sizeof(uint32_t));
1046
1047   return emit_reloc(batch, &batch->state_relocs, state_offset,
1048                     target, target_offset, reloc_flags);
1049}
1050
1051/**
1052 * Reserve some space in the statebuffer, or flush.
1053 *
1054 * This is used to estimate when we're near the end of the batch,
1055 * so we can flush early.
1056 */
1057void
1058brw_require_statebuffer_space(struct brw_context *brw, int size)
1059{
1060   if (brw->batch.state_used + size >= STATE_SZ)
1061      brw_batch_flush(brw);
1062}
1063
1064/**
1065 * Allocates a block of space in the batchbuffer for indirect state.
1066 */
1067void *
1068brw_state_batch(struct brw_context *brw,
1069                int size,
1070                int alignment,
1071                uint32_t *out_offset)
1072{
1073   struct brw_batch *batch = &brw->batch;
1074
1075   assert(size < batch->state.bo->size);
1076
1077   uint32_t offset = ALIGN(batch->state_used, alignment);
1078
1079   if (offset + size >= STATE_SZ && !batch->no_wrap) {
1080      brw_batch_flush(brw);
1081      offset = ALIGN(batch->state_used, alignment);
1082   } else if (offset + size >= batch->state.bo->size) {
1083      const unsigned new_size =
1084         MIN2(batch->state.bo->size + batch->state.bo->size / 2,
1085              MAX_STATE_SIZE);
1086      grow_buffer(brw, &batch->state, batch->state_used, new_size);
1087      assert(offset + size < batch->state.bo->size);
1088   }
1089
1090   if (INTEL_DEBUG(DEBUG_BATCH)) {
1091      _mesa_hash_table_u64_insert(batch->state_batch_sizes,
1092                                  offset, (void *) (uintptr_t) size);
1093   }
1094
1095   batch->state_used = offset + size;
1096
1097   *out_offset = offset;
1098   return batch->state.map + (offset >> 2);
1099}
1100
1101void
1102brw_batch_data(struct brw_context *brw,
1103                       const void *data, GLuint bytes)
1104{
1105   assert((bytes & 3) == 0);
1106   brw_batch_require_space(brw, bytes);
1107   memcpy(brw->batch.map_next, data, bytes);
1108   brw->batch.map_next += bytes >> 2;
1109}
1110
1111static void
1112load_sized_register_mem(struct brw_context *brw,
1113                        uint32_t reg,
1114                        struct brw_bo *bo,
1115                        uint32_t offset,
1116                        int size)
1117{
1118   const struct intel_device_info *devinfo = &brw->screen->devinfo;
1119   int i;
1120
1121   /* MI_LOAD_REGISTER_MEM only exists on Gfx7+. */
1122   assert(devinfo->ver >= 7);
1123
1124   if (devinfo->ver >= 8) {
1125      BEGIN_BATCH(4 * size);
1126      for (i = 0; i < size; i++) {
1127         OUT_BATCH(GFX7_MI_LOAD_REGISTER_MEM | (4 - 2));
1128         OUT_BATCH(reg + i * 4);
1129         OUT_RELOC64(bo, 0, offset + i * 4);
1130      }
1131      ADVANCE_BATCH();
1132   } else {
1133      BEGIN_BATCH(3 * size);
1134      for (i = 0; i < size; i++) {
1135         OUT_BATCH(GFX7_MI_LOAD_REGISTER_MEM | (3 - 2));
1136         OUT_BATCH(reg + i * 4);
1137         OUT_RELOC(bo, 0, offset + i * 4);
1138      }
1139      ADVANCE_BATCH();
1140   }
1141}
1142
/* Load a single 32-bit register from \p bo + \p offset. */
void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      struct brw_bo *bo,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, /*size=*/1);
}
1151
/* Load a 64-bit register (as two 32-bit halves) from \p bo + \p offset. */
void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        struct brw_bo *bo,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, offset, /*size=*/2);
}
1160
1161/*
1162 * Write an arbitrary 32-bit register to a buffer via MI_STORE_REGISTER_MEM.
1163 */
1164void
1165brw_store_register_mem32(struct brw_context *brw,
1166                         struct brw_bo *bo, uint32_t reg, uint32_t offset)
1167{
1168   const struct intel_device_info *devinfo = &brw->screen->devinfo;
1169
1170   assert(devinfo->ver >= 6);
1171
1172   if (devinfo->ver >= 8) {
1173      BEGIN_BATCH(4);
1174      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1175      OUT_BATCH(reg);
1176      OUT_RELOC64(bo, RELOC_WRITE, offset);
1177      ADVANCE_BATCH();
1178   } else {
1179      BEGIN_BATCH(3);
1180      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1181      OUT_BATCH(reg);
1182      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
1183      ADVANCE_BATCH();
1184   }
1185}
1186
1187/*
1188 * Write an arbitrary 64-bit register to a buffer via MI_STORE_REGISTER_MEM.
1189 */
1190void
1191brw_store_register_mem64(struct brw_context *brw,
1192                         struct brw_bo *bo, uint32_t reg, uint32_t offset)
1193{
1194   const struct intel_device_info *devinfo = &brw->screen->devinfo;
1195
1196   assert(devinfo->ver >= 6);
1197
1198   /* MI_STORE_REGISTER_MEM only stores a single 32-bit value, so to
1199    * read a full 64-bit register, we need to do two of them.
1200    */
1201   if (devinfo->ver >= 8) {
1202      BEGIN_BATCH(8);
1203      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1204      OUT_BATCH(reg);
1205      OUT_RELOC64(bo, RELOC_WRITE, offset);
1206      OUT_BATCH(MI_STORE_REGISTER_MEM | (4 - 2));
1207      OUT_BATCH(reg + sizeof(uint32_t));
1208      OUT_RELOC64(bo, RELOC_WRITE, offset + sizeof(uint32_t));
1209      ADVANCE_BATCH();
1210   } else {
1211      BEGIN_BATCH(6);
1212      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1213      OUT_BATCH(reg);
1214      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset);
1215      OUT_BATCH(MI_STORE_REGISTER_MEM | (3 - 2));
1216      OUT_BATCH(reg + sizeof(uint32_t));
1217      OUT_RELOC(bo, RELOC_WRITE | RELOC_NEEDS_GGTT, offset + sizeof(uint32_t));
1218      ADVANCE_BATCH();
1219   }
1220}
1221
1222/*
1223 * Write a 32-bit register using immediate data.
1224 */
1225void
1226brw_load_register_imm32(struct brw_context *brw, uint32_t reg, uint32_t imm)
1227{
1228   assert(brw->screen->devinfo.ver >= 6);
1229
1230   BEGIN_BATCH(3);
1231   OUT_BATCH(MI_LOAD_REGISTER_IMM | (3 - 2));
1232   OUT_BATCH(reg);
1233   OUT_BATCH(imm);
1234   ADVANCE_BATCH();
1235}
1236
1237/*
1238 * Write a 64-bit register using immediate data.
1239 */
1240void
1241brw_load_register_imm64(struct brw_context *brw, uint32_t reg, uint64_t imm)
1242{
1243   assert(brw->screen->devinfo.ver >= 6);
1244
1245   BEGIN_BATCH(5);
1246   OUT_BATCH(MI_LOAD_REGISTER_IMM | (5 - 2));
1247   OUT_BATCH(reg);
1248   OUT_BATCH(imm & 0xffffffff);
1249   OUT_BATCH(reg + 4);
1250   OUT_BATCH(imm >> 32);
1251   ADVANCE_BATCH();
1252}
1253
1254/*
1255 * Copies a 32-bit register.
1256 */
1257void
1258brw_load_register_reg(struct brw_context *brw, uint32_t dest, uint32_t src)
1259{
1260   assert(brw->screen->devinfo.verx10 >= 75);
1261
1262   BEGIN_BATCH(3);
1263   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1264   OUT_BATCH(src);
1265   OUT_BATCH(dest);
1266   ADVANCE_BATCH();
1267}
1268
1269/*
1270 * Copies a 64-bit register.
1271 */
1272void
1273brw_load_register_reg64(struct brw_context *brw, uint32_t dest, uint32_t src)
1274{
1275   assert(brw->screen->devinfo.verx10 >= 75);
1276
1277   BEGIN_BATCH(6);
1278   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1279   OUT_BATCH(src);
1280   OUT_BATCH(dest);
1281   OUT_BATCH(MI_LOAD_REGISTER_REG | (3 - 2));
1282   OUT_BATCH(src + sizeof(uint32_t));
1283   OUT_BATCH(dest + sizeof(uint32_t));
1284   ADVANCE_BATCH();
1285}
1286
1287/*
1288 * Write 32-bits of immediate data to a GPU memory buffer.
1289 */
1290void
1291brw_store_data_imm32(struct brw_context *brw, struct brw_bo *bo,
1292                     uint32_t offset, uint32_t imm)
1293{
1294   const struct intel_device_info *devinfo = &brw->screen->devinfo;
1295
1296   assert(devinfo->ver >= 6);
1297
1298   BEGIN_BATCH(4);
1299   OUT_BATCH(MI_STORE_DATA_IMM | (4 - 2));
1300   if (devinfo->ver >= 8)
1301      OUT_RELOC64(bo, RELOC_WRITE, offset);
1302   else {
1303      OUT_BATCH(0); /* MBZ */
1304      OUT_RELOC(bo, RELOC_WRITE, offset);
1305   }
1306   OUT_BATCH(imm);
1307   ADVANCE_BATCH();
1308}
1309
1310/*
1311 * Write 64-bits of immediate data to a GPU memory buffer.
1312 */
1313void
1314brw_store_data_imm64(struct brw_context *brw, struct brw_bo *bo,
1315                     uint32_t offset, uint64_t imm)
1316{
1317   const struct intel_device_info *devinfo = &brw->screen->devinfo;
1318
1319   assert(devinfo->ver >= 6);
1320
1321   BEGIN_BATCH(5);
1322   OUT_BATCH(MI_STORE_DATA_IMM | (5 - 2));
1323   if (devinfo->ver >= 8)
1324      OUT_RELOC64(bo, RELOC_WRITE, offset);
1325   else {
1326      OUT_BATCH(0); /* MBZ */
1327      OUT_RELOC(bo, RELOC_WRITE, offset);
1328   }
1329   OUT_BATCH(imm & 0xffffffffu);
1330   OUT_BATCH(imm >> 32);
1331   ADVANCE_BATCH();
1332}
1333