1/*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included
12 * in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
15 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
20 * DEALINGS IN THE SOFTWARE.
21 */
22
23/**
24 * @file iris_state.c
25 *
26 * ============================= GENXML CODE =============================
27 *              [This file is compiled once per generation.]
28 * =======================================================================
29 *
30 * This is the main state upload code.
31 *
32 * Gallium uses Constant State Objects, or CSOs, for most state.  Large,
33 * complex, or highly reusable state can be created once, and bound and
34 * rebound multiple times.  This is modeled with the pipe->create_*_state()
35 * and pipe->bind_*_state() hooks.  Highly dynamic or inexpensive state is
36 * streamed out on the fly, via pipe->set_*_state() hooks.
37 *
38 * OpenGL involves frequently mutating context state, which is mirrored in
39 * core Mesa by highly mutable data structures.  However, most applications
40 * typically draw the same things over and over - from frame to frame, most
41 * of the same objects are still visible and need to be redrawn.  So, rather
42 * than inventing new state all the time, applications usually mutate to swap
43 * between known states that we've seen before.
44 *
45 * Gallium isolates us from this mutation by tracking API state, and
46 * distilling it into a set of Constant State Objects, or CSOs.  Large,
47 * complex, or typically reusable state can be created once, then reused
48 * multiple times.  Drivers can create and store their own associated data.
49 * This create/bind model corresponds to the pipe->create_*_state() and
50 * pipe->bind_*_state() driver hooks.
51 *
52 * Some state is cheap to create, or expected to be highly dynamic.  Rather
53 * than creating and caching piles of CSOs for these, Gallium simply streams
54 * them out, via the pipe->set_*_state() driver hooks.
55 *
56 * To reduce draw time overhead, we try to compute as much state at create
57 * time as possible.  Wherever possible, we translate the Gallium pipe state
58 * to 3DSTATE commands, and store those commands in the CSO.  At draw time,
59 * we can simply memcpy them into a batch buffer.
60 *
61 * No hardware matches the abstraction perfectly, so some commands require
62 * information from multiple CSOs.  In this case, we can store two copies
63 * of the packet (one in each CSO), and simply | together their DWords at
64 * draw time.  Sometimes the second set is trivial (one or two fields), so
65 * we simply pack it at draw time.
66 *
67 * There are two main components in the file below.  First, the CSO hooks
68 * create/bind/track state.  The second are the draw-time upload functions,
69 * iris_upload_render_state() and iris_upload_compute_state(), which read
70 * the context state and emit the commands into the actual batch.
71 */
72
73#include <stdio.h>
74#include <errno.h>
75
76#if HAVE_VALGRIND
77#include <valgrind.h>
78#include <memcheck.h>
79#define VG(x) x
80#ifdef DEBUG
81#define __gen_validate_value(x) VALGRIND_CHECK_MEM_IS_DEFINED(&(x), sizeof(x))
82#endif
83#else
84#define VG(x)
85#endif
86
87#include "pipe/p_defines.h"
88#include "pipe/p_state.h"
89#include "pipe/p_context.h"
90#include "pipe/p_screen.h"
91#include "util/u_dual_blend.h"
92#include "util/u_inlines.h"
93#include "util/format/u_format.h"
94#include "util/u_framebuffer.h"
95#include "util/u_transfer.h"
96#include "util/u_upload_mgr.h"
97#include "util/u_viewport.h"
98#include "util/u_memory.h"
99#include "drm-uapi/i915_drm.h"
100#include "nir.h"
101#include "intel/compiler/brw_compiler.h"
102#include "intel/common/intel_aux_map.h"
103#include "intel/common/intel_l3_config.h"
104#include "intel/common/intel_sample_positions.h"
105#include "iris_batch.h"
106#include "iris_context.h"
107#include "iris_defines.h"
108#include "iris_pipe.h"
109#include "iris_resource.h"
110
111#include "iris_genx_macros.h"
112#include "intel/common/intel_guardband.h"
113
114/**
115 * Statically assert that PIPE_* enums match the hardware packets.
116 * (As long as they match, we don't need to translate them.)
117 */
118UNUSED static void pipe_asserts()
119{
120#define PIPE_ASSERT(x) STATIC_ASSERT((int)x)
121
122   /* pipe_logicop happens to match the hardware. */
123   PIPE_ASSERT(PIPE_LOGICOP_CLEAR == LOGICOP_CLEAR);
124   PIPE_ASSERT(PIPE_LOGICOP_NOR == LOGICOP_NOR);
125   PIPE_ASSERT(PIPE_LOGICOP_AND_INVERTED == LOGICOP_AND_INVERTED);
126   PIPE_ASSERT(PIPE_LOGICOP_COPY_INVERTED == LOGICOP_COPY_INVERTED);
127   PIPE_ASSERT(PIPE_LOGICOP_AND_REVERSE == LOGICOP_AND_REVERSE);
128   PIPE_ASSERT(PIPE_LOGICOP_INVERT == LOGICOP_INVERT);
129   PIPE_ASSERT(PIPE_LOGICOP_XOR == LOGICOP_XOR);
130   PIPE_ASSERT(PIPE_LOGICOP_NAND == LOGICOP_NAND);
131   PIPE_ASSERT(PIPE_LOGICOP_AND == LOGICOP_AND);
132   PIPE_ASSERT(PIPE_LOGICOP_EQUIV == LOGICOP_EQUIV);
133   PIPE_ASSERT(PIPE_LOGICOP_NOOP == LOGICOP_NOOP);
134   PIPE_ASSERT(PIPE_LOGICOP_OR_INVERTED == LOGICOP_OR_INVERTED);
135   PIPE_ASSERT(PIPE_LOGICOP_COPY == LOGICOP_COPY);
136   PIPE_ASSERT(PIPE_LOGICOP_OR_REVERSE == LOGICOP_OR_REVERSE);
137   PIPE_ASSERT(PIPE_LOGICOP_OR == LOGICOP_OR);
138   PIPE_ASSERT(PIPE_LOGICOP_SET == LOGICOP_SET);
139
140   /* pipe_blend_func happens to match the hardware. */
141   PIPE_ASSERT(PIPE_BLENDFACTOR_ONE == BLENDFACTOR_ONE);
142   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_COLOR == BLENDFACTOR_SRC_COLOR);
143   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA == BLENDFACTOR_SRC_ALPHA);
144   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_ALPHA == BLENDFACTOR_DST_ALPHA);
145   PIPE_ASSERT(PIPE_BLENDFACTOR_DST_COLOR == BLENDFACTOR_DST_COLOR);
146   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE == BLENDFACTOR_SRC_ALPHA_SATURATE);
147   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_COLOR == BLENDFACTOR_CONST_COLOR);
148   PIPE_ASSERT(PIPE_BLENDFACTOR_CONST_ALPHA == BLENDFACTOR_CONST_ALPHA);
149   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_COLOR == BLENDFACTOR_SRC1_COLOR);
150   PIPE_ASSERT(PIPE_BLENDFACTOR_SRC1_ALPHA == BLENDFACTOR_SRC1_ALPHA);
151   PIPE_ASSERT(PIPE_BLENDFACTOR_ZERO == BLENDFACTOR_ZERO);
152   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_COLOR == BLENDFACTOR_INV_SRC_COLOR);
153   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC_ALPHA == BLENDFACTOR_INV_SRC_ALPHA);
154   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_ALPHA == BLENDFACTOR_INV_DST_ALPHA);
155   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_DST_COLOR == BLENDFACTOR_INV_DST_COLOR);
156   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_COLOR == BLENDFACTOR_INV_CONST_COLOR);
157   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_CONST_ALPHA == BLENDFACTOR_INV_CONST_ALPHA);
158   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_COLOR == BLENDFACTOR_INV_SRC1_COLOR);
159   PIPE_ASSERT(PIPE_BLENDFACTOR_INV_SRC1_ALPHA == BLENDFACTOR_INV_SRC1_ALPHA);
160
161   /* pipe_blend_func happens to match the hardware. */
162   PIPE_ASSERT(PIPE_BLEND_ADD == BLENDFUNCTION_ADD);
163   PIPE_ASSERT(PIPE_BLEND_SUBTRACT == BLENDFUNCTION_SUBTRACT);
164   PIPE_ASSERT(PIPE_BLEND_REVERSE_SUBTRACT == BLENDFUNCTION_REVERSE_SUBTRACT);
165   PIPE_ASSERT(PIPE_BLEND_MIN == BLENDFUNCTION_MIN);
166   PIPE_ASSERT(PIPE_BLEND_MAX == BLENDFUNCTION_MAX);
167
168   /* pipe_stencil_op happens to match the hardware. */
169   PIPE_ASSERT(PIPE_STENCIL_OP_KEEP == STENCILOP_KEEP);
170   PIPE_ASSERT(PIPE_STENCIL_OP_ZERO == STENCILOP_ZERO);
171   PIPE_ASSERT(PIPE_STENCIL_OP_REPLACE == STENCILOP_REPLACE);
172   PIPE_ASSERT(PIPE_STENCIL_OP_INCR == STENCILOP_INCRSAT);
173   PIPE_ASSERT(PIPE_STENCIL_OP_DECR == STENCILOP_DECRSAT);
174   PIPE_ASSERT(PIPE_STENCIL_OP_INCR_WRAP == STENCILOP_INCR);
175   PIPE_ASSERT(PIPE_STENCIL_OP_DECR_WRAP == STENCILOP_DECR);
176   PIPE_ASSERT(PIPE_STENCIL_OP_INVERT == STENCILOP_INVERT);
177
178   /* pipe_sprite_coord_mode happens to match 3DSTATE_SBE */
179   PIPE_ASSERT(PIPE_SPRITE_COORD_UPPER_LEFT == UPPERLEFT);
180   PIPE_ASSERT(PIPE_SPRITE_COORD_LOWER_LEFT == LOWERLEFT);
181#undef PIPE_ASSERT
182}
183
184static unsigned
185translate_prim_type(enum pipe_prim_type prim, uint8_t verts_per_patch)
186{
187   static const unsigned map[] = {
188      [PIPE_PRIM_POINTS]                   = _3DPRIM_POINTLIST,
189      [PIPE_PRIM_LINES]                    = _3DPRIM_LINELIST,
190      [PIPE_PRIM_LINE_LOOP]                = _3DPRIM_LINELOOP,
191      [PIPE_PRIM_LINE_STRIP]               = _3DPRIM_LINESTRIP,
192      [PIPE_PRIM_TRIANGLES]                = _3DPRIM_TRILIST,
193      [PIPE_PRIM_TRIANGLE_STRIP]           = _3DPRIM_TRISTRIP,
194      [PIPE_PRIM_TRIANGLE_FAN]             = _3DPRIM_TRIFAN,
195      [PIPE_PRIM_QUADS]                    = _3DPRIM_QUADLIST,
196      [PIPE_PRIM_QUAD_STRIP]               = _3DPRIM_QUADSTRIP,
197      [PIPE_PRIM_POLYGON]                  = _3DPRIM_POLYGON,
198      [PIPE_PRIM_LINES_ADJACENCY]          = _3DPRIM_LINELIST_ADJ,
199      [PIPE_PRIM_LINE_STRIP_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
200      [PIPE_PRIM_TRIANGLES_ADJACENCY]      = _3DPRIM_TRILIST_ADJ,
201      [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
202      [PIPE_PRIM_PATCHES]                  = _3DPRIM_PATCHLIST_1 - 1,
203   };
204
205   return map[prim] + (prim == PIPE_PRIM_PATCHES ? verts_per_patch : 0);
206}
207
208static unsigned
209translate_compare_func(enum pipe_compare_func pipe_func)
210{
211   static const unsigned map[] = {
212      [PIPE_FUNC_NEVER]    = COMPAREFUNCTION_NEVER,
213      [PIPE_FUNC_LESS]     = COMPAREFUNCTION_LESS,
214      [PIPE_FUNC_EQUAL]    = COMPAREFUNCTION_EQUAL,
215      [PIPE_FUNC_LEQUAL]   = COMPAREFUNCTION_LEQUAL,
216      [PIPE_FUNC_GREATER]  = COMPAREFUNCTION_GREATER,
217      [PIPE_FUNC_NOTEQUAL] = COMPAREFUNCTION_NOTEQUAL,
218      [PIPE_FUNC_GEQUAL]   = COMPAREFUNCTION_GEQUAL,
219      [PIPE_FUNC_ALWAYS]   = COMPAREFUNCTION_ALWAYS,
220   };
221   return map[pipe_func];
222}
223
224static unsigned
225translate_shadow_func(enum pipe_compare_func pipe_func)
226{
227   /* Gallium specifies the result of shadow comparisons as:
228    *
229    *    1 if ref <op> texel,
230    *    0 otherwise.
231    *
232    * The hardware does:
233    *
234    *    0 if texel <op> ref,
235    *    1 otherwise.
236    *
237    * So we need to flip the operator and also negate.
238    */
239   static const unsigned map[] = {
240      [PIPE_FUNC_NEVER]    = PREFILTEROP_ALWAYS,
241      [PIPE_FUNC_LESS]     = PREFILTEROP_LEQUAL,
242      [PIPE_FUNC_EQUAL]    = PREFILTEROP_NOTEQUAL,
243      [PIPE_FUNC_LEQUAL]   = PREFILTEROP_LESS,
244      [PIPE_FUNC_GREATER]  = PREFILTEROP_GEQUAL,
245      [PIPE_FUNC_NOTEQUAL] = PREFILTEROP_EQUAL,
246      [PIPE_FUNC_GEQUAL]   = PREFILTEROP_GREATER,
247      [PIPE_FUNC_ALWAYS]   = PREFILTEROP_NEVER,
248   };
249   return map[pipe_func];
250}
251
252static unsigned
253translate_cull_mode(unsigned pipe_face)
254{
255   static const unsigned map[4] = {
256      [PIPE_FACE_NONE]           = CULLMODE_NONE,
257      [PIPE_FACE_FRONT]          = CULLMODE_FRONT,
258      [PIPE_FACE_BACK]           = CULLMODE_BACK,
259      [PIPE_FACE_FRONT_AND_BACK] = CULLMODE_BOTH,
260   };
261   return map[pipe_face];
262}
263
264static unsigned
265translate_fill_mode(unsigned pipe_polymode)
266{
267   static const unsigned map[4] = {
268      [PIPE_POLYGON_MODE_FILL]           = FILL_MODE_SOLID,
269      [PIPE_POLYGON_MODE_LINE]           = FILL_MODE_WIREFRAME,
270      [PIPE_POLYGON_MODE_POINT]          = FILL_MODE_POINT,
271      [PIPE_POLYGON_MODE_FILL_RECTANGLE] = FILL_MODE_SOLID,
272   };
273   return map[pipe_polymode];
274}
275
276static unsigned
277translate_mip_filter(enum pipe_tex_mipfilter pipe_mip)
278{
279   static const unsigned map[] = {
280      [PIPE_TEX_MIPFILTER_NEAREST] = MIPFILTER_NEAREST,
281      [PIPE_TEX_MIPFILTER_LINEAR]  = MIPFILTER_LINEAR,
282      [PIPE_TEX_MIPFILTER_NONE]    = MIPFILTER_NONE,
283   };
284   return map[pipe_mip];
285}
286
287static uint32_t
288translate_wrap(unsigned pipe_wrap)
289{
290   static const unsigned map[] = {
291      [PIPE_TEX_WRAP_REPEAT]                 = TCM_WRAP,
292      [PIPE_TEX_WRAP_CLAMP]                  = TCM_HALF_BORDER,
293      [PIPE_TEX_WRAP_CLAMP_TO_EDGE]          = TCM_CLAMP,
294      [PIPE_TEX_WRAP_CLAMP_TO_BORDER]        = TCM_CLAMP_BORDER,
295      [PIPE_TEX_WRAP_MIRROR_REPEAT]          = TCM_MIRROR,
296      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE]   = TCM_MIRROR_ONCE,
297
298      /* These are unsupported. */
299      [PIPE_TEX_WRAP_MIRROR_CLAMP]           = -1,
300      [PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER] = -1,
301   };
302   return map[pipe_wrap];
303}
304
305/**
306 * Allocate space for some indirect state.
307 *
308 * Return a pointer to the map (to fill it out) and a state ref (for
309 * referring to the state in GPU commands).
310 */
311static void *
312upload_state(struct u_upload_mgr *uploader,
313             struct iris_state_ref *ref,
314             unsigned size,
315             unsigned alignment)
316{
317   void *p = NULL;
318   u_upload_alloc(uploader, 0, size, alignment, &ref->offset, &ref->res, &p);
319   return p;
320}
321
322/**
323 * Stream out temporary/short-lived state.
324 *
325 * This allocates space, pins the BO, and includes the BO address in the
326 * returned offset (which works because all state lives in 32-bit memory
327 * zones).
328 */
329static uint32_t *
330stream_state(struct iris_batch *batch,
331             struct u_upload_mgr *uploader,
332             struct pipe_resource **out_res,
333             unsigned size,
334             unsigned alignment,
335             uint32_t *out_offset)
336{
337   void *ptr = NULL;
338
339   u_upload_alloc(uploader, 0, size, alignment, out_offset, out_res, &ptr);
340
341   struct iris_bo *bo = iris_resource_bo(*out_res);
342   iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
343
344   iris_record_state_size(batch->state_sizes,
345                          bo->address + *out_offset, size);
346
347   *out_offset += iris_bo_offset_from_base_address(bo);
348
349   return ptr;
350}
351
/**
 * stream_state() + memcpy.
 *
 * Allocates \p size bytes of streamed state with the given \p alignment,
 * copies \p data into it, and returns the offset to use in GPU commands.
 *
 * If stream_state() yields no mapping the copy is skipped; the returned
 * offset is then whatever stream_state() left in the (zero-initialized)
 * local.
 */
static uint32_t
emit_state(struct iris_batch *batch,
           struct u_upload_mgr *uploader,
           struct pipe_resource **out_res,
           const void *data,
           unsigned size,
           unsigned alignment)
{
   /* uint32_t to match both stream_state()'s out parameter and our own
    * return type.
    */
   uint32_t offset = 0;
   uint32_t *map =
      stream_state(batch, uploader, out_res, size, alignment, &offset);

   if (map)
      memcpy(map, data, size);

   return offset;
}
372
/**
 * Has field 'x' changed between 'old_cso' and 'new_cso'?
 *
 * Both macros expect pointer variables named 'old_cso' and 'new_cso' to be
 * in scope at the point of use.  A NULL 'old_cso' always counts as changed.
 * (Callers may want to set some dirty flags when this is true.)
 *
 * cso_changed() compares the field with !=, while cso_changed_memcmp()
 * compares sizeof(old_cso->x) bytes with memcmp() and is meant for array
 * fields.
 */
#define cso_changed(x) \
   ((old_cso) == NULL || (new_cso)->x != (old_cso)->x)
#define cso_changed_memcmp(x) \
   ((old_cso) == NULL || \
    memcmp((new_cso)->x, (old_cso)->x, sizeof((old_cso)->x)) != 0)
381
382static void
383flush_before_state_base_change(struct iris_batch *batch)
384{
385   const struct intel_device_info *devinfo = &batch->screen->devinfo;
386
387   /* Flush before emitting STATE_BASE_ADDRESS.
388    *
389    * This isn't documented anywhere in the PRM.  However, it seems to be
390    * necessary prior to changing the surface state base address.  We've
391    * seen issues in Vulkan where we get GPU hangs when using multi-level
392    * command buffers which clear depth, reset state base address, and then
393    * go render stuff.
394    *
395    * Normally, in GL, we would trust the kernel to do sufficient stalls
396    * and flushes prior to executing our batch.  However, it doesn't seem
397    * as if the kernel's flushing is always sufficient and we don't want to
398    * rely on it.
399    *
400    * We make this an end-of-pipe sync instead of a normal flush because we
401    * do not know the current status of the GPU.  On Haswell at least,
402    * having a fast-clear operation in flight at the same time as a normal
403    * rendering operation can cause hangs.  Since the kernel's flushing is
404    * insufficient, we need to ensure that any rendering operations from
405    * other processes are definitely complete before we try to do our own
406    * rendering.  It's a bit of a big hammer but it appears to work.
407    */
408   iris_emit_end_of_pipe_sync(batch,
409                              "change STATE_BASE_ADDRESS (flushes)",
410                              PIPE_CONTROL_RENDER_TARGET_FLUSH |
411                              PIPE_CONTROL_DEPTH_CACHE_FLUSH |
412                              PIPE_CONTROL_DATA_CACHE_FLUSH |
413                              /* Wa_1606662791:
414                               *
415                               *   Software must program PIPE_CONTROL command
416                               *   with "HDC Pipeline Flush" prior to
417                               *   programming of the below two non-pipeline
418                               *   state :
419                               *      * STATE_BASE_ADDRESS
420                               *      * 3DSTATE_BINDING_TABLE_POOL_ALLOC
421                               */
422                              ((GFX_VER == 12 && devinfo->revision == 0 /* A0 */ ?
423                                PIPE_CONTROL_FLUSH_HDC : 0)));
424}
425
426static void
427flush_after_state_base_change(struct iris_batch *batch)
428{
429   /* After re-setting the surface state base address, we have to do some
430    * cache flusing so that the sampler engine will pick up the new
431    * SURFACE_STATE objects and binding tables. From the Broadwell PRM,
432    * Shared Function > 3D Sampler > State > State Caching (page 96):
433    *
434    *    Coherency with system memory in the state cache, like the texture
435    *    cache is handled partially by software. It is expected that the
436    *    command stream or shader will issue Cache Flush operation or
437    *    Cache_Flush sampler message to ensure that the L1 cache remains
438    *    coherent with system memory.
439    *
440    *    [...]
441    *
442    *    Whenever the value of the Dynamic_State_Base_Addr,
443    *    Surface_State_Base_Addr are altered, the L1 state cache must be
444    *    invalidated to ensure the new surface or sampler state is fetched
445    *    from system memory.
446    *
447    * The PIPE_CONTROL command has a "State Cache Invalidation Enable" bit
448    * which, according the PIPE_CONTROL instruction documentation in the
449    * Broadwell PRM:
450    *
451    *    Setting this bit is independent of any other bit in this packet.
452    *    This bit controls the invalidation of the L1 and L2 state caches
453    *    at the top of the pipe i.e. at the parsing time.
454    *
455    * Unfortunately, experimentation seems to indicate that state cache
456    * invalidation through a PIPE_CONTROL does nothing whatsoever in
457    * regards to surface state and binding tables.  In stead, it seems that
458    * invalidating the texture cache is what is actually needed.
459    *
460    * XXX:  As far as we have been able to determine through
461    * experimentation, shows that flush the texture cache appears to be
462    * sufficient.  The theory here is that all of the sampling/rendering
463    * units cache the binding table in the texture cache.  However, we have
464    * yet to be able to actually confirm this.
465    */
466   iris_emit_end_of_pipe_sync(batch,
467                              "change STATE_BASE_ADDRESS (invalidates)",
468                              PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
469                              PIPE_CONTROL_CONST_CACHE_INVALIDATE |
470                              PIPE_CONTROL_STATE_CACHE_INVALIDATE);
471}
472
473static void
474_iris_emit_lri(struct iris_batch *batch, uint32_t reg, uint32_t val)
475{
476   iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
477      lri.RegisterOffset = reg;
478      lri.DataDWord      = val;
479   }
480}
481#define iris_emit_lri(b, r, v) _iris_emit_lri(b, GENX(r##_num), v)
482
483static void
484_iris_emit_lrr(struct iris_batch *batch, uint32_t dst, uint32_t src)
485{
486   iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_REG), lrr) {
487      lrr.SourceRegisterAddress = src;
488      lrr.DestinationRegisterAddress = dst;
489   }
490}
491
/**
 * Copy one 32-bit register, \p src, into another, \p dst, using a single
 * MI_LOAD_REGISTER_REG.
 */
static void
iris_load_register_reg32(struct iris_batch *batch,
                         uint32_t dst, uint32_t src)
{
   _iris_emit_lrr(batch, dst, src);
}
498
/**
 * Copy a 64-bit register pair from \p src to \p dst as two 32-bit
 * register-to-register loads: first at offset +0, then at offset +4.
 */
static void
iris_load_register_reg64(struct iris_batch *batch,
                         uint32_t dst, uint32_t src)
{
   for (uint32_t byte_off = 0; byte_off < 8; byte_off += 4)
      _iris_emit_lrr(batch, dst + byte_off, src + byte_off);
}
506
/**
 * Load the 32-bit immediate \p val into the register at \p reg using a
 * single MI_LOAD_REGISTER_IMM.
 */
static void
iris_load_register_imm32(struct iris_batch *batch,
                         uint32_t reg, uint32_t val)
{
   _iris_emit_lri(batch, reg, val);
}
513
/**
 * Load the 64-bit immediate \p val into a register pair: the low 32 bits go
 * to \p reg, the high 32 bits to \p reg + 4.
 */
static void
iris_load_register_imm64(struct iris_batch *batch, uint32_t reg,
                         uint64_t val)
{
   const uint32_t low_dword  = (uint32_t) val;
   const uint32_t high_dword = (uint32_t) (val >> 32);

   _iris_emit_lri(batch, reg, low_dword);
   _iris_emit_lri(batch, reg + 4, high_dword);
}
521
522/**
523 * Emit MI_LOAD_REGISTER_MEM to load a 32-bit MMIO register from a buffer.
524 */
525static void
526iris_load_register_mem32(struct iris_batch *batch, uint32_t reg,
527                         struct iris_bo *bo, uint32_t offset)
528{
529   iris_batch_sync_region_start(batch);
530   iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
531      lrm.RegisterAddress = reg;
532      lrm.MemoryAddress = ro_bo(bo, offset);
533   }
534   iris_batch_sync_region_end(batch);
535}
536
/**
 * Load a 64-bit value from a buffer into an MMIO register pair.
 *
 * This is done with two MI_LOAD_REGISTER_MEM commands: the dword at
 * \p offset goes to \p reg, then the dword at \p offset + 4 goes to
 * \p reg + 4.
 */
static void
iris_load_register_mem64(struct iris_batch *batch, uint32_t reg,
                         struct iris_bo *bo, uint32_t offset)
{
   for (uint32_t byte_off = 0; byte_off < 8; byte_off += 4)
      iris_load_register_mem32(batch, reg + byte_off, bo, offset + byte_off);
}
548
549static void
550iris_store_register_mem32(struct iris_batch *batch, uint32_t reg,
551                          struct iris_bo *bo, uint32_t offset,
552                          bool predicated)
553{
554   iris_batch_sync_region_start(batch);
555   iris_emit_cmd(batch, GENX(MI_STORE_REGISTER_MEM), srm) {
556      srm.RegisterAddress = reg;
557      srm.MemoryAddress = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
558      srm.PredicateEnable = predicated;
559   }
560   iris_batch_sync_region_end(batch);
561}
562
/**
 * Store a 64-bit MMIO register pair to a buffer.
 *
 * This is done with two 32-bit stores: \p reg to \p offset, then
 * \p reg + 4 to \p offset + 4.  Both honour \p predicated.
 */
static void
iris_store_register_mem64(struct iris_batch *batch, uint32_t reg,
                          struct iris_bo *bo, uint32_t offset,
                          bool predicated)
{
   for (uint32_t byte_off = 0; byte_off < 8; byte_off += 4) {
      iris_store_register_mem32(batch, reg + byte_off,
                                bo, offset + byte_off, predicated);
   }
}
571
572static void
573iris_store_data_imm32(struct iris_batch *batch,
574                      struct iris_bo *bo, uint32_t offset,
575                      uint32_t imm)
576{
577   iris_batch_sync_region_start(batch);
578   iris_emit_cmd(batch, GENX(MI_STORE_DATA_IMM), sdi) {
579      sdi.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
580      sdi.ImmediateData = imm;
581   }
582   iris_batch_sync_region_end(batch);
583}
584
585static void
586iris_store_data_imm64(struct iris_batch *batch,
587                      struct iris_bo *bo, uint32_t offset,
588                      uint64_t imm)
589{
590   /* Can't use iris_emit_cmd because MI_STORE_DATA_IMM has a length of
591    * 2 in genxml but it's actually variable length and we need 5 DWords.
592    */
593   void *map = iris_get_command_space(batch, 4 * 5);
594   iris_batch_sync_region_start(batch);
595   _iris_pack_command(batch, GENX(MI_STORE_DATA_IMM), map, sdi) {
596      sdi.DWordLength = 5 - 2;
597      sdi.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
598      sdi.ImmediateData = imm;
599   }
600   iris_batch_sync_region_end(batch);
601}
602
603static void
604iris_copy_mem_mem(struct iris_batch *batch,
605                  struct iris_bo *dst_bo, uint32_t dst_offset,
606                  struct iris_bo *src_bo, uint32_t src_offset,
607                  unsigned bytes)
608{
609   /* MI_COPY_MEM_MEM operates on DWords. */
610   assert(bytes % 4 == 0);
611   assert(dst_offset % 4 == 0);
612   assert(src_offset % 4 == 0);
613   iris_batch_sync_region_start(batch);
614
615   for (unsigned i = 0; i < bytes; i += 4) {
616      iris_emit_cmd(batch, GENX(MI_COPY_MEM_MEM), cp) {
617         cp.DestinationMemoryAddress = rw_bo(dst_bo, dst_offset + i,
618                                             IRIS_DOMAIN_OTHER_WRITE);
619         cp.SourceMemoryAddress = ro_bo(src_bo, src_offset + i);
620      }
621   }
622
623   iris_batch_sync_region_end(batch);
624}
625
626static void
627emit_pipeline_select(struct iris_batch *batch, uint32_t pipeline)
628{
629#if GFX_VER >= 8 && GFX_VER < 10
630   /* From the Broadwell PRM, Volume 2a: Instructions, PIPELINE_SELECT:
631    *
632    *   Software must clear the COLOR_CALC_STATE Valid field in
633    *   3DSTATE_CC_STATE_POINTERS command prior to send a PIPELINE_SELECT
634    *   with Pipeline Select set to GPGPU.
635    *
636    * The internal hardware docs recommend the same workaround for Gfx9
637    * hardware too.
638    */
639   if (pipeline == GPGPU)
640      iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), t);
641#endif
642
643
644   /* From "BXML » GT » MI » vol1a GPU Overview » [Instruction]
645    * PIPELINE_SELECT [DevBWR+]":
646    *
647    *    "Project: DEVSNB+
648    *
649    *     Software must ensure all the write caches are flushed through a
650    *     stalling PIPE_CONTROL command followed by another PIPE_CONTROL
651    *     command to invalidate read only caches prior to programming
652    *     MI_PIPELINE_SELECT command to change the Pipeline Select Mode."
653    */
654    iris_emit_pipe_control_flush(batch,
655                                 "workaround: PIPELINE_SELECT flushes (1/2)",
656                                 PIPE_CONTROL_RENDER_TARGET_FLUSH |
657                                 PIPE_CONTROL_DEPTH_CACHE_FLUSH |
658                                 PIPE_CONTROL_DATA_CACHE_FLUSH |
659                                 PIPE_CONTROL_CS_STALL);
660
661    iris_emit_pipe_control_flush(batch,
662                                 "workaround: PIPELINE_SELECT flushes (2/2)",
663                                 PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
664                                 PIPE_CONTROL_CONST_CACHE_INVALIDATE |
665                                 PIPE_CONTROL_STATE_CACHE_INVALIDATE |
666                                 PIPE_CONTROL_INSTRUCTION_INVALIDATE);
667
668   iris_emit_cmd(batch, GENX(PIPELINE_SELECT), sel) {
669#if GFX_VER >= 9
670      sel.MaskBits = GFX_VER >= 12 ? 0x13 : 3;
671      sel.MediaSamplerDOPClockGateEnable = GFX_VER >= 12;
672#endif
673      sel.PipelineSelection = pipeline;
674   }
675}
676
677UNUSED static void
678init_glk_barrier_mode(struct iris_batch *batch, uint32_t value)
679{
680#if GFX_VER == 9
681   /* Project: DevGLK
682    *
683    *    "This chicken bit works around a hardware issue with barrier
684    *     logic encountered when switching between GPGPU and 3D pipelines.
685    *     To workaround the issue, this mode bit should be set after a
686    *     pipeline is selected."
687    */
688   iris_emit_reg(batch, GENX(SLICE_COMMON_ECO_CHICKEN1), reg) {
689      reg.GLKBarrierMode = value;
690      reg.GLKBarrierModeMask = 1;
691   }
692#endif
693}
694
695static void
696init_state_base_address(struct iris_batch *batch)
697{
698   struct isl_device *isl_dev = &batch->screen->isl_dev;
699   uint32_t mocs = isl_mocs(isl_dev, 0, false);
700   flush_before_state_base_change(batch);
701
702   /* We program most base addresses once at context initialization time.
703    * Each base address points at a 4GB memory zone, and never needs to
704    * change.  See iris_bufmgr.h for a description of the memory zones.
705    *
706    * The one exception is Surface State Base Address, which needs to be
707    * updated occasionally.  See iris_binder.c for the details there.
708    */
709   iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
710      sba.GeneralStateMOCS            = mocs;
711      sba.StatelessDataPortAccessMOCS = mocs;
712      sba.DynamicStateMOCS            = mocs;
713      sba.IndirectObjectMOCS          = mocs;
714      sba.InstructionMOCS             = mocs;
715      sba.SurfaceStateMOCS            = mocs;
716
717      sba.GeneralStateBaseAddressModifyEnable   = true;
718      sba.DynamicStateBaseAddressModifyEnable   = true;
719      sba.IndirectObjectBaseAddressModifyEnable = true;
720      sba.InstructionBaseAddressModifyEnable    = true;
721      sba.GeneralStateBufferSizeModifyEnable    = true;
722      sba.DynamicStateBufferSizeModifyEnable    = true;
723#if (GFX_VER >= 9)
724      sba.BindlessSurfaceStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_BINDLESS_START);
725      sba.BindlessSurfaceStateSize = (IRIS_BINDLESS_SIZE >> 12) - 1;
726      sba.BindlessSurfaceStateBaseAddressModifyEnable = true;
727      sba.BindlessSurfaceStateMOCS    = mocs;
728#endif
729      sba.IndirectObjectBufferSizeModifyEnable  = true;
730      sba.InstructionBuffersizeModifyEnable     = true;
731
732      sba.InstructionBaseAddress  = ro_bo(NULL, IRIS_MEMZONE_SHADER_START);
733      sba.DynamicStateBaseAddress = ro_bo(NULL, IRIS_MEMZONE_DYNAMIC_START);
734
735      sba.GeneralStateBufferSize   = 0xfffff;
736      sba.IndirectObjectBufferSize = 0xfffff;
737      sba.InstructionBufferSize    = 0xfffff;
738      sba.DynamicStateBufferSize   = 0xfffff;
739   }
740
741   flush_after_state_base_change(batch);
742}
743
744static void
745iris_emit_l3_config(struct iris_batch *batch,
746                    const struct intel_l3_config *cfg)
747{
748   assert(cfg || GFX_VER >= 12);
749
750#if GFX_VER >= 12
751#define L3_ALLOCATION_REG GENX(L3ALLOC)
752#define L3_ALLOCATION_REG_num GENX(L3ALLOC_num)
753#else
754#define L3_ALLOCATION_REG GENX(L3CNTLREG)
755#define L3_ALLOCATION_REG_num GENX(L3CNTLREG_num)
756#endif
757
758   iris_emit_reg(batch, L3_ALLOCATION_REG, reg) {
759#if GFX_VER < 11
760      reg.SLMEnable = cfg->n[INTEL_L3P_SLM] > 0;
761#endif
762#if GFX_VER == 11
763      /* Wa_1406697149: Bit 9 "Error Detection Behavior Control" must be set
764       * in L3CNTLREG register. The default setting of the bit is not the
765       * desirable behavior.
766       */
767      reg.ErrorDetectionBehaviorControl = true;
768      reg.UseFullWays = true;
769#endif
770      if (GFX_VER < 12 || cfg) {
771         reg.URBAllocation = cfg->n[INTEL_L3P_URB];
772         reg.ROAllocation = cfg->n[INTEL_L3P_RO];
773         reg.DCAllocation = cfg->n[INTEL_L3P_DC];
774         reg.AllAllocation = cfg->n[INTEL_L3P_ALL];
775      } else {
776#if GFX_VER >= 12
777         reg.L3FullWayAllocationEnable = true;
778#endif
779      }
780   }
781}
782
#if GFX_VER == 9
/**
 * Turn object-level preemption on or off (Gfx9 only) by programming
 * CS_CHICKEN1.ReplayMode.
 */
static void
iris_enable_obj_preemption(struct iris_batch *batch, bool enable)
{
   const char *reason;

   if (enable)
      reason = "enable preemption";
   else
      reason = "disable preemption";

   /* A fixed function pipe flush is required before modifying this field */
   iris_emit_end_of_pipe_sync(batch, reason,
                              PIPE_CONTROL_RENDER_TARGET_FLUSH);

   /* Set the object-level preemption bit; ReplayModeMask is set alongside
    * the new ReplayMode value.
    */
   iris_emit_reg(batch, GENX(CS_CHICKEN1), chicken) {
      chicken.ReplayModeMask = true;
      chicken.ReplayMode = enable;
   }
}
#endif
799
/**
 * Fill \p p with an \p n x \p m (row-major) pixel hashing table that can be
 * used as a slice, subslice or pixel pipe hashing table.  The table is the
 * cyclic repetition of a fixed pattern whose periodicity is \p period.
 *
 * When \p index equals \p period the result is a 2-way table in which
 * indices 0 and 1 occupy these fractions of the entries:
 *
 *   p_0 = ceil(period / 2) / period
 *   p_1 = floor(period / 2) / period
 *
 * When \p index is even and smaller than \p period the result is a 3-way
 * table in which indices 0, 1 and 2 occupy these fractions of the entries:
 *
 *   p_0 = (ceil(period / 2) - 1) / period
 *   p_1 = floor(period / 2) / period
 *   p_2 = 1 / period
 *
 * The fractions above hold for \p flip equal to 0; with \p flip equal to 1,
 * p_0 and p_1 are swapped.  Note that in the context of pixel pipe hashing
 * \p flip can always be 0 on Gfx12 platforms, since the hardware
 * transparently remaps logical indices found on the table to physical pixel
 * pipe indices from the highest to lowest EU count.
 */
UNUSED static void
calculate_pixel_hashing_table(unsigned n, unsigned m,
                              unsigned period, unsigned index, bool flip,
                              uint32_t *p)
{
   for (unsigned row = 0; row < n; row++) {
      uint32_t *out = &p[m * row];

      for (unsigned col = 0; col < m; col++) {
         /* Position of this entry within the repeating pattern. */
         const unsigned phase = (row + col) % period;

         if (phase == index)
            out[col] = 2;
         else
            out[col] = (phase & 1) ^ flip;
      }
   }
}
838
839#if GFX_VER == 11
840static void
841gfx11_upload_pixel_hashing_tables(struct iris_batch *batch)
842{
843   const struct intel_device_info *devinfo = &batch->screen->devinfo;
844   assert(devinfo->ppipe_subslices[2] == 0);
845
846   if (devinfo->ppipe_subslices[0] == devinfo->ppipe_subslices[1])
847      return;
848
849   struct iris_context *ice = batch->ice;
850   assert(&ice->batches[IRIS_BATCH_RENDER] == batch);
851
852   unsigned size = GENX(SLICE_HASH_TABLE_length) * 4;
853   uint32_t hash_address;
854   struct pipe_resource *tmp = NULL;
855   uint32_t *map =
856      stream_state(batch, ice->state.dynamic_uploader, &tmp,
857                   size, 64, &hash_address);
858   pipe_resource_reference(&tmp, NULL);
859
860   const bool flip = devinfo->ppipe_subslices[0] < devinfo->ppipe_subslices[1];
861   struct GENX(SLICE_HASH_TABLE) table;
862   calculate_pixel_hashing_table(16, 16, 3, 3, flip, table.Entry[0]);
863
864   GENX(SLICE_HASH_TABLE_pack)(NULL, map, &table);
865
866   iris_emit_cmd(batch, GENX(3DSTATE_SLICE_TABLE_STATE_POINTERS), ptr) {
867      ptr.SliceHashStatePointerValid = true;
868      ptr.SliceHashTableStatePointer = hash_address;
869   }
870
871   iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), mode) {
872      mode.SliceHashingTableEnable = true;
873   }
874}
875#elif GFX_VERx10 == 120
876static void
877gfx12_upload_pixel_hashing_tables(struct iris_batch *batch)
878{
879   const struct intel_device_info *devinfo = &batch->screen->devinfo;
880   /* For each n calculate ppipes_of[n], equal to the number of pixel pipes
881    * present with n active dual subslices.
882    */
883   unsigned ppipes_of[3] = {};
884
885   for (unsigned n = 0; n < ARRAY_SIZE(ppipes_of); n++) {
886      for (unsigned p = 0; p < ARRAY_SIZE(devinfo->ppipe_subslices); p++)
887         ppipes_of[n] += (devinfo->ppipe_subslices[p] == n);
888   }
889
890   /* Gfx12 has three pixel pipes. */
891   assert(ppipes_of[0] + ppipes_of[1] + ppipes_of[2] == 3);
892
893   if (ppipes_of[2] == 3 || ppipes_of[0] == 2) {
894      /* All three pixel pipes have the maximum number of active dual
895       * subslices, or there is only one active pixel pipe: Nothing to do.
896       */
897      return;
898   }
899
900   iris_emit_cmd(batch, GENX(3DSTATE_SUBSLICE_HASH_TABLE), p) {
901      p.SliceHashControl[0] = TABLE_0;
902
903      if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
904         calculate_pixel_hashing_table(8, 16, 2, 2, 0, p.TwoWayTableEntry[0]);
905      else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
906         calculate_pixel_hashing_table(8, 16, 3, 3, 0, p.TwoWayTableEntry[0]);
907
908      if (ppipes_of[2] == 2 && ppipes_of[1] == 1)
909         calculate_pixel_hashing_table(8, 16, 5, 4, 0, p.ThreeWayTableEntry[0]);
910      else if (ppipes_of[2] == 2 && ppipes_of[0] == 1)
911         calculate_pixel_hashing_table(8, 16, 2, 2, 0, p.ThreeWayTableEntry[0]);
912      else if (ppipes_of[2] == 1 && ppipes_of[1] == 1 && ppipes_of[0] == 1)
913         calculate_pixel_hashing_table(8, 16, 3, 3, 0, p.ThreeWayTableEntry[0]);
914      else
915         unreachable("Illegal fusing.");
916   }
917
918   iris_emit_cmd(batch, GENX(3DSTATE_3D_MODE), p) {
919      p.SubsliceHashingTableEnable = true;
920      p.SubsliceHashingTableEnableMask = true;
921   }
922}
923#endif
924
925static void
926iris_alloc_push_constants(struct iris_batch *batch)
927{
928   const struct intel_device_info *devinfo = &batch->screen->devinfo;
929
930   /* For now, we set a static partitioning of the push constant area,
931    * assuming that all stages could be in use.
932    *
933    * TODO: Try lazily allocating the HS/DS/GS sections as needed, and
934    *       see if that improves performance by offering more space to
935    *       the VS/FS when those aren't in use.  Also, try dynamically
936    *       enabling/disabling it like i965 does.  This would be more
937    *       stalls and may not actually help; we don't know yet.
938    */
939
940   /* Divide as equally as possible with any remainder given to FRAGMENT. */
941   const unsigned push_constant_kb = devinfo->max_constant_urb_size_kb;
942   const unsigned stage_size = push_constant_kb / 5;
943   const unsigned frag_size = push_constant_kb - 4 * stage_size;
944
945   for (int i = 0; i <= MESA_SHADER_FRAGMENT; i++) {
946      iris_emit_cmd(batch, GENX(3DSTATE_PUSH_CONSTANT_ALLOC_VS), alloc) {
947         alloc._3DCommandSubOpcode = 18 + i;
948         alloc.ConstantBufferOffset = stage_size * i;
949         alloc.ConstantBufferSize = i == MESA_SHADER_FRAGMENT ? frag_size : stage_size;
950      }
951   }
952}
953
#if GFX_VER >= 12
/* Forward declaration: the context init functions below call this before
 * its definition is seen.
 */
static void init_aux_map_state(struct iris_batch *batch);
#endif
958
/**
 * Emit the initial GPU state shared by every kind of context.
 *
 * Both the render and the compute context setup call this.  It only emits
 * anything on Gfx11.
 */
static void
iris_init_common_context(struct iris_batch *batch)
{
#if GFX_VER == 11
   iris_emit_reg(batch, GENX(SAMPLER_MODE), reg) {
      reg.HeaderlessMessageforPreemptableContextsMask = 1;
      reg.HeaderlessMessageforPreemptableContexts = 1;
   }

   /* HALF_SLICE_CHICKEN7 needs bit 1 set. */
   iris_emit_reg(batch, GENX(HALF_SLICE_CHICKEN7), reg) {
      reg.EnabledTexelOffsetPrecisionFixMask = 1;
      reg.EnabledTexelOffsetPrecisionFix = 1;
   }
#endif
}
980
981/**
982 * Upload the initial GPU state for a render context.
983 *
984 * This sets some invariant state that needs to be programmed a particular
985 * way, but we never actually change.
986 */
987static void
988iris_init_render_context(struct iris_batch *batch)
989{
990   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
991
992   iris_batch_sync_region_start(batch);
993
994   emit_pipeline_select(batch, _3D);
995
996   iris_emit_l3_config(batch, batch->screen->l3_config_3d);
997
998   init_state_base_address(batch);
999
1000   iris_init_common_context(batch);
1001
1002#if GFX_VER >= 9
1003   iris_emit_reg(batch, GENX(CS_DEBUG_MODE2), reg) {
1004      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1005      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1006   }
1007#else
1008   iris_emit_reg(batch, GENX(INSTPM), reg) {
1009      reg.CONSTANT_BUFFERAddressOffsetDisable = true;
1010      reg.CONSTANT_BUFFERAddressOffsetDisableMask = true;
1011   }
1012#endif
1013
1014#if GFX_VER == 9
1015   iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1016      reg.FloatBlendOptimizationEnable = true;
1017      reg.FloatBlendOptimizationEnableMask = true;
1018      reg.MSCRAWHazardAvoidanceBit = true;
1019      reg.MSCRAWHazardAvoidanceBitMask = true;
1020      reg.PartialResolveDisableInVC = true;
1021      reg.PartialResolveDisableInVCMask = true;
1022   }
1023
1024   if (devinfo->is_geminilake)
1025      init_glk_barrier_mode(batch, GLK_BARRIER_MODE_3D_HULL);
1026#endif
1027
1028#if GFX_VER == 11
1029   iris_emit_reg(batch, GENX(TCCNTLREG), reg) {
1030      reg.L3DataPartialWriteMergingEnable = true;
1031      reg.ColorZPartialWriteMergingEnable = true;
1032      reg.URBPartialWriteMergingEnable = true;
1033      reg.TCDisable = true;
1034   }
1035
1036   /* Hardware specification recommends disabling repacking for the
1037    * compatibility with decompression mechanism in display controller.
1038    */
1039   if (devinfo->disable_ccs_repack) {
1040      iris_emit_reg(batch, GENX(CACHE_MODE_0), reg) {
1041         reg.DisableRepackingforCompression = true;
1042         reg.DisableRepackingforCompressionMask = true;
1043      }
1044   }
1045
1046   gfx11_upload_pixel_hashing_tables(batch);
1047#endif
1048
1049#if GFX_VERx10 == 120
1050   gfx12_upload_pixel_hashing_tables(batch);
1051#endif
1052
1053   /* 3DSTATE_DRAWING_RECTANGLE is non-pipelined, so we want to avoid
1054    * changing it dynamically.  We set it to the maximum size here, and
1055    * instead include the render target dimensions in the viewport, so
1056    * viewport extents clipping takes care of pruning stray geometry.
1057    */
1058   iris_emit_cmd(batch, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
1059      rect.ClippedDrawingRectangleXMax = UINT16_MAX;
1060      rect.ClippedDrawingRectangleYMax = UINT16_MAX;
1061   }
1062
1063   /* Set the initial MSAA sample positions. */
1064   iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_PATTERN), pat) {
1065      INTEL_SAMPLE_POS_1X(pat._1xSample);
1066      INTEL_SAMPLE_POS_2X(pat._2xSample);
1067      INTEL_SAMPLE_POS_4X(pat._4xSample);
1068      INTEL_SAMPLE_POS_8X(pat._8xSample);
1069#if GFX_VER >= 9
1070      INTEL_SAMPLE_POS_16X(pat._16xSample);
1071#endif
1072   }
1073
1074   /* Use the legacy AA line coverage computation. */
1075   iris_emit_cmd(batch, GENX(3DSTATE_AA_LINE_PARAMETERS), foo);
1076
1077   /* Disable chromakeying (it's for media) */
1078   iris_emit_cmd(batch, GENX(3DSTATE_WM_CHROMAKEY), foo);
1079
1080   /* We want regular rendering, not special HiZ operations. */
1081   iris_emit_cmd(batch, GENX(3DSTATE_WM_HZ_OP), foo);
1082
1083   /* No polygon stippling offsets are necessary. */
1084   /* TODO: may need to set an offset for origin-UL framebuffers */
1085   iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_OFFSET), foo);
1086
1087   iris_alloc_push_constants(batch);
1088
1089
1090#if GFX_VER >= 12
1091   init_aux_map_state(batch);
1092#endif
1093
1094   iris_batch_sync_region_end(batch);
1095}
1096
1097static void
1098iris_init_compute_context(struct iris_batch *batch)
1099{
1100   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
1101
1102   iris_batch_sync_region_start(batch);
1103
1104   /* Wa_1607854226:
1105    *
1106    *  Start with pipeline in 3D mode to set the STATE_BASE_ADDRESS.
1107    */
1108#if GFX_VER == 12
1109   emit_pipeline_select(batch, _3D);
1110#else
1111   emit_pipeline_select(batch, GPGPU);
1112#endif
1113
1114   iris_emit_l3_config(batch, batch->screen->l3_config_cs);
1115
1116   init_state_base_address(batch);
1117
1118   iris_init_common_context(batch);
1119
1120#if GFX_VER == 12
1121   emit_pipeline_select(batch, GPGPU);
1122#endif
1123
1124#if GFX_VER == 9
1125   if (devinfo->is_geminilake)
1126      init_glk_barrier_mode(batch, GLK_BARRIER_MODE_GPGPU);
1127#endif
1128
1129#if GFX_VER >= 12
1130   init_aux_map_state(batch);
1131#endif
1132
1133   iris_batch_sync_region_end(batch);
1134}
1135
1136struct iris_vertex_buffer_state {
1137   /** The VERTEX_BUFFER_STATE hardware structure. */
1138   uint32_t state[GENX(VERTEX_BUFFER_STATE_length)];
1139
1140   /** The resource to source vertex data from. */
1141   struct pipe_resource *resource;
1142
1143   int offset;
1144};
1145
/**
 * Storage for the depth, stencil, HiZ and clear-params hardware packets.
 */
struct iris_depth_buffer_state {
   /* Sized to hold one of each of the four packets, in dwords. */
   uint32_t packets[GENX(3DSTATE_DEPTH_BUFFER_length) +
                    GENX(3DSTATE_STENCIL_BUFFER_length) +
                    GENX(3DSTATE_HIER_DEPTH_BUFFER_length) +
                    GENX(3DSTATE_CLEAR_PARAMS_length)];
};
1153
#if GFX_VERx10 == 120
/* Depth register mode values stored in iris_genx_state::depth_reg_mode. */
enum iris_depth_reg_mode {
   IRIS_DEPTH_REG_MODE_HW_DEFAULT = 0,
   IRIS_DEPTH_REG_MODE_D16 = 1,
   IRIS_DEPTH_REG_MODE_UNKNOWN = 2,
};
#endif
1161
1162/**
1163 * Generation-specific context state (ice->state.genx->...).
1164 *
1165 * Most state can go in iris_context directly, but these encode hardware
1166 * packets which vary by generation.
1167 */
1168struct iris_genx_state {
1169   struct iris_vertex_buffer_state vertex_buffers[33];
1170   uint32_t last_index_buffer[GENX(3DSTATE_INDEX_BUFFER_length)];
1171
1172   struct iris_depth_buffer_state depth_buffer;
1173
1174   uint32_t so_buffers[4 * GENX(3DSTATE_SO_BUFFER_length)];
1175
1176#if GFX_VER == 8
1177   bool pma_fix_enabled;
1178#endif
1179
1180#if GFX_VER == 9
1181   /* Is object level preemption enabled? */
1182   bool object_preemption;
1183#endif
1184
1185#if GFX_VERx10 == 120
1186   enum iris_depth_reg_mode depth_reg_mode;
1187#endif
1188
1189   struct {
1190#if GFX_VER == 8
1191      struct brw_image_param image_param[PIPE_MAX_SHADER_IMAGES];
1192#endif
1193   } shaders[MESA_SHADER_STAGES];
1194};
1195
1196/**
1197 * The pipe->set_blend_color() driver hook.
1198 *
1199 * This corresponds to our COLOR_CALC_STATE.
1200 */
1201static void
1202iris_set_blend_color(struct pipe_context *ctx,
1203                     const struct pipe_blend_color *state)
1204{
1205   struct iris_context *ice = (struct iris_context *) ctx;
1206
1207   /* Our COLOR_CALC_STATE is exactly pipe_blend_color, so just memcpy */
1208   memcpy(&ice->state.blend_color, state, sizeof(struct pipe_blend_color));
1209   ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
1210}
1211
1212/**
1213 * Gallium CSO for blend state (see pipe_blend_state).
1214 */
1215struct iris_blend_state {
1216   /** Partial 3DSTATE_PS_BLEND */
1217   uint32_t ps_blend[GENX(3DSTATE_PS_BLEND_length)];
1218
1219   /** Partial BLEND_STATE */
1220   uint32_t blend_state[GENX(BLEND_STATE_length) +
1221                        BRW_MAX_DRAW_BUFFERS * GENX(BLEND_STATE_ENTRY_length)];
1222
1223   bool alpha_to_coverage; /* for shader key */
1224
1225   /** Bitfield of whether blending is enabled for RT[i] - for aux resolves */
1226   uint8_t blend_enables;
1227
1228   /** Bitfield of whether color writes are enabled for RT[i] */
1229   uint8_t color_write_enables;
1230
1231   /** Does RT[0] use dual color blending? */
1232   bool dual_color_blending;
1233};
1234
1235static enum pipe_blendfactor
1236fix_blendfactor(enum pipe_blendfactor f, bool alpha_to_one)
1237{
1238   if (alpha_to_one) {
1239      if (f == PIPE_BLENDFACTOR_SRC1_ALPHA)
1240         return PIPE_BLENDFACTOR_ONE;
1241
1242      if (f == PIPE_BLENDFACTOR_INV_SRC1_ALPHA)
1243         return PIPE_BLENDFACTOR_ZERO;
1244   }
1245
1246   return f;
1247}
1248
1249/**
1250 * The pipe->create_blend_state() driver hook.
1251 *
1252 * Translates a pipe_blend_state into iris_blend_state.
1253 */
1254static void *
1255iris_create_blend_state(struct pipe_context *ctx,
1256                        const struct pipe_blend_state *state)
1257{
1258   struct iris_blend_state *cso = malloc(sizeof(struct iris_blend_state));
1259   uint32_t *blend_entry = cso->blend_state + GENX(BLEND_STATE_length);
1260
1261   cso->blend_enables = 0;
1262   cso->color_write_enables = 0;
1263   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS <= 8);
1264
1265   cso->alpha_to_coverage = state->alpha_to_coverage;
1266
1267   bool indep_alpha_blend = false;
1268
1269   for (int i = 0; i < BRW_MAX_DRAW_BUFFERS; i++) {
1270      const struct pipe_rt_blend_state *rt =
1271         &state->rt[state->independent_blend_enable ? i : 0];
1272
1273      enum pipe_blendfactor src_rgb =
1274         fix_blendfactor(rt->rgb_src_factor, state->alpha_to_one);
1275      enum pipe_blendfactor src_alpha =
1276         fix_blendfactor(rt->alpha_src_factor, state->alpha_to_one);
1277      enum pipe_blendfactor dst_rgb =
1278         fix_blendfactor(rt->rgb_dst_factor, state->alpha_to_one);
1279      enum pipe_blendfactor dst_alpha =
1280         fix_blendfactor(rt->alpha_dst_factor, state->alpha_to_one);
1281
1282      if (rt->rgb_func != rt->alpha_func ||
1283          src_rgb != src_alpha || dst_rgb != dst_alpha)
1284         indep_alpha_blend = true;
1285
1286      if (rt->blend_enable)
1287         cso->blend_enables |= 1u << i;
1288
1289      if (rt->colormask)
1290         cso->color_write_enables |= 1u << i;
1291
1292      iris_pack_state(GENX(BLEND_STATE_ENTRY), blend_entry, be) {
1293         be.LogicOpEnable = state->logicop_enable;
1294         be.LogicOpFunction = state->logicop_func;
1295
1296         be.PreBlendSourceOnlyClampEnable = false;
1297         be.ColorClampRange = COLORCLAMP_RTFORMAT;
1298         be.PreBlendColorClampEnable = true;
1299         be.PostBlendColorClampEnable = true;
1300
1301         be.ColorBufferBlendEnable = rt->blend_enable;
1302
1303         be.ColorBlendFunction          = rt->rgb_func;
1304         be.AlphaBlendFunction          = rt->alpha_func;
1305
1306         /* The casts prevent warnings about implicit enum type conversions. */
1307         be.SourceBlendFactor           = (int) src_rgb;
1308         be.SourceAlphaBlendFactor      = (int) src_alpha;
1309         be.DestinationBlendFactor      = (int) dst_rgb;
1310         be.DestinationAlphaBlendFactor = (int) dst_alpha;
1311
1312         be.WriteDisableRed   = !(rt->colormask & PIPE_MASK_R);
1313         be.WriteDisableGreen = !(rt->colormask & PIPE_MASK_G);
1314         be.WriteDisableBlue  = !(rt->colormask & PIPE_MASK_B);
1315         be.WriteDisableAlpha = !(rt->colormask & PIPE_MASK_A);
1316      }
1317      blend_entry += GENX(BLEND_STATE_ENTRY_length);
1318   }
1319
1320   iris_pack_command(GENX(3DSTATE_PS_BLEND), cso->ps_blend, pb) {
1321      /* pb.HasWriteableRT is filled in at draw time.
1322       * pb.AlphaTestEnable is filled in at draw time.
1323       *
1324       * pb.ColorBufferBlendEnable is filled in at draw time so we can avoid
1325       * setting it when dual color blending without an appropriate shader.
1326       */
1327
1328      pb.AlphaToCoverageEnable = state->alpha_to_coverage;
1329      pb.IndependentAlphaBlendEnable = indep_alpha_blend;
1330
1331      /* The casts prevent warnings about implicit enum type conversions. */
1332      pb.SourceBlendFactor =
1333         (int) fix_blendfactor(state->rt[0].rgb_src_factor, state->alpha_to_one);
1334      pb.SourceAlphaBlendFactor =
1335         (int) fix_blendfactor(state->rt[0].alpha_src_factor, state->alpha_to_one);
1336      pb.DestinationBlendFactor =
1337         (int) fix_blendfactor(state->rt[0].rgb_dst_factor, state->alpha_to_one);
1338      pb.DestinationAlphaBlendFactor =
1339         (int) fix_blendfactor(state->rt[0].alpha_dst_factor, state->alpha_to_one);
1340   }
1341
1342   iris_pack_state(GENX(BLEND_STATE), cso->blend_state, bs) {
1343      bs.AlphaToCoverageEnable = state->alpha_to_coverage;
1344      bs.IndependentAlphaBlendEnable = indep_alpha_blend;
1345      bs.AlphaToOneEnable = state->alpha_to_one;
1346      bs.AlphaToCoverageDitherEnable = state->alpha_to_coverage;
1347      bs.ColorDitherEnable = state->dither;
1348      /* bl.AlphaTestEnable and bs.AlphaTestFunction are filled in later. */
1349   }
1350
1351   cso->dual_color_blending = util_blend_state_is_dual(state, 0);
1352
1353   return cso;
1354}
1355
1356/**
1357 * The pipe->bind_blend_state() driver hook.
1358 *
1359 * Bind a blending CSO and flag related dirty bits.
1360 */
1361static void
1362iris_bind_blend_state(struct pipe_context *ctx, void *state)
1363{
1364   struct iris_context *ice = (struct iris_context *) ctx;
1365   struct iris_blend_state *cso = state;
1366
1367   ice->state.cso_blend = cso;
1368
1369   ice->state.dirty |= IRIS_DIRTY_PS_BLEND;
1370   ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
1371   ice->state.stage_dirty |= ice->state.stage_dirty_for_nos[IRIS_NOS_BLEND];
1372
1373   if (GFX_VER == 8)
1374      ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
1375}
1376
1377/**
1378 * Return true if the FS writes to any color outputs which are not disabled
1379 * via color masking.
1380 */
1381static bool
1382has_writeable_rt(const struct iris_blend_state *cso_blend,
1383                 const struct shader_info *fs_info)
1384{
1385   if (!fs_info)
1386      return false;
1387
1388   unsigned rt_outputs = fs_info->outputs_written >> FRAG_RESULT_DATA0;
1389
1390   if (fs_info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR))
1391      rt_outputs = (1 << BRW_MAX_DRAW_BUFFERS) - 1;
1392
1393   return cso_blend->color_write_enables & rt_outputs;
1394}
1395
/**
 * Driver-side CSO for depth, stencil, and alpha test state.
 */
struct iris_depth_stencil_alpha_state {
   /** 3DSTATE_WM_DEPTH_STENCIL, partially packed; the stencil reference
    *  values are merged in later.
    */
   uint32_t wmds[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];

#if GFX_VER >= 12
   /** Packed 3DSTATE_DEPTH_BOUNDS. */
   uint32_t depth_bounds[GENX(3DSTATE_DEPTH_BOUNDS_length)];
#endif

   /** Alpha test state, consumed by BLEND_STATE, 3DSTATE_PS_BLEND and
    *  COLOR_CALC_STATE.
    */
   unsigned alpha_enabled:1;
   unsigned alpha_func:3;     /**< PIPE_FUNC_x */
   float alpha_ref_value;     /**< reference value */

   /** Write enables, consumed by resolve and cache set tracking. */
   bool depth_writes_enabled;
   bool stencil_writes_enabled;

   /** Depth test enable, consumed by the Gfx8-9 PMA stall equations. */
   bool depth_test_enabled;
};
1419
1420/**
1421 * The pipe->create_depth_stencil_alpha_state() driver hook.
1422 *
1423 * We encode most of 3DSTATE_WM_DEPTH_STENCIL, and just save off the alpha
1424 * testing state since we need pieces of it in a variety of places.
1425 */
1426static void *
1427iris_create_zsa_state(struct pipe_context *ctx,
1428                      const struct pipe_depth_stencil_alpha_state *state)
1429{
1430   struct iris_depth_stencil_alpha_state *cso =
1431      malloc(sizeof(struct iris_depth_stencil_alpha_state));
1432
1433   bool two_sided_stencil = state->stencil[1].enabled;
1434
1435   cso->alpha_enabled = state->alpha_enabled;
1436   cso->alpha_func = state->alpha_func;
1437   cso->alpha_ref_value = state->alpha_ref_value;
1438   cso->depth_writes_enabled = state->depth_writemask;
1439   cso->depth_test_enabled = state->depth_enabled;
1440   cso->stencil_writes_enabled =
1441      state->stencil[0].writemask != 0 ||
1442      (two_sided_stencil && state->stencil[1].writemask != 0);
1443
1444   /* gallium frontends need to optimize away EQUAL writes for us. */
1445   assert(!(state->depth_func == PIPE_FUNC_EQUAL && state->depth_writemask));
1446
1447   iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), cso->wmds, wmds) {
1448      wmds.StencilFailOp = state->stencil[0].fail_op;
1449      wmds.StencilPassDepthFailOp = state->stencil[0].zfail_op;
1450      wmds.StencilPassDepthPassOp = state->stencil[0].zpass_op;
1451      wmds.StencilTestFunction =
1452         translate_compare_func(state->stencil[0].func);
1453      wmds.BackfaceStencilFailOp = state->stencil[1].fail_op;
1454      wmds.BackfaceStencilPassDepthFailOp = state->stencil[1].zfail_op;
1455      wmds.BackfaceStencilPassDepthPassOp = state->stencil[1].zpass_op;
1456      wmds.BackfaceStencilTestFunction =
1457         translate_compare_func(state->stencil[1].func);
1458      wmds.DepthTestFunction = translate_compare_func(state->depth_func);
1459      wmds.DoubleSidedStencilEnable = two_sided_stencil;
1460      wmds.StencilTestEnable = state->stencil[0].enabled;
1461      wmds.StencilBufferWriteEnable =
1462         state->stencil[0].writemask != 0 ||
1463         (two_sided_stencil && state->stencil[1].writemask != 0);
1464      wmds.DepthTestEnable = state->depth_enabled;
1465      wmds.DepthBufferWriteEnable = state->depth_writemask;
1466      wmds.StencilTestMask = state->stencil[0].valuemask;
1467      wmds.StencilWriteMask = state->stencil[0].writemask;
1468      wmds.BackfaceStencilTestMask = state->stencil[1].valuemask;
1469      wmds.BackfaceStencilWriteMask = state->stencil[1].writemask;
1470      /* wmds.[Backface]StencilReferenceValue are merged later */
1471#if GFX_VER >= 12
1472      wmds.StencilReferenceValueModifyDisable = true;
1473#endif
1474   }
1475
1476#if GFX_VER >= 12
1477   iris_pack_command(GENX(3DSTATE_DEPTH_BOUNDS), cso->depth_bounds, depth_bounds) {
1478      depth_bounds.DepthBoundsTestValueModifyDisable = false;
1479      depth_bounds.DepthBoundsTestEnableModifyDisable = false;
1480      depth_bounds.DepthBoundsTestEnable = state->depth_bounds_test;
1481      depth_bounds.DepthBoundsTestMinValue = state->depth_bounds_min;
1482      depth_bounds.DepthBoundsTestMaxValue = state->depth_bounds_max;
1483   }
1484#endif
1485
1486   return cso;
1487}
1488
1489/**
1490 * The pipe->bind_depth_stencil_alpha_state() driver hook.
1491 *
1492 * Bind a depth/stencil/alpha CSO and flag related dirty bits.
1493 */
1494static void
1495iris_bind_zsa_state(struct pipe_context *ctx, void *state)
1496{
1497   struct iris_context *ice = (struct iris_context *) ctx;
1498   struct iris_depth_stencil_alpha_state *old_cso = ice->state.cso_zsa;
1499   struct iris_depth_stencil_alpha_state *new_cso = state;
1500
1501   if (new_cso) {
1502      if (cso_changed(alpha_ref_value))
1503         ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
1504
1505      if (cso_changed(alpha_enabled))
1506         ice->state.dirty |= IRIS_DIRTY_PS_BLEND | IRIS_DIRTY_BLEND_STATE;
1507
1508      if (cso_changed(alpha_func))
1509         ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
1510
1511      if (cso_changed(depth_writes_enabled) || cso_changed(stencil_writes_enabled))
1512         ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
1513
1514      ice->state.depth_writes_enabled = new_cso->depth_writes_enabled;
1515      ice->state.stencil_writes_enabled = new_cso->stencil_writes_enabled;
1516
1517#if GFX_VER >= 12
1518      if (cso_changed(depth_bounds))
1519         ice->state.dirty |= IRIS_DIRTY_DEPTH_BOUNDS;
1520#endif
1521   }
1522
1523   ice->state.cso_zsa = new_cso;
1524   ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
1525   ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
1526   ice->state.stage_dirty |=
1527      ice->state.stage_dirty_for_nos[IRIS_NOS_DEPTH_STENCIL_ALPHA];
1528
1529   if (GFX_VER == 8)
1530      ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
1531}
1532
#if GFX_VER == 8
/**
 * Evaluate the Gfx8 depth PMA fix equation against the currently bound
 * framebuffer, fragment shader, blend CSO and depth/stencil/alpha CSO.
 *
 * Returns true when the equation documented in the body holds.
 */
static bool
want_pma_fix(struct iris_context *ice)
{
   UNUSED struct iris_screen *screen = (void *) ice->ctx.screen;
   UNUSED const struct intel_device_info *devinfo = &screen->devinfo;
   const struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
   const struct iris_blend_state *cso_blend = ice->state.cso_blend;
   const struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
   const struct brw_wm_prog_data *wm_prog_data = (void *)
      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;

   /* For very specific combinations of state, Gfx8-9 hardware can be told
    * to avoid stalling at the pixel mask array.  The state equations are
    * documented in these places:
    *
    * - Gfx8 Depth PMA Fix:   CACHE_MODE_1::NP_PMA_FIX_ENABLE
    * - Gfx9 Stencil PMA Fix: CACHE_MODE_0::STC PMA Optimization Enable
    *
    * Both equations share some common elements:
    *
    *    no_hiz_op =
    *       !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *         3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *         3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *         3DSTATE_WM_HZ_OP::StencilBufferClear)
    *
    *    killpixels =
    *       3DSTATE_WM::ForceKillPix != ForceOff &&
    *       (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *        3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *        3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *        3DSTATE_PS_BLEND::AlphaTestEnable ||
    *        3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    *
    *    (Technically the stencil PMA treats ForceKillPix differently,
    *     but I think this is a documentation oversight, and we don't
    *     ever use it in this way, so it doesn't matter).
    *
    *    common_pma_fix =
    *       3DSTATE_WM::ForceThreadDispatch != 1 &&
    *       3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0 &&
    *       3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    *       3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    *       3DSTATE_WM::EDSC_Mode != EDSC_PREPS &&
    *       3DSTATE_PS_EXTRA::PixelShaderValid &&
    *       no_hiz_op
    *
    * These are always true:
    *
    *    3DSTATE_RASTER::ForceSampleCount == NUMRASTSAMPLES_0
    *    3DSTATE_PS_EXTRA::PixelShaderValid
    *
    * HiZ ops never go through the normal drawing path, so this is also
    * always true:
    *
    *    !(3DSTATE_WM_HZ_OP::DepthBufferClear ||
    *      3DSTATE_WM_HZ_OP::DepthBufferResolve ||
    *      3DSTATE_WM_HZ_OP::Hierarchical Depth Buffer Resolve Enable ||
    *      3DSTATE_WM_HZ_OP::StencilBufferClear)
    *
    * This happens sometimes:
    *
    *    3DSTATE_WM::ForceThreadDispatch != 1
    *
    * However, we choose to ignore it as it either agrees with the signal
    * (dispatch was already enabled, so nothing out of the ordinary), or
    * there are no framebuffer attachments (so no depth or HiZ anyway,
    * meaning the PMA signal will already be disabled).
    */

   /* No depth/stencil attachment at all. */
   if (!cso_fb->zsbuf)
      return false;

   struct iris_resource *zres, *sres;
   iris_get_depth_stencil_resources(cso_fb->zsbuf->texture, &zres, &sres);

   /* 3DSTATE_DEPTH_BUFFER::SURFACE_TYPE != NULL &&
    * 3DSTATE_DEPTH_BUFFER::HIZ Enable &&
    */
   if (!zres)
      return false;
   if (!iris_resource_level_has_hiz(zres, cso_fb->zsbuf->u.tex.level))
      return false;

   /* 3DSTATE_WM::EDSC_Mode != EDSC_PREPS */
   if (wm_prog_data->early_fragment_tests)
      return false;

   /* 3DSTATE_WM::ForceKillPix != ForceOff &&
    * (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *  3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *  3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *  3DSTATE_PS_BLEND::AlphaTestEnable ||
    *  3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    */
   bool killpixels = wm_prog_data->uses_kill || wm_prog_data->uses_omask ||
                     cso_blend->alpha_to_coverage || cso_zsa->alpha_enabled;

   /* The Gfx8 depth PMA equation becomes:
    *
    *    depth_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::DepthWriteEnable &&
    *       3DSTATE_DEPTH_BUFFER::DEPTH_WRITE_ENABLE
    *
    *    stencil_writes =
    *       3DSTATE_WM_DEPTH_STENCIL::Stencil Buffer Write Enable &&
    *       3DSTATE_DEPTH_BUFFER::STENCIL_WRITE_ENABLE &&
    *       3DSTATE_STENCIL_BUFFER::STENCIL_BUFFER_ENABLE
    *
    *    Z_PMA_OPT =
    *       common_pma_fix &&
    *       3DSTATE_WM_DEPTH_STENCIL::DepthTestEnable &&
    *       ((killpixels && (depth_writes || stencil_writes)) ||
    *        3DSTATE_PS_EXTRA::PixelShaderComputedDepthMode != PSCDEPTH_OFF)
    *
    */
   if (!cso_zsa->depth_test_enabled)
      return false;

   const bool computes_depth =
      wm_prog_data->computed_depth_mode != PSCDEPTH_OFF;
   const bool depth_writes = cso_zsa->depth_writes_enabled;
   const bool stencil_writes = sres && cso_zsa->stencil_writes_enabled;

   return computes_depth || (killpixels && (depth_writes || stencil_writes));
}
#endif
1655
1656void
1657genX(update_pma_fix)(struct iris_context *ice,
1658                     struct iris_batch *batch,
1659                     bool enable)
1660{
1661#if GFX_VER == 8
1662   struct iris_genx_state *genx = ice->state.genx;
1663
1664   if (genx->pma_fix_enabled == enable)
1665      return;
1666
1667   genx->pma_fix_enabled = enable;
1668
1669   /* According to the Broadwell PIPE_CONTROL documentation, software should
1670    * emit a PIPE_CONTROL with the CS Stall and Depth Cache Flush bits set
1671    * prior to the LRI.  If stencil buffer writes are enabled, then a Render        * Cache Flush is also necessary.
1672    *
1673    * The Gfx9 docs say to use a depth stall rather than a command streamer
1674    * stall.  However, the hardware seems to violently disagree.  A full
1675    * command streamer stall seems to be needed in both cases.
1676    */
1677   iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1678                                PIPE_CONTROL_CS_STALL |
1679                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1680                                PIPE_CONTROL_RENDER_TARGET_FLUSH);
1681
1682   iris_emit_reg(batch, GENX(CACHE_MODE_1), reg) {
1683      reg.NPPMAFixEnable = enable;
1684      reg.NPEarlyZFailsDisable = enable;
1685      reg.NPPMAFixEnableMask = true;
1686      reg.NPEarlyZFailsDisableMask = true;
1687   }
1688
1689   /* After the LRI, a PIPE_CONTROL with both the Depth Stall and Depth Cache
1690    * Flush bits is often necessary.  We do it regardless because it's easier.
1691    * The render cache flush is also necessary if stencil writes are enabled.
1692    *
1693    * Again, the Gfx9 docs give a different set of flushes but the Broadwell
1694    * flushes seem to work just as well.
1695    */
1696   iris_emit_pipe_control_flush(batch, "PMA fix change (1/2)",
1697                                PIPE_CONTROL_DEPTH_STALL |
1698                                PIPE_CONTROL_DEPTH_CACHE_FLUSH |
1699                                PIPE_CONTROL_RENDER_TARGET_FLUSH);
1700#endif
1701}
1702
/**
 * Gallium CSO for rasterizer state.
 *
 * Built once by iris_create_rasterizer_state(): the dword arrays hold
 * pre-packed hardware commands, and the scalar fields are copies of (or
 * values derived from) the pipe_rasterizer_state that other state code
 * needs to consult or compare when the CSO is bound.
 */
struct iris_rasterizer_state {
   /* Packed 3DSTATE_* commands, one array per command. */
   uint32_t sf[GENX(3DSTATE_SF_length)];
   uint32_t clip[GENX(3DSTATE_CLIP_length)];
   uint32_t raster[GENX(3DSTATE_RASTER_length)];
   uint32_t wm[GENX(3DSTATE_WM_length)];
   uint32_t line_stipple[GENX(3DSTATE_LINE_STIPPLE_length)];

   /* Index of the highest enabled user clip plane plus one; 0 if none. */
   uint8_t num_clip_plane_consts;
   bool clip_halfz; /* for CC_VIEWPORT */
   bool depth_clip_near; /* for CC_VIEWPORT */
   bool depth_clip_far; /* for CC_VIEWPORT */
   bool flatshade; /* for shader state */
   bool flatshade_first; /* for stream output */
   bool clamp_fragment_color; /* for shader state */
   bool light_twoside; /* for shader state */
   bool rasterizer_discard; /* for 3DSTATE_STREAMOUT and 3DSTATE_CLIP */
   bool half_pixel_center; /* for 3DSTATE_MULTISAMPLE */
   bool line_stipple_enable; /* a change flags IRIS_DIRTY_WM at bind time */
   bool poly_stipple_enable; /* a change flags IRIS_DIRTY_WM at bind time */
   bool multisample; /* copy of pipe_rasterizer_state::multisample */
   bool force_persample_interp; /* copy of the pipe state field */
   /* True when conservative_raster_mode is PIPE_CONSERVATIVE_RASTER_POST_SNAP;
    * a change flags the fragment shader stage dirty at bind time.
    */
   bool conservative_rasterization;
   bool fill_mode_point; /* front or back fill mode is POINT */
   bool fill_mode_line; /* front or back fill mode is LINE */
   bool fill_mode_point_or_line; /* either of the two flags above */
   enum pipe_sprite_coord_mode sprite_coord_mode; /* PIPE_SPRITE_* */
   uint16_t sprite_coord_enable; /* a change flags IRIS_DIRTY_SBE at bind time */
};
1734
1735static float
1736get_line_width(const struct pipe_rasterizer_state *state)
1737{
1738   float line_width = state->line_width;
1739
1740   /* From the OpenGL 4.4 spec:
1741    *
1742    * "The actual width of non-antialiased lines is determined by rounding
1743    *  the supplied width to the nearest integer, then clamping it to the
1744    *  implementation-dependent maximum non-antialiased line width."
1745    */
1746   if (!state->multisample && !state->line_smooth)
1747      line_width = roundf(state->line_width);
1748
1749   if (!state->multisample && state->line_smooth && line_width < 1.5f) {
1750      /* For 1 pixel line thickness or less, the general anti-aliasing
1751       * algorithm gives up, and a garbage line is generated.  Setting a
1752       * Line Width of 0.0 specifies the rasterization of the "thinnest"
1753       * (one-pixel-wide), non-antialiased lines.
1754       *
1755       * Lines rendered with zero Line Width are rasterized using the
1756       * "Grid Intersection Quantization" rules as specified by the
1757       * "Zero-Width (Cosmetic) Line Rasterization" section of the docs.
1758       */
1759      line_width = 0.0f;
1760   }
1761
1762   return line_width;
1763}
1764
1765/**
1766 * The pipe->create_rasterizer_state() driver hook.
1767 */
1768static void *
1769iris_create_rasterizer_state(struct pipe_context *ctx,
1770                             const struct pipe_rasterizer_state *state)
1771{
1772   struct iris_rasterizer_state *cso =
1773      malloc(sizeof(struct iris_rasterizer_state));
1774
1775   cso->multisample = state->multisample;
1776   cso->force_persample_interp = state->force_persample_interp;
1777   cso->clip_halfz = state->clip_halfz;
1778   cso->depth_clip_near = state->depth_clip_near;
1779   cso->depth_clip_far = state->depth_clip_far;
1780   cso->flatshade = state->flatshade;
1781   cso->flatshade_first = state->flatshade_first;
1782   cso->clamp_fragment_color = state->clamp_fragment_color;
1783   cso->light_twoside = state->light_twoside;
1784   cso->rasterizer_discard = state->rasterizer_discard;
1785   cso->half_pixel_center = state->half_pixel_center;
1786   cso->sprite_coord_mode = state->sprite_coord_mode;
1787   cso->sprite_coord_enable = state->sprite_coord_enable;
1788   cso->line_stipple_enable = state->line_stipple_enable;
1789   cso->poly_stipple_enable = state->poly_stipple_enable;
1790   cso->conservative_rasterization =
1791      state->conservative_raster_mode == PIPE_CONSERVATIVE_RASTER_POST_SNAP;
1792
1793   cso->fill_mode_point =
1794      state->fill_front == PIPE_POLYGON_MODE_POINT ||
1795      state->fill_back == PIPE_POLYGON_MODE_POINT;
1796   cso->fill_mode_line =
1797      state->fill_front == PIPE_POLYGON_MODE_LINE ||
1798      state->fill_back == PIPE_POLYGON_MODE_LINE;
1799   cso->fill_mode_point_or_line =
1800      cso->fill_mode_point ||
1801      cso->fill_mode_line;
1802
1803   if (state->clip_plane_enable != 0)
1804      cso->num_clip_plane_consts = util_logbase2(state->clip_plane_enable) + 1;
1805   else
1806      cso->num_clip_plane_consts = 0;
1807
1808   float line_width = get_line_width(state);
1809
1810   iris_pack_command(GENX(3DSTATE_SF), cso->sf, sf) {
1811      sf.StatisticsEnable = true;
1812      sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
1813      sf.LineEndCapAntialiasingRegionWidth =
1814         state->line_smooth ? _10pixels : _05pixels;
1815      sf.LastPixelEnable = state->line_last_pixel;
1816      sf.LineWidth = line_width;
1817      sf.SmoothPointEnable = (state->point_smooth || state->multisample) &&
1818                             !state->point_quad_rasterization;
1819      sf.PointWidthSource = state->point_size_per_vertex ? Vertex : State;
1820      sf.PointWidth = CLAMP(state->point_size, 0.125f, 255.875f);
1821
1822      if (state->flatshade_first) {
1823         sf.TriangleFanProvokingVertexSelect = 1;
1824      } else {
1825         sf.TriangleStripListProvokingVertexSelect = 2;
1826         sf.TriangleFanProvokingVertexSelect = 2;
1827         sf.LineStripListProvokingVertexSelect = 1;
1828      }
1829   }
1830
1831   iris_pack_command(GENX(3DSTATE_RASTER), cso->raster, rr) {
1832      rr.FrontWinding = state->front_ccw ? CounterClockwise : Clockwise;
1833      rr.CullMode = translate_cull_mode(state->cull_face);
1834      rr.FrontFaceFillMode = translate_fill_mode(state->fill_front);
1835      rr.BackFaceFillMode = translate_fill_mode(state->fill_back);
1836      rr.DXMultisampleRasterizationEnable = state->multisample;
1837      rr.GlobalDepthOffsetEnableSolid = state->offset_tri;
1838      rr.GlobalDepthOffsetEnableWireframe = state->offset_line;
1839      rr.GlobalDepthOffsetEnablePoint = state->offset_point;
1840      rr.GlobalDepthOffsetConstant = state->offset_units * 2;
1841      rr.GlobalDepthOffsetScale = state->offset_scale;
1842      rr.GlobalDepthOffsetClamp = state->offset_clamp;
1843      rr.SmoothPointEnable = state->point_smooth;
1844      rr.AntialiasingEnable = state->line_smooth;
1845      rr.ScissorRectangleEnable = state->scissor;
1846#if GFX_VER >= 9
1847      rr.ViewportZNearClipTestEnable = state->depth_clip_near;
1848      rr.ViewportZFarClipTestEnable = state->depth_clip_far;
1849      rr.ConservativeRasterizationEnable =
1850         cso->conservative_rasterization;
1851#else
1852      rr.ViewportZClipTestEnable = (state->depth_clip_near || state->depth_clip_far);
1853#endif
1854   }
1855
1856   iris_pack_command(GENX(3DSTATE_CLIP), cso->clip, cl) {
1857      /* cl.NonPerspectiveBarycentricEnable is filled in at draw time from
1858       * the FS program; cl.ForceZeroRTAIndexEnable is filled in from the FB.
1859       */
1860      cl.EarlyCullEnable = true;
1861      cl.UserClipDistanceClipTestEnableBitmask = state->clip_plane_enable;
1862      cl.ForceUserClipDistanceClipTestEnableBitmask = true;
1863      cl.APIMode = state->clip_halfz ? APIMODE_D3D : APIMODE_OGL;
1864      cl.GuardbandClipTestEnable = true;
1865      cl.ClipEnable = true;
1866      cl.MinimumPointWidth = 0.125;
1867      cl.MaximumPointWidth = 255.875;
1868
1869      if (state->flatshade_first) {
1870         cl.TriangleFanProvokingVertexSelect = 1;
1871      } else {
1872         cl.TriangleStripListProvokingVertexSelect = 2;
1873         cl.TriangleFanProvokingVertexSelect = 2;
1874         cl.LineStripListProvokingVertexSelect = 1;
1875      }
1876   }
1877
1878   iris_pack_command(GENX(3DSTATE_WM), cso->wm, wm) {
1879      /* wm.BarycentricInterpolationMode and wm.EarlyDepthStencilControl are
1880       * filled in at draw time from the FS program.
1881       */
1882      wm.LineAntialiasingRegionWidth = _10pixels;
1883      wm.LineEndCapAntialiasingRegionWidth = _05pixels;
1884      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1885      wm.LineStippleEnable = state->line_stipple_enable;
1886      wm.PolygonStippleEnable = state->poly_stipple_enable;
1887   }
1888
1889   /* Remap from 0..255 back to 1..256 */
1890   const unsigned line_stipple_factor = state->line_stipple_factor + 1;
1891
1892   iris_pack_command(GENX(3DSTATE_LINE_STIPPLE), cso->line_stipple, line) {
1893      if (state->line_stipple_enable) {
1894         line.LineStipplePattern = state->line_stipple_pattern;
1895         line.LineStippleInverseRepeatCount = 1.0f / line_stipple_factor;
1896         line.LineStippleRepeatCount = line_stipple_factor;
1897      }
1898   }
1899
1900   return cso;
1901}
1902
1903/**
1904 * The pipe->bind_rasterizer_state() driver hook.
1905 *
1906 * Bind a rasterizer CSO and flag related dirty bits.
1907 */
1908static void
1909iris_bind_rasterizer_state(struct pipe_context *ctx, void *state)
1910{
1911   struct iris_context *ice = (struct iris_context *) ctx;
1912   struct iris_rasterizer_state *old_cso = ice->state.cso_rast;
1913   struct iris_rasterizer_state *new_cso = state;
1914
1915   if (new_cso) {
1916      /* Try to avoid re-emitting 3DSTATE_LINE_STIPPLE, it's non-pipelined */
1917      if (cso_changed_memcmp(line_stipple))
1918         ice->state.dirty |= IRIS_DIRTY_LINE_STIPPLE;
1919
1920      if (cso_changed(half_pixel_center))
1921         ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;
1922
1923      if (cso_changed(line_stipple_enable) || cso_changed(poly_stipple_enable))
1924         ice->state.dirty |= IRIS_DIRTY_WM;
1925
1926      if (cso_changed(rasterizer_discard))
1927         ice->state.dirty |= IRIS_DIRTY_STREAMOUT | IRIS_DIRTY_CLIP;
1928
1929      if (cso_changed(flatshade_first))
1930         ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
1931
1932      if (cso_changed(depth_clip_near) || cso_changed(depth_clip_far) ||
1933          cso_changed(clip_halfz))
1934         ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
1935
1936      if (cso_changed(sprite_coord_enable) ||
1937          cso_changed(sprite_coord_mode) ||
1938          cso_changed(light_twoside))
1939         ice->state.dirty |= IRIS_DIRTY_SBE;
1940
1941      if (cso_changed(conservative_rasterization))
1942         ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
1943   }
1944
1945   ice->state.cso_rast = new_cso;
1946   ice->state.dirty |= IRIS_DIRTY_RASTER;
1947   ice->state.dirty |= IRIS_DIRTY_CLIP;
1948   ice->state.stage_dirty |=
1949      ice->state.stage_dirty_for_nos[IRIS_NOS_RASTERIZER];
1950}
1951
1952/**
1953 * Return true if the given wrap mode requires the border color to exist.
1954 *
1955 * (We can skip uploading it if the sampler isn't going to use it.)
1956 */
1957static bool
1958wrap_mode_needs_border_color(unsigned wrap_mode)
1959{
1960   return wrap_mode == TCM_CLAMP_BORDER || wrap_mode == TCM_HALF_BORDER;
1961}
1962
/**
 * Gallium CSO for sampler state.
 *
 * Created by iris_create_sampler_state() and consumed when the sampler
 * table is uploaded.
 */
struct iris_sampler_state {
   /* Copy of pipe_sampler_state::border_color. */
   union pipe_color_union border_color;
   /* True if any of the three wrap modes uses the border color. */
   bool needs_border_color;

   /* Packed SAMPLER_STATE; the border color pointer is merged in at
    * upload time.
    */
   uint32_t sampler_state[GENX(SAMPLER_STATE_length)];
};
1972
1973/**
1974 * The pipe->create_sampler_state() driver hook.
1975 *
1976 * We fill out SAMPLER_STATE (except for the border color pointer), and
1977 * store that on the CPU.  It doesn't make sense to upload it to a GPU
1978 * buffer object yet, because 3DSTATE_SAMPLER_STATE_POINTERS requires
1979 * all bound sampler states to be in contiguous memor.
1980 */
1981static void *
1982iris_create_sampler_state(struct pipe_context *ctx,
1983                          const struct pipe_sampler_state *state)
1984{
1985   struct iris_sampler_state *cso = CALLOC_STRUCT(iris_sampler_state);
1986
1987   if (!cso)
1988      return NULL;
1989
1990   STATIC_ASSERT(PIPE_TEX_FILTER_NEAREST == MAPFILTER_NEAREST);
1991   STATIC_ASSERT(PIPE_TEX_FILTER_LINEAR == MAPFILTER_LINEAR);
1992
1993   unsigned wrap_s = translate_wrap(state->wrap_s);
1994   unsigned wrap_t = translate_wrap(state->wrap_t);
1995   unsigned wrap_r = translate_wrap(state->wrap_r);
1996
1997   memcpy(&cso->border_color, &state->border_color, sizeof(cso->border_color));
1998
1999   cso->needs_border_color = wrap_mode_needs_border_color(wrap_s) ||
2000                             wrap_mode_needs_border_color(wrap_t) ||
2001                             wrap_mode_needs_border_color(wrap_r);
2002
2003   float min_lod = state->min_lod;
2004   unsigned mag_img_filter = state->mag_img_filter;
2005
2006   // XXX: explain this code ported from ilo...I don't get it at all...
2007   if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE &&
2008       state->min_lod > 0.0f) {
2009      min_lod = 0.0f;
2010      mag_img_filter = state->min_img_filter;
2011   }
2012
2013   iris_pack_state(GENX(SAMPLER_STATE), cso->sampler_state, samp) {
2014      samp.TCXAddressControlMode = wrap_s;
2015      samp.TCYAddressControlMode = wrap_t;
2016      samp.TCZAddressControlMode = wrap_r;
2017      samp.CubeSurfaceControlMode = state->seamless_cube_map;
2018      samp.NonnormalizedCoordinateEnable = !state->normalized_coords;
2019      samp.MinModeFilter = state->min_img_filter;
2020      samp.MagModeFilter = mag_img_filter;
2021      samp.MipModeFilter = translate_mip_filter(state->min_mip_filter);
2022      samp.MaximumAnisotropy = RATIO21;
2023
2024      if (state->max_anisotropy >= 2) {
2025         if (state->min_img_filter == PIPE_TEX_FILTER_LINEAR) {
2026            samp.MinModeFilter = MAPFILTER_ANISOTROPIC;
2027            samp.AnisotropicAlgorithm = EWAApproximation;
2028         }
2029
2030         if (state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)
2031            samp.MagModeFilter = MAPFILTER_ANISOTROPIC;
2032
2033         samp.MaximumAnisotropy =
2034            MIN2((state->max_anisotropy - 2) / 2, RATIO161);
2035      }
2036
2037      /* Set address rounding bits if not using nearest filtering. */
2038      if (state->min_img_filter != PIPE_TEX_FILTER_NEAREST) {
2039         samp.UAddressMinFilterRoundingEnable = true;
2040         samp.VAddressMinFilterRoundingEnable = true;
2041         samp.RAddressMinFilterRoundingEnable = true;
2042      }
2043
2044      if (state->mag_img_filter != PIPE_TEX_FILTER_NEAREST) {
2045         samp.UAddressMagFilterRoundingEnable = true;
2046         samp.VAddressMagFilterRoundingEnable = true;
2047         samp.RAddressMagFilterRoundingEnable = true;
2048      }
2049
2050      if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
2051         samp.ShadowFunction = translate_shadow_func(state->compare_func);
2052
2053      const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
2054
2055      samp.LODPreClampMode = CLAMP_MODE_OGL;
2056      samp.MinLOD = CLAMP(min_lod, 0, hw_max_lod);
2057      samp.MaxLOD = CLAMP(state->max_lod, 0, hw_max_lod);
2058      samp.TextureLODBias = CLAMP(state->lod_bias, -16, 15);
2059
2060      /* .BorderColorPointer is filled in by iris_bind_sampler_states. */
2061   }
2062
2063   return cso;
2064}
2065
2066/**
2067 * The pipe->bind_sampler_states() driver hook.
2068 */
2069static void
2070iris_bind_sampler_states(struct pipe_context *ctx,
2071                         enum pipe_shader_type p_stage,
2072                         unsigned start, unsigned count,
2073                         void **states)
2074{
2075   struct iris_context *ice = (struct iris_context *) ctx;
2076   gl_shader_stage stage = stage_from_pipe(p_stage);
2077   struct iris_shader_state *shs = &ice->state.shaders[stage];
2078
2079   assert(start + count <= IRIS_MAX_TEXTURE_SAMPLERS);
2080
2081   bool dirty = false;
2082
2083   for (int i = 0; i < count; i++) {
2084      struct iris_sampler_state *state = states ? states[i] : NULL;
2085      if (shs->samplers[start + i] != state) {
2086         shs->samplers[start + i] = state;
2087         dirty = true;
2088      }
2089   }
2090
2091   if (dirty)
2092      ice->state.stage_dirty |= IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage;
2093}
2094
/**
 * Upload the sampler states into a contiguous area of GPU memory, for
 * 3DSTATE_SAMPLER_STATE_POINTERS_*.
 *
 * Also fill out the border color state pointers.
 *
 * One SAMPLER_STATE is written per texture slot used by the stage's current
 * shader.  Nothing is uploaded if the shader uses no textures or if the
 * upload allocation fails.  The stage's bit in
 * ice->state.need_border_colors is recomputed along the way.
 */
static void
iris_upload_sampler_states(struct iris_context *ice, gl_shader_stage stage)
{
   struct iris_shader_state *shs = &ice->state.shaders[stage];
   const struct shader_info *info = iris_get_shader_info(ice, stage);

   /* We assume gallium frontends will call pipe->bind_sampler_states()
    * if the program's number of textures changes.
    */
   unsigned count = info ? BITSET_LAST_BIT(info->textures_used) : 0;

   if (!count)
      return;

   /* Assemble the SAMPLER_STATEs into a contiguous table that lives
    * in the dynamic state memory zone, so we can point to it via the
    * 3DSTATE_SAMPLER_STATE_POINTERS_* commands.
    */
   unsigned size = count * 4 * GENX(SAMPLER_STATE_length);
   uint32_t *map =
      upload_state(ice->state.dynamic_uploader, &shs->sampler_table, size, 32);
   if (unlikely(!map))
      return;

   struct pipe_resource *res = shs->sampler_table.res;
   struct iris_bo *bo = iris_resource_bo(res);

   /* Record the table's size at its GPU address (done before the offset is
    * rebased below).
    */
   iris_record_state_size(ice->state.sizes,
                          bo->address + shs->sampler_table.offset, size);

   /* Rebase the stored offset using iris_bo_offset_from_base_address(). */
   shs->sampler_table.offset += iris_bo_offset_from_base_address(bo);

   /* Make sure all land in the same BO */
   iris_border_color_pool_reserve(ice, IRIS_MAX_TEXTURE_SAMPLERS);

   /* Clear this stage's bit; it is set again below if any sampler in the
    * table needs a border color.
    */
   ice->state.need_border_colors &= ~(1 << stage);

   for (int i = 0; i < count; i++) {
      struct iris_sampler_state *state = shs->samplers[i];
      struct iris_sampler_view *tex = shs->textures[i];

      if (!state) {
         /* No sampler bound to this slot: emit an all-zero SAMPLER_STATE. */
         memset(map, 0, 4 * GENX(SAMPLER_STATE_length));
      } else if (!state->needs_border_color) {
         /* The pre-packed state can be used verbatim. */
         memcpy(map, state->sampler_state, 4 * GENX(SAMPLER_STATE_length));
      } else {
         ice->state.need_border_colors |= 1 << stage;

         /* We may need to swizzle the border color for format faking.
          * A/LA formats are faked as R/RG with 000R or R00G swizzles.
          * This means we need to move the border color's A channel into
          * the R or G channels so that those read swizzles will move it
          * back into A.
          */
         union pipe_color_union *color = &state->border_color;
         union pipe_color_union tmp;
         if (tex) {
            enum pipe_format internal_format = tex->res->internal_format;

            if (util_format_is_alpha(internal_format)) {
               unsigned char swz[4] = {
                  PIPE_SWIZZLE_W, PIPE_SWIZZLE_0,
                  PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
               };
               util_format_apply_color_swizzle(&tmp, color, swz, true);
               color = &tmp;
            } else if (util_format_is_luminance_alpha(internal_format) &&
                       internal_format != PIPE_FORMAT_L8A8_SRGB) {
               unsigned char swz[4] = {
                  PIPE_SWIZZLE_X, PIPE_SWIZZLE_W,
                  PIPE_SWIZZLE_0, PIPE_SWIZZLE_0
               };
               util_format_apply_color_swizzle(&tmp, color, swz, true);
               color = &tmp;
            }
         }

         /* Stream out the border color and merge the pointer. */
         uint32_t offset = iris_upload_border_color(ice, color);

         /* Pack a SAMPLER_STATE holding only the border color pointer, then
          * OR it dword by dword into the pre-packed state.
          */
         uint32_t dynamic[GENX(SAMPLER_STATE_length)];
         iris_pack_state(GENX(SAMPLER_STATE), dynamic, dyns) {
            dyns.BorderColorPointer = offset;
         }

         for (uint32_t j = 0; j < GENX(SAMPLER_STATE_length); j++)
            map[j] = state->sampler_state[j] | dynamic[j];
      }

      /* Advance to the next SAMPLER_STATE slot in the table. */
      map += GENX(SAMPLER_STATE_length);
   }
}
2193
2194static enum isl_channel_select
2195fmt_swizzle(const struct iris_format_info *fmt, enum pipe_swizzle swz)
2196{
2197   switch (swz) {
2198   case PIPE_SWIZZLE_X: return fmt->swizzle.r;
2199   case PIPE_SWIZZLE_Y: return fmt->swizzle.g;
2200   case PIPE_SWIZZLE_Z: return fmt->swizzle.b;
2201   case PIPE_SWIZZLE_W: return fmt->swizzle.a;
2202   case PIPE_SWIZZLE_1: return ISL_CHANNEL_SELECT_ONE;
2203   case PIPE_SWIZZLE_0: return ISL_CHANNEL_SELECT_ZERO;
2204   default: unreachable("invalid swizzle");
2205   }
2206}
2207
2208static void
2209fill_buffer_surface_state(struct isl_device *isl_dev,
2210                          struct iris_resource *res,
2211                          void *map,
2212                          enum isl_format format,
2213                          struct isl_swizzle swizzle,
2214                          unsigned offset,
2215                          unsigned size,
2216                          isl_surf_usage_flags_t usage)
2217{
2218   const struct isl_format_layout *fmtl = isl_format_get_layout(format);
2219   const unsigned cpp = format == ISL_FORMAT_RAW ? 1 : fmtl->bpb / 8;
2220
2221   /* The ARB_texture_buffer_specification says:
2222    *
2223    *    "The number of texels in the buffer texture's texel array is given by
2224    *
2225    *       floor(<buffer_size> / (<components> * sizeof(<base_type>)),
2226    *
2227    *     where <buffer_size> is the size of the buffer object, in basic
2228    *     machine units and <components> and <base_type> are the element count
2229    *     and base data type for elements, as specified in Table X.1.  The
2230    *     number of texels in the texel array is then clamped to the
2231    *     implementation-dependent limit MAX_TEXTURE_BUFFER_SIZE_ARB."
2232    *
2233    * We need to clamp the size in bytes to MAX_TEXTURE_BUFFER_SIZE * stride,
2234    * so that when ISL divides by stride to obtain the number of texels, that
2235    * texel count is clamped to MAX_TEXTURE_BUFFER_SIZE.
2236    */
2237   unsigned final_size =
2238      MIN3(size, res->bo->size - res->offset - offset,
2239           IRIS_MAX_TEXTURE_BUFFER_SIZE * cpp);
2240
2241   isl_buffer_fill_state(isl_dev, map,
2242                         .address = res->bo->address + res->offset + offset,
2243                         .size_B = final_size,
2244                         .format = format,
2245                         .swizzle = swizzle,
2246                         .stride_B = cpp,
2247                         .mocs = iris_mocs(res->bo, isl_dev, usage));
2248}
2249
2250#define SURFACE_STATE_ALIGNMENT 64
2251
2252/**
2253 * Allocate several contiguous SURFACE_STATE structures, one for each
2254 * supported auxiliary surface mode.  This only allocates the CPU-side
2255 * copy, they will need to be uploaded later after they're filled in.
2256 */
2257static void
2258alloc_surface_states(struct iris_surface_state *surf_state,
2259                     unsigned aux_usages)
2260{
2261   const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length);
2262
2263   /* If this changes, update this to explicitly align pointers */
2264   STATIC_ASSERT(surf_size == SURFACE_STATE_ALIGNMENT);
2265
2266   assert(aux_usages != 0);
2267
2268   /* In case we're re-allocating them... */
2269   free(surf_state->cpu);
2270
2271   surf_state->num_states = util_bitcount(aux_usages);
2272   surf_state->cpu = calloc(surf_state->num_states, surf_size);
2273   surf_state->ref.offset = 0;
2274   pipe_resource_reference(&surf_state->ref.res, NULL);
2275
2276   assert(surf_state->cpu);
2277}
2278
2279/**
2280 * Upload the CPU side SURFACE_STATEs into a GPU buffer.
2281 */
2282static void
2283upload_surface_states(struct u_upload_mgr *mgr,
2284                      struct iris_surface_state *surf_state)
2285{
2286   const unsigned surf_size = 4 * GENX(RENDER_SURFACE_STATE_length);
2287   const unsigned bytes = surf_state->num_states * surf_size;
2288
2289   void *map =
2290      upload_state(mgr, &surf_state->ref, bytes, SURFACE_STATE_ALIGNMENT);
2291
2292   surf_state->ref.offset +=
2293      iris_bo_offset_from_base_address(iris_resource_bo(surf_state->ref.res));
2294
2295   if (map)
2296      memcpy(map, surf_state->cpu, bytes);
2297}
2298
2299/**
2300 * Update resource addresses in a set of SURFACE_STATE descriptors,
2301 * and re-upload them if necessary.
2302 */
2303static bool
2304update_surface_state_addrs(struct u_upload_mgr *mgr,
2305                           struct iris_surface_state *surf_state,
2306                           struct iris_bo *bo)
2307{
2308   if (surf_state->bo_address == bo->address)
2309      return false;
2310
2311   STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) % 64 == 0);
2312   STATIC_ASSERT(GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_bits) == 64);
2313
2314   uint64_t *ss_addr = (uint64_t *) &surf_state->cpu[GENX(RENDER_SURFACE_STATE_SurfaceBaseAddress_start) / 32];
2315
2316   /* First, update the CPU copies.  We assume no other fields exist in
2317    * the QWord containing Surface Base Address.
2318    */
2319   for (unsigned i = 0; i < surf_state->num_states; i++) {
2320      *ss_addr = *ss_addr - surf_state->bo_address + bo->address;
2321      ss_addr = ((void *) ss_addr) + SURFACE_STATE_ALIGNMENT;
2322   }
2323
2324   /* Next, upload the updated copies to a GPU buffer. */
2325   upload_surface_states(mgr, surf_state);
2326
2327   surf_state->bo_address = bo->address;
2328
2329   return true;
2330}
2331
2332static void
2333fill_surface_state(struct isl_device *isl_dev,
2334                   void *map,
2335                   struct iris_resource *res,
2336                   struct isl_surf *surf,
2337                   struct isl_view *view,
2338                   unsigned aux_usage,
2339                   uint32_t extra_main_offset,
2340                   uint32_t tile_x_sa,
2341                   uint32_t tile_y_sa)
2342{
2343   struct isl_surf_fill_state_info f = {
2344      .surf = surf,
2345      .view = view,
2346      .mocs = iris_mocs(res->bo, isl_dev, view->usage),
2347      .address = res->bo->address + res->offset + extra_main_offset,
2348      .x_offset_sa = tile_x_sa,
2349      .y_offset_sa = tile_y_sa,
2350   };
2351
2352   if (aux_usage != ISL_AUX_USAGE_NONE) {
2353      f.aux_surf = &res->aux.surf;
2354      f.aux_usage = aux_usage;
2355      f.clear_color = res->aux.clear_color;
2356
2357      if (res->aux.bo)
2358         f.aux_address = res->aux.bo->address + res->aux.offset;
2359
2360      if (res->aux.clear_color_bo) {
2361         f.clear_address = res->aux.clear_color_bo->address +
2362                           res->aux.clear_color_offset;
2363         f.use_clear_address = isl_dev->info->ver > 9;
2364      }
2365   }
2366
2367   isl_surf_fill_state_s(isl_dev, map, &f);
2368}
2369
2370/**
2371 * The pipe->create_sampler_view() driver hook.
2372 */
2373static struct pipe_sampler_view *
2374iris_create_sampler_view(struct pipe_context *ctx,
2375                         struct pipe_resource *tex,
2376                         const struct pipe_sampler_view *tmpl)
2377{
2378   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
2379   const struct intel_device_info *devinfo = &screen->devinfo;
2380   struct iris_sampler_view *isv = calloc(1, sizeof(struct iris_sampler_view));
2381
2382   if (!isv)
2383      return NULL;
2384
2385   /* initialize base object */
2386   isv->base = *tmpl;
2387   isv->base.context = ctx;
2388   isv->base.texture = NULL;
2389   pipe_reference_init(&isv->base.reference, 1);
2390   pipe_resource_reference(&isv->base.texture, tex);
2391
2392   if (util_format_is_depth_or_stencil(tmpl->format)) {
2393      struct iris_resource *zres, *sres;
2394      const struct util_format_description *desc =
2395         util_format_description(tmpl->format);
2396
2397      iris_get_depth_stencil_resources(tex, &zres, &sres);
2398
2399      tex = util_format_has_depth(desc) ? &zres->base.b : &sres->base.b;
2400   }
2401
2402   isv->res = (struct iris_resource *) tex;
2403
2404   alloc_surface_states(&isv->surface_state, isv->res->aux.sampler_usages);
2405
2406   isv->surface_state.bo_address = isv->res->bo->address;
2407
2408   isl_surf_usage_flags_t usage = ISL_SURF_USAGE_TEXTURE_BIT;
2409
2410   if (isv->base.target == PIPE_TEXTURE_CUBE ||
2411       isv->base.target == PIPE_TEXTURE_CUBE_ARRAY)
2412      usage |= ISL_SURF_USAGE_CUBE_BIT;
2413
2414   const struct iris_format_info fmt =
2415      iris_format_for_usage(devinfo, tmpl->format, usage);
2416
2417   isv->clear_color = isv->res->aux.clear_color;
2418
2419   isv->view = (struct isl_view) {
2420      .format = fmt.fmt,
2421      .swizzle = (struct isl_swizzle) {
2422         .r = fmt_swizzle(&fmt, tmpl->swizzle_r),
2423         .g = fmt_swizzle(&fmt, tmpl->swizzle_g),
2424         .b = fmt_swizzle(&fmt, tmpl->swizzle_b),
2425         .a = fmt_swizzle(&fmt, tmpl->swizzle_a),
2426      },
2427      .usage = usage,
2428   };
2429
2430   void *map = isv->surface_state.cpu;
2431
2432   /* Fill out SURFACE_STATE for this view. */
2433   if (tmpl->target != PIPE_BUFFER) {
2434      isv->view.base_level = tmpl->u.tex.first_level;
2435      isv->view.levels = tmpl->u.tex.last_level - tmpl->u.tex.first_level + 1;
2436
2437      if (tmpl->target == PIPE_TEXTURE_3D) {
2438         isv->view.base_array_layer = 0;
2439         isv->view.array_len = 1;
2440      } else {
2441         isv->view.base_array_layer = tmpl->u.tex.first_layer;
2442         isv->view.array_len =
2443            tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2444      }
2445
2446      unsigned aux_modes = isv->res->aux.sampler_usages;
2447      while (aux_modes) {
2448         enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
2449
2450         fill_surface_state(&screen->isl_dev, map, isv->res, &isv->res->surf,
2451                            &isv->view, aux_usage, 0, 0, 0);
2452
2453         map += SURFACE_STATE_ALIGNMENT;
2454      }
2455   } else {
2456      fill_buffer_surface_state(&screen->isl_dev, isv->res, map,
2457                                isv->view.format, isv->view.swizzle,
2458                                tmpl->u.buf.offset, tmpl->u.buf.size,
2459                                ISL_SURF_USAGE_TEXTURE_BIT);
2460   }
2461
2462   return &isv->base;
2463}
2464
2465static void
2466iris_sampler_view_destroy(struct pipe_context *ctx,
2467                          struct pipe_sampler_view *state)
2468{
2469   struct iris_sampler_view *isv = (void *) state;
2470   pipe_resource_reference(&state->texture, NULL);
2471   pipe_resource_reference(&isv->surface_state.ref.res, NULL);
2472   free(isv->surface_state.cpu);
2473   free(isv);
2474}
2475
2476/**
2477 * The pipe->create_surface() driver hook.
2478 *
2479 * In Gallium nomenclature, "surfaces" are a view of a resource that
2480 * can be bound as a render target or depth/stencil buffer.
2481 */
2482static struct pipe_surface *
2483iris_create_surface(struct pipe_context *ctx,
2484                    struct pipe_resource *tex,
2485                    const struct pipe_surface *tmpl)
2486{
2487   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
2488   const struct intel_device_info *devinfo = &screen->devinfo;
2489
2490   isl_surf_usage_flags_t usage = 0;
2491   if (tmpl->writable)
2492      usage = ISL_SURF_USAGE_STORAGE_BIT;
2493   else if (util_format_is_depth_or_stencil(tmpl->format))
2494      usage = ISL_SURF_USAGE_DEPTH_BIT;
2495   else
2496      usage = ISL_SURF_USAGE_RENDER_TARGET_BIT;
2497
2498   const struct iris_format_info fmt =
2499      iris_format_for_usage(devinfo, tmpl->format, usage);
2500
2501   if ((usage & ISL_SURF_USAGE_RENDER_TARGET_BIT) &&
2502       !isl_format_supports_rendering(devinfo, fmt.fmt)) {
2503      /* Framebuffer validation will reject this invalid case, but it
2504       * hasn't had the opportunity yet.  In the meantime, we need to
2505       * avoid hitting ISL asserts about unsupported formats below.
2506       */
2507      return NULL;
2508   }
2509
2510   struct iris_surface *surf = calloc(1, sizeof(struct iris_surface));
2511   struct pipe_surface *psurf = &surf->base;
2512   struct iris_resource *res = (struct iris_resource *) tex;
2513
2514   if (!surf)
2515      return NULL;
2516
2517   pipe_reference_init(&psurf->reference, 1);
2518   pipe_resource_reference(&psurf->texture, tex);
2519   psurf->context = ctx;
2520   psurf->format = tmpl->format;
2521   psurf->width = tex->width0;
2522   psurf->height = tex->height0;
2523   psurf->texture = tex;
2524   psurf->u.tex.first_layer = tmpl->u.tex.first_layer;
2525   psurf->u.tex.last_layer = tmpl->u.tex.last_layer;
2526   psurf->u.tex.level = tmpl->u.tex.level;
2527
2528   uint32_t array_len = tmpl->u.tex.last_layer - tmpl->u.tex.first_layer + 1;
2529
2530   struct isl_view *view = &surf->view;
2531   *view = (struct isl_view) {
2532      .format = fmt.fmt,
2533      .base_level = tmpl->u.tex.level,
2534      .levels = 1,
2535      .base_array_layer = tmpl->u.tex.first_layer,
2536      .array_len = array_len,
2537      .swizzle = ISL_SWIZZLE_IDENTITY,
2538      .usage = usage,
2539   };
2540
2541#if GFX_VER == 8
2542   struct isl_view *read_view = &surf->read_view;
2543   *read_view = (struct isl_view) {
2544      .format = fmt.fmt,
2545      .base_level = tmpl->u.tex.level,
2546      .levels = 1,
2547      .base_array_layer = tmpl->u.tex.first_layer,
2548      .array_len = array_len,
2549      .swizzle = ISL_SWIZZLE_IDENTITY,
2550      .usage = ISL_SURF_USAGE_TEXTURE_BIT,
2551   };
2552
2553   struct isl_surf read_surf = res->surf;
2554   uint64_t read_surf_offset_B = 0;
2555   uint32_t read_surf_tile_x_sa = 0, read_surf_tile_y_sa = 0;
2556   if (tex->target == PIPE_TEXTURE_3D && array_len == 1) {
2557      /* The minimum array element field of the surface state structure is
2558       * ignored by the sampler unit for 3D textures on some hardware.  If the
2559       * render buffer is a single slice of a 3D texture, create a 2D texture
2560       * covering that slice.
2561       *
2562       * TODO: This only handles the case where we're rendering to a single
2563       * slice of an array texture.  If we have layered rendering combined
2564       * with non-coherent FB fetch and a non-zero base_array_layer, then
2565       * we're going to run into problems.
2566       *
2567       * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/4904
2568       */
2569      isl_surf_get_image_surf(&screen->isl_dev, &res->surf,
2570                              read_view->base_level,
2571                              0, read_view->base_array_layer,
2572                              &read_surf, &read_surf_offset_B,
2573                              &read_surf_tile_x_sa, &read_surf_tile_y_sa);
2574      read_view->base_level = 0;
2575      read_view->base_array_layer = 0;
2576      assert(read_view->array_len == 1);
2577   } else if (tex->target == PIPE_TEXTURE_1D_ARRAY) {
2578      /* Convert 1D array textures to 2D arrays because shaders always provide
2579       * the array index coordinate at the Z component to avoid recompiles
2580       * when changing the texture target of the framebuffer.
2581       */
2582      assert(read_surf.dim_layout == ISL_DIM_LAYOUT_GFX4_2D);
2583      read_surf.dim = ISL_SURF_DIM_2D;
2584   }
2585#endif
2586
2587   surf->clear_color = res->aux.clear_color;
2588
2589   /* Bail early for depth/stencil - we don't want SURFACE_STATE for them. */
2590   if (res->surf.usage & (ISL_SURF_USAGE_DEPTH_BIT |
2591                          ISL_SURF_USAGE_STENCIL_BIT))
2592      return psurf;
2593
2594
2595   alloc_surface_states(&surf->surface_state, res->aux.possible_usages);
2596   surf->surface_state.bo_address = res->bo->address;
2597
2598#if GFX_VER == 8
2599   alloc_surface_states(&surf->surface_state_read, res->aux.possible_usages);
2600   surf->surface_state_read.bo_address = res->bo->address;
2601#endif
2602
2603   if (!isl_format_is_compressed(res->surf.format)) {
2604      void *map = surf->surface_state.cpu;
2605      UNUSED void *map_read = surf->surface_state_read.cpu;
2606
2607      /* This is a normal surface.  Fill out a SURFACE_STATE for each possible
2608       * auxiliary surface mode and return the pipe_surface.
2609       */
2610      unsigned aux_modes = res->aux.possible_usages;
2611      while (aux_modes) {
2612         enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
2613         fill_surface_state(&screen->isl_dev, map, res, &res->surf,
2614                            view, aux_usage, 0, 0, 0);
2615         map += SURFACE_STATE_ALIGNMENT;
2616
2617#if GFX_VER == 8
2618         fill_surface_state(&screen->isl_dev, map_read, res,
2619                            &read_surf, read_view, aux_usage,
2620                            read_surf_offset_B,
2621                            read_surf_tile_x_sa, read_surf_tile_y_sa);
2622         map_read += SURFACE_STATE_ALIGNMENT;
2623#endif
2624      }
2625
2626      return psurf;
2627   }
2628
2629   /* The resource has a compressed format, which is not renderable, but we
2630    * have a renderable view format.  We must be attempting to upload blocks
2631    * of compressed data via an uncompressed view.
2632    *
2633    * In this case, we can assume there are no auxiliary buffers, a single
2634    * miplevel, and that the resource is single-sampled.  Gallium may try
2635    * and create an uncompressed view with multiple layers, however.
2636    */
2637   assert(!isl_format_is_compressed(fmt.fmt));
2638   assert(res->aux.possible_usages == 1 << ISL_AUX_USAGE_NONE);
2639   assert(res->surf.samples == 1);
2640   assert(view->levels == 1);
2641
2642   struct isl_surf isl_surf;
2643   uint64_t offset_B = 0;
2644   uint32_t tile_x_el = 0, tile_y_el = 0;
2645   bool ok = isl_surf_get_uncompressed_surf(&screen->isl_dev, &res->surf,
2646                                            view, &isl_surf, view,
2647                                            &offset_B, &tile_x_el, &tile_y_el);
2648   if (!ok) {
2649      free(surf);
2650      return NULL;
2651   }
2652
2653   psurf->width = isl_surf.logical_level0_px.width;
2654   psurf->height = isl_surf.logical_level0_px.height;
2655
2656   struct isl_surf_fill_state_info f = {
2657      .surf = &isl_surf,
2658      .view = view,
2659      .mocs = iris_mocs(res->bo, &screen->isl_dev,
2660                        ISL_SURF_USAGE_RENDER_TARGET_BIT),
2661      .address = res->bo->address + offset_B,
2662      .x_offset_sa = tile_x_el, /* Single-sampled, so el == sa */
2663      .y_offset_sa = tile_y_el, /* Single-sampled, so el == sa */
2664   };
2665
2666   isl_surf_fill_state_s(&screen->isl_dev, surf->surface_state.cpu, &f);
2667
2668   return psurf;
2669}
2670
2671#if GFX_VER < 9
2672static void
2673fill_default_image_param(struct brw_image_param *param)
2674{
2675   memset(param, 0, sizeof(*param));
2676   /* Set the swizzling shifts to all-ones to effectively disable swizzling --
2677    * See emit_address_calculation() in brw_fs_surface_builder.cpp for a more
2678    * detailed explanation of these parameters.
2679    */
2680   param->swizzling[0] = 0xff;
2681   param->swizzling[1] = 0xff;
2682}
2683
2684static void
2685fill_buffer_image_param(struct brw_image_param *param,
2686                        enum pipe_format pfmt,
2687                        unsigned size)
2688{
2689   const unsigned cpp = util_format_get_blocksize(pfmt);
2690
2691   fill_default_image_param(param);
2692   param->size[0] = size / cpp;
2693   param->stride[0] = cpp;
2694}
2695#else
2696#define isl_surf_fill_image_param(x, ...)
2697#define fill_default_image_param(x, ...)
2698#define fill_buffer_image_param(x, ...)
2699#endif
2700
/**
 * The pipe->set_shader_images() driver hook.
 *
 * For each of the \p count slots starting at \p start_slot, either binds
 * the image view from \p p_images (building and uploading its surface
 * state) or, when the entry has no resource or \p p_images is NULL,
 * drops the references held by that slot.  The
 * \p unbind_num_trailing_slots slots that follow are then unbound via a
 * recursive call with a NULL image array.
 *
 * Note that on GFX_VER >= 9 the image-param helpers used below are
 * defined as empty macros, so those calls (and their image_params
 * arguments) compile away.
 */
static void
iris_set_shader_images(struct pipe_context *ctx,
                       enum pipe_shader_type p_stage,
                       unsigned start_slot, unsigned count,
                       unsigned unbind_num_trailing_slots,
                       const struct pipe_image_view *p_images)
{
   struct iris_context *ice = (struct iris_context *) ctx;
   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
   gl_shader_stage stage = stage_from_pipe(p_stage);
   struct iris_shader_state *shs = &ice->state.shaders[stage];
#if GFX_VER == 8
   struct iris_genx_state *genx = ice->state.genx;
   struct brw_image_param *image_params = genx->shaders[stage].image_param;
#endif

   /* Clear the bound bits for the whole affected range up front; slots
    * that receive a valid image get their bit set again in the loop.
    */
   shs->bound_image_views &=
      ~u_bit_consecutive(start_slot, count + unbind_num_trailing_slots);

   for (unsigned i = 0; i < count; i++) {
      struct iris_image_view *iv = &shs->image[start_slot + i];

      if (p_images && p_images[i].resource) {
         const struct pipe_image_view *img = &p_images[i];
         struct iris_resource *res = (void *) img->resource;

         util_copy_image_view(&iv->base, img);

         shs->bound_image_views |= 1 << (start_slot + i);

         /* Record on the resource how and by which stage it has been used. */
         res->bind_history |= PIPE_BIND_SHADER_IMAGE;
         res->bind_stages |= 1 << stage;

         enum isl_format isl_fmt = iris_image_view_get_format(ice, img);

         /* Render compression with images supported on gfx12+ only. */
         unsigned aux_usages = GFX_VER >= 12 ? res->aux.possible_usages :
            1 << ISL_AUX_USAGE_NONE;

         alloc_surface_states(&iv->surface_state, aux_usages);
         iv->surface_state.bo_address = res->bo->address;

         void *map = iv->surface_state.cpu;

         if (res->base.b.target != PIPE_BUFFER) {
            /* Texture image: a single-level view of the requested layers. */
            struct isl_view view = {
               .format = isl_fmt,
               .base_level = img->u.tex.level,
               .levels = 1,
               .base_array_layer = img->u.tex.first_layer,
               .array_len = img->u.tex.last_layer - img->u.tex.first_layer + 1,
               .swizzle = ISL_SWIZZLE_IDENTITY,
               .usage = ISL_SURF_USAGE_STORAGE_BIT,
            };

            /* If using untyped fallback. */
            if (isl_fmt == ISL_FORMAT_RAW) {
               /* The raw view covers the whole BO, from offset 0. */
               fill_buffer_surface_state(&screen->isl_dev, res, map,
                                         isl_fmt, ISL_SWIZZLE_IDENTITY,
                                         0, res->bo->size,
                                         ISL_SURF_USAGE_STORAGE_BIT);
            } else {
               /* One surface state per aux usage bit, packed back to back
                * at SURFACE_STATE_ALIGNMENT intervals.
                */
               unsigned aux_modes = aux_usages;
               while (aux_modes) {
                  enum isl_aux_usage usage = u_bit_scan(&aux_modes);

                  fill_surface_state(&screen->isl_dev, map, res, &res->surf,
                                     &view, usage, 0, 0, 0);

                  map += SURFACE_STATE_ALIGNMENT;
               }
            }

            isl_surf_fill_image_param(&screen->isl_dev,
                                      &image_params[start_slot + i],
                                      &res->surf, &view);
         } else {
            /* Buffer image: extend the buffer's valid range to cover the
             * bound [offset, offset + size) span.
             */
            util_range_add(&res->base.b, &res->valid_buffer_range, img->u.buf.offset,
                           img->u.buf.offset + img->u.buf.size);

            fill_buffer_surface_state(&screen->isl_dev, res, map,
                                      isl_fmt, ISL_SWIZZLE_IDENTITY,
                                      img->u.buf.offset, img->u.buf.size,
                                      ISL_SURF_USAGE_STORAGE_BIT);
            fill_buffer_image_param(&image_params[start_slot + i],
                                    img->format, img->u.buf.size);
         }

         upload_surface_states(ice->state.surface_uploader, &iv->surface_state);
      } else {
         /* Unbind: drop the references held by this slot. */
         pipe_resource_reference(&iv->base.resource, NULL);
         pipe_resource_reference(&iv->surface_state.ref.res, NULL);
         fill_default_image_param(&image_params[start_slot + i]);
      }
   }

   ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
   ice->state.dirty |=
      stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
                                   : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   /* Broadwell also needs brw_image_params re-uploaded */
   if (GFX_VER < 9) {
      ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
      shs->sysvals_need_upload = true;
   }

   /* Unbind the trailing slots by recursing with no images; the recursive
    * call passes 0 trailing slots, so it does not recurse further.
    */
   if (unbind_num_trailing_slots) {
      iris_set_shader_images(ctx, p_stage, start_slot + count,
                             unbind_num_trailing_slots, 0, NULL);
   }
}
2816
2817
2818/**
2819 * The pipe->set_sampler_views() driver hook.
2820 */
2821static void
2822iris_set_sampler_views(struct pipe_context *ctx,
2823                       enum pipe_shader_type p_stage,
2824                       unsigned start, unsigned count,
2825                       unsigned unbind_num_trailing_slots,
2826                       bool take_ownership,
2827                       struct pipe_sampler_view **views)
2828{
2829   struct iris_context *ice = (struct iris_context *) ctx;
2830   gl_shader_stage stage = stage_from_pipe(p_stage);
2831   struct iris_shader_state *shs = &ice->state.shaders[stage];
2832   unsigned i;
2833
2834   shs->bound_sampler_views &=
2835      ~u_bit_consecutive(start, count + unbind_num_trailing_slots);
2836
2837   for (i = 0; i < count; i++) {
2838      struct pipe_sampler_view *pview = views ? views[i] : NULL;
2839
2840      if (take_ownership) {
2841         pipe_sampler_view_reference((struct pipe_sampler_view **)
2842                                     &shs->textures[start + i], NULL);
2843         shs->textures[start + i] = (struct iris_sampler_view *)pview;
2844      } else {
2845         pipe_sampler_view_reference((struct pipe_sampler_view **)
2846                                     &shs->textures[start + i], pview);
2847      }
2848      struct iris_sampler_view *view = (void *) pview;
2849      if (view) {
2850         view->res->bind_history |= PIPE_BIND_SAMPLER_VIEW;
2851         view->res->bind_stages |= 1 << stage;
2852
2853         shs->bound_sampler_views |= 1 << (start + i);
2854
2855         update_surface_state_addrs(ice->state.surface_uploader,
2856                                    &view->surface_state, view->res->bo);
2857      }
2858   }
2859   for (; i < count + unbind_num_trailing_slots; i++) {
2860      pipe_sampler_view_reference((struct pipe_sampler_view **)
2861                                  &shs->textures[start + i], NULL);
2862   }
2863
2864   ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_BINDINGS_VS << stage);
2865   ice->state.dirty |=
2866      stage == MESA_SHADER_COMPUTE ? IRIS_DIRTY_COMPUTE_RESOLVES_AND_FLUSHES
2867                                   : IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;
2868}
2869
/**
 * Compute-resource binding entry point.
 *
 * This implementation binds nothing: it only asserts that the caller
 * passed no resources (\p count == 0).  \p start and \p resources are
 * ignored.
 */
static void
iris_set_compute_resources(struct pipe_context *ctx,
                           unsigned start, unsigned count,
                           struct pipe_surface **resources)
{
   assert(count == 0);
}
2877
2878static void
2879iris_set_global_binding(struct pipe_context *ctx,
2880                        unsigned start_slot, unsigned count,
2881                        struct pipe_resource **resources,
2882                        uint32_t **handles)
2883{
2884   struct iris_context *ice = (struct iris_context *) ctx;
2885
2886   assert(start_slot + count <= IRIS_MAX_GLOBAL_BINDINGS);
2887   for (unsigned i = 0; i < count; i++) {
2888      if (resources && resources[i]) {
2889         pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
2890                                 resources[i]);
2891         struct iris_resource *res = (void *) resources[i];
2892         uint64_t addr = res->bo->address;
2893         memcpy(handles[i], &addr, sizeof(addr));
2894      } else {
2895         pipe_resource_reference(&ice->state.global_bindings[start_slot + i],
2896                                 NULL);
2897      }
2898   }
2899
2900   ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_CS;
2901}
2902
2903/**
2904 * The pipe->set_tess_state() driver hook.
2905 */
2906static void
2907iris_set_tess_state(struct pipe_context *ctx,
2908                    const float default_outer_level[4],
2909                    const float default_inner_level[2])
2910{
2911   struct iris_context *ice = (struct iris_context *) ctx;
2912   struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_TESS_CTRL];
2913
2914   memcpy(&ice->state.default_outer_level[0], &default_outer_level[0], 4 * sizeof(float));
2915   memcpy(&ice->state.default_inner_level[0], &default_inner_level[0], 2 * sizeof(float));
2916
2917   ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_TCS;
2918   shs->sysvals_need_upload = true;
2919}
2920
2921static void
2922iris_set_patch_vertices(struct pipe_context *ctx, uint8_t patch_vertices)
2923{
2924   struct iris_context *ice = (struct iris_context *) ctx;
2925
2926   ice->state.patch_vertices = patch_vertices;
2927}
2928
2929static void
2930iris_surface_destroy(struct pipe_context *ctx, struct pipe_surface *p_surf)
2931{
2932   struct iris_surface *surf = (void *) p_surf;
2933   pipe_resource_reference(&p_surf->texture, NULL);
2934   pipe_resource_reference(&surf->surface_state.ref.res, NULL);
2935   pipe_resource_reference(&surf->surface_state_read.ref.res, NULL);
2936   free(surf->surface_state.cpu);
2937   free(surf->surface_state_read.cpu);
2938   free(surf);
2939}
2940
2941static void
2942iris_set_clip_state(struct pipe_context *ctx,
2943                    const struct pipe_clip_state *state)
2944{
2945   struct iris_context *ice = (struct iris_context *) ctx;
2946   struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_VERTEX];
2947   struct iris_shader_state *gshs = &ice->state.shaders[MESA_SHADER_GEOMETRY];
2948   struct iris_shader_state *tshs = &ice->state.shaders[MESA_SHADER_TESS_EVAL];
2949
2950   memcpy(&ice->state.clip_planes, state, sizeof(*state));
2951
2952   ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS |
2953                             IRIS_STAGE_DIRTY_CONSTANTS_GS |
2954                             IRIS_STAGE_DIRTY_CONSTANTS_TES;
2955   shs->sysvals_need_upload = true;
2956   gshs->sysvals_need_upload = true;
2957   tshs->sysvals_need_upload = true;
2958}
2959
2960/**
2961 * The pipe->set_polygon_stipple() driver hook.
2962 */
2963static void
2964iris_set_polygon_stipple(struct pipe_context *ctx,
2965                         const struct pipe_poly_stipple *state)
2966{
2967   struct iris_context *ice = (struct iris_context *) ctx;
2968   memcpy(&ice->state.poly_stipple, state, sizeof(*state));
2969   ice->state.dirty |= IRIS_DIRTY_POLYGON_STIPPLE;
2970}
2971
2972/**
2973 * The pipe->set_sample_mask() driver hook.
2974 */
2975static void
2976iris_set_sample_mask(struct pipe_context *ctx, unsigned sample_mask)
2977{
2978   struct iris_context *ice = (struct iris_context *) ctx;
2979
2980   /* We only support 16x MSAA, so we have 16 bits of sample maks.
2981    * st/mesa may pass us 0xffffffff though, meaning "enable all samples".
2982    */
2983   ice->state.sample_mask = sample_mask & 0xffff;
2984   ice->state.dirty |= IRIS_DIRTY_SAMPLE_MASK;
2985}
2986
2987/**
2988 * The pipe->set_scissor_states() driver hook.
2989 *
2990 * This corresponds to our SCISSOR_RECT state structures.  It's an
2991 * exact match, so we just store them, and memcpy them out later.
2992 */
2993static void
2994iris_set_scissor_states(struct pipe_context *ctx,
2995                        unsigned start_slot,
2996                        unsigned num_scissors,
2997                        const struct pipe_scissor_state *rects)
2998{
2999   struct iris_context *ice = (struct iris_context *) ctx;
3000
3001   for (unsigned i = 0; i < num_scissors; i++) {
3002      if (rects[i].minx == rects[i].maxx || rects[i].miny == rects[i].maxy) {
3003         /* If the scissor was out of bounds and got clamped to 0 width/height
3004          * at the bounds, the subtraction of 1 from maximums could produce a
3005          * negative number and thus not clip anything.  Instead, just provide
3006          * a min > max scissor inside the bounds, which produces the expected
3007          * no rendering.
3008          */
3009         ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3010            .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
3011         };
3012      } else {
3013         ice->state.scissors[start_slot + i] = (struct pipe_scissor_state) {
3014            .minx = rects[i].minx,     .miny = rects[i].miny,
3015            .maxx = rects[i].maxx - 1, .maxy = rects[i].maxy - 1,
3016         };
3017      }
3018   }
3019
3020   ice->state.dirty |= IRIS_DIRTY_SCISSOR_RECT;
3021}
3022
3023/**
3024 * The pipe->set_stencil_ref() driver hook.
3025 *
3026 * This is added to 3DSTATE_WM_DEPTH_STENCIL dynamically at draw time.
3027 */
3028static void
3029iris_set_stencil_ref(struct pipe_context *ctx,
3030                     const struct pipe_stencil_ref state)
3031{
3032   struct iris_context *ice = (struct iris_context *) ctx;
3033   memcpy(&ice->state.stencil_ref, &state, sizeof(state));
3034   if (GFX_VER >= 12)
3035      ice->state.dirty |= IRIS_DIRTY_STENCIL_REF;
3036   else if (GFX_VER >= 9)
3037      ice->state.dirty |= IRIS_DIRTY_WM_DEPTH_STENCIL;
3038   else
3039      ice->state.dirty |= IRIS_DIRTY_COLOR_CALC_STATE;
3040}
3041
3042static float
3043viewport_extent(const struct pipe_viewport_state *state, int axis, float sign)
3044{
3045   return copysignf(state->scale[axis], sign) + state->translate[axis];
3046}
3047
3048/**
3049 * The pipe->set_viewport_states() driver hook.
3050 *
3051 * This corresponds to our SF_CLIP_VIEWPORT states.  We can't calculate
3052 * the guardband yet, as we need the framebuffer dimensions, but we can
3053 * at least fill out the rest.
3054 */
3055static void
3056iris_set_viewport_states(struct pipe_context *ctx,
3057                         unsigned start_slot,
3058                         unsigned count,
3059                         const struct pipe_viewport_state *states)
3060{
3061   struct iris_context *ice = (struct iris_context *) ctx;
3062
3063   memcpy(&ice->state.viewports[start_slot], states, sizeof(*states) * count);
3064
3065   ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
3066
3067   if (ice->state.cso_rast && (!ice->state.cso_rast->depth_clip_near ||
3068                               !ice->state.cso_rast->depth_clip_far))
3069      ice->state.dirty |= IRIS_DIRTY_CC_VIEWPORT;
3070}
3071
/**
 * The pipe->set_framebuffer_state() driver hook.
 *
 * Sets the current draw FBO, including color render targets, depth,
 * and stencil buffers.
 *
 * The function first compares the incoming state against the currently
 * stored one to decide which dirty bits to raise, then copies the new
 * state into the context, emits the depth/stencil/HiZ packets into the
 * per-generation state, and uploads a null surface sized to the
 * framebuffer.
 */
static void
iris_set_framebuffer_state(struct pipe_context *ctx,
                           const struct pipe_framebuffer_state *state)
{
   struct iris_context *ice = (struct iris_context *) ctx;
   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
   struct isl_device *isl_dev = &screen->isl_dev;
   struct pipe_framebuffer_state *cso = &ice->state.framebuffer;
   struct iris_resource *zres;
   struct iris_resource *stencil_res;

   unsigned samples = util_framebuffer_get_num_samples(state);
   unsigned layers = util_framebuffer_get_num_layers(state);

   /* All comparisons below are old (cso) versus new (state); they must run
    * before util_copy_framebuffer_state() overwrites cso.
    */
   if (cso->samples != samples) {
      ice->state.dirty |= IRIS_DIRTY_MULTISAMPLE;

      /* We need to toggle 3DSTATE_PS::32 Pixel Dispatch Enable */
      if (GFX_VER >= 9 && (cso->samples == 16 || samples == 16))
         ice->state.stage_dirty |= IRIS_STAGE_DIRTY_FS;
   }

   if (cso->nr_cbufs != state->nr_cbufs) {
      ice->state.dirty |= IRIS_DIRTY_BLEND_STATE;
   }

   /* Only a change between "no layers" and "some layers" matters here. */
   if ((cso->layers == 0) != (layers == 0)) {
      ice->state.dirty |= IRIS_DIRTY_CLIP;
   }

   if (cso->width != state->width || cso->height != state->height) {
      ice->state.dirty |= IRIS_DIRTY_SF_CL_VIEWPORT;
   }

   /* Flag the depth buffer whenever one is bound before or after. */
   if (cso->zsbuf || state->zsbuf) {
      ice->state.dirty |= IRIS_DIRTY_DEPTH_BUFFER;
   }

   util_copy_framebuffer_state(cso, state);
   cso->samples = samples;
   cso->layers = layers;

   struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;

   /* Defaults used when no depth/stencil buffer is bound; fields not
    * listed here (format, usage) start out zero-initialized.
    */
   struct isl_view view = {
      .base_level = 0,
      .levels = 1,
      .base_array_layer = 0,
      .array_len = 1,
      .swizzle = ISL_SWIZZLE_IDENTITY,
   };

   struct isl_depth_stencil_hiz_emit_info info = { .view = &view };

   if (cso->zsbuf) {
      iris_get_depth_stencil_resources(cso->zsbuf->texture, &zres,
                                       &stencil_res);

      view.base_level = cso->zsbuf->u.tex.level;
      view.base_array_layer = cso->zsbuf->u.tex.first_layer;
      view.array_len =
         cso->zsbuf->u.tex.last_layer - cso->zsbuf->u.tex.first_layer + 1;

      if (zres) {
         view.usage |= ISL_SURF_USAGE_DEPTH_BIT;

         info.depth_surf = &zres->surf;
         info.depth_address = zres->bo->address + zres->offset;
         info.mocs = iris_mocs(zres->bo, isl_dev, view.usage);

         view.format = zres->surf.format;

         /* HiZ info is only filled in when this miplevel has HiZ. */
         if (iris_resource_level_has_hiz(zres, view.base_level)) {
            info.hiz_usage = zres->aux.usage;
            info.hiz_surf = &zres->aux.surf;
            info.hiz_address = zres->aux.bo->address + zres->aux.offset;
         }

         ice->state.hiz_usage = info.hiz_usage;
      }

      if (stencil_res) {
         view.usage |= ISL_SURF_USAGE_STENCIL_BIT;
         info.stencil_aux_usage = stencil_res->aux.usage;
         info.stencil_surf = &stencil_res->surf;
         info.stencil_address = stencil_res->bo->address + stencil_res->offset;
         /* Stencil-only: take format and MOCS from the stencil resource,
          * since the depth branch above did not set them.
          */
         if (!zres) {
            view.format = stencil_res->surf.format;
            info.mocs = iris_mocs(stencil_res->bo, isl_dev, view.usage);
         }
      }
   }

   /* Pack the depth/stencil/HiZ state into cso_z->packets. */
   isl_emit_depth_stencil_hiz_s(isl_dev, cso_z->packets, &info);

   /* Make a null surface for unbound buffers */
   void *null_surf_map =
      upload_state(ice->state.surface_uploader, &ice->state.null_fb,
                   4 * GENX(RENDER_SURFACE_STATE_length), 64);
   /* Width, height and layer count are clamped to at least 1. */
   isl_null_fill_state(&screen->isl_dev, null_surf_map,
                       .size = isl_extent3d(MAX2(cso->width, 1),
                                            MAX2(cso->height, 1),
                                            cso->layers ? cso->layers : 1));
   ice->state.null_fb.offset +=
      iris_bo_offset_from_base_address(iris_resource_bo(ice->state.null_fb.res));

   /* Render target change */
   ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_FS;

   ice->state.dirty |= IRIS_DIRTY_RENDER_BUFFER;

   ice->state.dirty |= IRIS_DIRTY_RENDER_RESOLVES_AND_FLUSHES;

   /* Also dirty whichever stages registered a dependency on
    * framebuffer state.
    */
   ice->state.stage_dirty |=
      ice->state.stage_dirty_for_nos[IRIS_NOS_FRAMEBUFFER];

   if (GFX_VER == 8)
      ice->state.dirty |= IRIS_DIRTY_PMA_FIX;
}
3197
3198/**
3199 * The pipe->set_constant_buffer() driver hook.
3200 *
3201 * This uploads any constant data in user buffers, and references
3202 * any UBO resources containing constant data.
3203 */
3204static void
3205iris_set_constant_buffer(struct pipe_context *ctx,
3206                         enum pipe_shader_type p_stage, unsigned index,
3207                         bool take_ownership,
3208                         const struct pipe_constant_buffer *input)
3209{
3210   struct iris_context *ice = (struct iris_context *) ctx;
3211   gl_shader_stage stage = stage_from_pipe(p_stage);
3212   struct iris_shader_state *shs = &ice->state.shaders[stage];
3213   struct pipe_shader_buffer *cbuf = &shs->constbuf[index];
3214
3215   /* TODO: Only do this if the buffer changes? */
3216   pipe_resource_reference(&shs->constbuf_surf_state[index].res, NULL);
3217
3218   if (input && input->buffer_size && (input->buffer || input->user_buffer)) {
3219      shs->bound_cbufs |= 1u << index;
3220
3221      if (input->user_buffer) {
3222         void *map = NULL;
3223         pipe_resource_reference(&cbuf->buffer, NULL);
3224         u_upload_alloc(ice->ctx.const_uploader, 0, input->buffer_size, 64,
3225                        &cbuf->buffer_offset, &cbuf->buffer, (void **) &map);
3226
3227         if (!cbuf->buffer) {
3228            /* Allocation was unsuccessful - just unbind */
3229            iris_set_constant_buffer(ctx, p_stage, index, false, NULL);
3230            return;
3231         }
3232
3233         assert(map);
3234         memcpy(map, input->user_buffer, input->buffer_size);
3235      } else if (input->buffer) {
3236         if (cbuf->buffer != input->buffer) {
3237            ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
3238                                 IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
3239            shs->dirty_cbufs |= 1u << index;
3240         }
3241
3242         if (take_ownership) {
3243            pipe_resource_reference(&cbuf->buffer, NULL);
3244            cbuf->buffer = input->buffer;
3245         } else {
3246            pipe_resource_reference(&cbuf->buffer, input->buffer);
3247         }
3248
3249         cbuf->buffer_offset = input->buffer_offset;
3250      }
3251
3252      cbuf->buffer_size =
3253         MIN2(input->buffer_size,
3254              iris_resource_bo(cbuf->buffer)->size - cbuf->buffer_offset);
3255
3256      struct iris_resource *res = (void *) cbuf->buffer;
3257      res->bind_history |= PIPE_BIND_CONSTANT_BUFFER;
3258      res->bind_stages |= 1 << stage;
3259   } else {
3260      shs->bound_cbufs &= ~(1u << index);
3261      pipe_resource_reference(&cbuf->buffer, NULL);
3262   }
3263
3264   ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << stage;
3265}
3266
3267static void
3268upload_sysvals(struct iris_context *ice,
3269               gl_shader_stage stage,
3270               const struct pipe_grid_info *grid)
3271{
3272   UNUSED struct iris_genx_state *genx = ice->state.genx;
3273   struct iris_shader_state *shs = &ice->state.shaders[stage];
3274
3275   struct iris_compiled_shader *shader = ice->shaders.prog[stage];
3276   if (!shader || (shader->num_system_values == 0 &&
3277                   shader->kernel_input_size == 0))
3278      return;
3279
3280   assert(shader->num_cbufs > 0);
3281
3282   unsigned sysval_cbuf_index = shader->num_cbufs - 1;
3283   struct pipe_shader_buffer *cbuf = &shs->constbuf[sysval_cbuf_index];
3284   unsigned system_values_start =
3285      ALIGN(shader->kernel_input_size, sizeof(uint32_t));
3286   unsigned upload_size = system_values_start +
3287                          shader->num_system_values * sizeof(uint32_t);
3288   void *map = NULL;
3289
3290   assert(sysval_cbuf_index < PIPE_MAX_CONSTANT_BUFFERS);
3291   u_upload_alloc(ice->ctx.const_uploader, 0, upload_size, 64,
3292                  &cbuf->buffer_offset, &cbuf->buffer, &map);
3293
3294   if (shader->kernel_input_size > 0)
3295      memcpy(map, grid->input, shader->kernel_input_size);
3296
3297   uint32_t *sysval_map = map + system_values_start;
3298   for (int i = 0; i < shader->num_system_values; i++) {
3299      uint32_t sysval = shader->system_values[i];
3300      uint32_t value = 0;
3301
3302      if (BRW_PARAM_DOMAIN(sysval) == BRW_PARAM_DOMAIN_IMAGE) {
3303#if GFX_VER == 8
3304         unsigned img = BRW_PARAM_IMAGE_IDX(sysval);
3305         unsigned offset = BRW_PARAM_IMAGE_OFFSET(sysval);
3306         struct brw_image_param *param =
3307            &genx->shaders[stage].image_param[img];
3308
3309         assert(offset < sizeof(struct brw_image_param));
3310         value = ((uint32_t *) param)[offset];
3311#endif
3312      } else if (sysval == BRW_PARAM_BUILTIN_ZERO) {
3313         value = 0;
3314      } else if (BRW_PARAM_BUILTIN_IS_CLIP_PLANE(sysval)) {
3315         int plane = BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(sysval);
3316         int comp  = BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(sysval);
3317         value = fui(ice->state.clip_planes.ucp[plane][comp]);
3318      } else if (sysval == BRW_PARAM_BUILTIN_PATCH_VERTICES_IN) {
3319         if (stage == MESA_SHADER_TESS_CTRL) {
3320            value = ice->state.vertices_per_patch;
3321         } else {
3322            assert(stage == MESA_SHADER_TESS_EVAL);
3323            const struct shader_info *tcs_info =
3324               iris_get_shader_info(ice, MESA_SHADER_TESS_CTRL);
3325            if (tcs_info)
3326               value = tcs_info->tess.tcs_vertices_out;
3327            else
3328               value = ice->state.vertices_per_patch;
3329         }
3330      } else if (sysval >= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X &&
3331                 sysval <= BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W) {
3332         unsigned i = sysval - BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X;
3333         value = fui(ice->state.default_outer_level[i]);
3334      } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X) {
3335         value = fui(ice->state.default_inner_level[0]);
3336      } else if (sysval == BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y) {
3337         value = fui(ice->state.default_inner_level[1]);
3338      } else if (sysval >= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X &&
3339                 sysval <= BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z) {
3340         unsigned i = sysval - BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X;
3341         value = ice->state.last_block[i];
3342      } else if (sysval == BRW_PARAM_BUILTIN_WORK_DIM) {
3343         value = grid->work_dim;
3344      } else {
3345         assert(!"unhandled system value");
3346      }
3347
3348      *sysval_map++ = value;
3349   }
3350
3351   cbuf->buffer_size = upload_size;
3352   iris_upload_ubo_ssbo_surf_state(ice, cbuf,
3353                                   &shs->constbuf_surf_state[sysval_cbuf_index],
3354                                   ISL_SURF_USAGE_CONSTANT_BUFFER_BIT);
3355
3356   shs->sysvals_need_upload = false;
3357}
3358
3359/**
3360 * The pipe->set_shader_buffers() driver hook.
3361 *
3362 * This binds SSBOs and ABOs.  Unfortunately, we need to stream out
3363 * SURFACE_STATE here, as the buffer offset may change each time.
3364 */
3365static void
3366iris_set_shader_buffers(struct pipe_context *ctx,
3367                        enum pipe_shader_type p_stage,
3368                        unsigned start_slot, unsigned count,
3369                        const struct pipe_shader_buffer *buffers,
3370                        unsigned writable_bitmask)
3371{
3372   struct iris_context *ice = (struct iris_context *) ctx;
3373   gl_shader_stage stage = stage_from_pipe(p_stage);
3374   struct iris_shader_state *shs = &ice->state.shaders[stage];
3375
3376   unsigned modified_bits = u_bit_consecutive(start_slot, count);
3377
3378   shs->bound_ssbos &= ~modified_bits;
3379   shs->writable_ssbos &= ~modified_bits;
3380   shs->writable_ssbos |= writable_bitmask << start_slot;
3381
3382   for (unsigned i = 0; i < count; i++) {
3383      if (buffers && buffers[i].buffer) {
3384         struct iris_resource *res = (void *) buffers[i].buffer;
3385         struct pipe_shader_buffer *ssbo = &shs->ssbo[start_slot + i];
3386         struct iris_state_ref *surf_state =
3387            &shs->ssbo_surf_state[start_slot + i];
3388         pipe_resource_reference(&ssbo->buffer, &res->base.b);
3389         ssbo->buffer_offset = buffers[i].buffer_offset;
3390         ssbo->buffer_size =
3391            MIN2(buffers[i].buffer_size, res->bo->size - ssbo->buffer_offset);
3392
3393         shs->bound_ssbos |= 1 << (start_slot + i);
3394
3395         isl_surf_usage_flags_t usage = ISL_SURF_USAGE_STORAGE_BIT;
3396
3397         iris_upload_ubo_ssbo_surf_state(ice, ssbo, surf_state, usage);
3398
3399         res->bind_history |= PIPE_BIND_SHADER_BUFFER;
3400         res->bind_stages |= 1 << stage;
3401
3402         util_range_add(&res->base.b, &res->valid_buffer_range, ssbo->buffer_offset,
3403                        ssbo->buffer_offset + ssbo->buffer_size);
3404      } else {
3405         pipe_resource_reference(&shs->ssbo[start_slot + i].buffer, NULL);
3406         pipe_resource_reference(&shs->ssbo_surf_state[start_slot + i].res,
3407                                 NULL);
3408      }
3409   }
3410
3411   ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
3412                        IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
3413   ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << stage;
3414}
3415
/**
 * Release a state object by handing it to free().
 *
 * The context argument is not consulted.
 */
static void
iris_delete_state(struct pipe_context *ctx, void *state)
{
   (void) ctx;

   free(state);
}
3421
3422/**
3423 * The pipe->set_vertex_buffers() driver hook.
3424 *
3425 * This translates pipe_vertex_buffer to our 3DSTATE_VERTEX_BUFFERS packet.
3426 */
3427static void
3428iris_set_vertex_buffers(struct pipe_context *ctx,
3429                        unsigned start_slot, unsigned count,
3430                        unsigned unbind_num_trailing_slots,
3431                        bool take_ownership,
3432                        const struct pipe_vertex_buffer *buffers)
3433{
3434   struct iris_context *ice = (struct iris_context *) ctx;
3435   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3436   struct iris_genx_state *genx = ice->state.genx;
3437
3438   ice->state.bound_vertex_buffers &=
3439      ~u_bit_consecutive64(start_slot, count + unbind_num_trailing_slots);
3440
3441   for (unsigned i = 0; i < count; i++) {
3442      const struct pipe_vertex_buffer *buffer = buffers ? &buffers[i] : NULL;
3443      struct iris_vertex_buffer_state *state =
3444         &genx->vertex_buffers[start_slot + i];
3445
3446      if (!buffer) {
3447         pipe_resource_reference(&state->resource, NULL);
3448         continue;
3449      }
3450
3451      /* We may see user buffers that are NULL bindings. */
3452      assert(!(buffer->is_user_buffer && buffer->buffer.user != NULL));
3453
3454      if (buffer->buffer.resource &&
3455          state->resource != buffer->buffer.resource)
3456         ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
3457
3458      if (take_ownership) {
3459         pipe_resource_reference(&state->resource, NULL);
3460         state->resource = buffer->buffer.resource;
3461      } else {
3462         pipe_resource_reference(&state->resource, buffer->buffer.resource);
3463      }
3464      struct iris_resource *res = (void *) state->resource;
3465
3466      state->offset = (int) buffer->buffer_offset;
3467
3468      if (res) {
3469         ice->state.bound_vertex_buffers |= 1ull << (start_slot + i);
3470         res->bind_history |= PIPE_BIND_VERTEX_BUFFER;
3471      }
3472
3473      iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
3474         vb.VertexBufferIndex = start_slot + i;
3475         vb.AddressModifyEnable = true;
3476         vb.BufferPitch = buffer->stride;
3477         if (res) {
3478            vb.BufferSize = res->base.b.width0 - (int) buffer->buffer_offset;
3479            vb.BufferStartingAddress =
3480               ro_bo(NULL, res->bo->address + (int) buffer->buffer_offset);
3481            vb.MOCS = iris_mocs(res->bo, &screen->isl_dev,
3482                                ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
3483#if GFX_VER >= 12
3484            vb.L3BypassDisable       = true;
3485#endif
3486         } else {
3487            vb.NullVertexBuffer = true;
3488         }
3489      }
3490   }
3491
3492   for (unsigned i = 0; i < unbind_num_trailing_slots; i++) {
3493      struct iris_vertex_buffer_state *state =
3494         &genx->vertex_buffers[start_slot + count + i];
3495
3496      pipe_resource_reference(&state->resource, NULL);
3497   }
3498
3499   ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS;
3500}
3501
3502/**
3503 * Gallium CSO for vertex elements.
3504 */
3505struct iris_vertex_element_state {
3506   uint32_t vertex_elements[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
3507   uint32_t vf_instancing[33 * GENX(3DSTATE_VF_INSTANCING_length)];
3508   uint32_t edgeflag_ve[GENX(VERTEX_ELEMENT_STATE_length)];
3509   uint32_t edgeflag_vfi[GENX(3DSTATE_VF_INSTANCING_length)];
3510   unsigned count;
3511};
3512
3513/**
3514 * The pipe->create_vertex_elements() driver hook.
3515 *
3516 * This translates pipe_vertex_element to our 3DSTATE_VERTEX_ELEMENTS
3517 * and 3DSTATE_VF_INSTANCING commands. The vertex_elements and vf_instancing
3518 * arrays are ready to be emitted at draw time if no EdgeFlag or SGVs are
3519 * needed. In these cases we will need information available at draw time.
3520 * We setup edgeflag_ve and edgeflag_vfi as alternatives last
3521 * 3DSTATE_VERTEX_ELEMENT and 3DSTATE_VF_INSTANCING that can be used at
3522 * draw time if we detect that EdgeFlag is needed by the Vertex Shader.
3523 */
3524static void *
3525iris_create_vertex_elements(struct pipe_context *ctx,
3526                            unsigned count,
3527                            const struct pipe_vertex_element *state)
3528{
3529   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3530   const struct intel_device_info *devinfo = &screen->devinfo;
3531   struct iris_vertex_element_state *cso =
3532      malloc(sizeof(struct iris_vertex_element_state));
3533
3534   cso->count = count;
3535
3536   iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS), cso->vertex_elements, ve) {
3537      ve.DWordLength =
3538         1 + GENX(VERTEX_ELEMENT_STATE_length) * MAX2(count, 1) - 2;
3539   }
3540
3541   uint32_t *ve_pack_dest = &cso->vertex_elements[1];
3542   uint32_t *vfi_pack_dest = cso->vf_instancing;
3543
3544   if (count == 0) {
3545      iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3546         ve.Valid = true;
3547         ve.SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT;
3548         ve.Component0Control = VFCOMP_STORE_0;
3549         ve.Component1Control = VFCOMP_STORE_0;
3550         ve.Component2Control = VFCOMP_STORE_0;
3551         ve.Component3Control = VFCOMP_STORE_1_FP;
3552      }
3553
3554      iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3555      }
3556   }
3557
3558   for (int i = 0; i < count; i++) {
3559      const struct iris_format_info fmt =
3560         iris_format_for_usage(devinfo, state[i].src_format, 0);
3561      unsigned comp[4] = { VFCOMP_STORE_SRC, VFCOMP_STORE_SRC,
3562                           VFCOMP_STORE_SRC, VFCOMP_STORE_SRC };
3563
3564      switch (isl_format_get_num_channels(fmt.fmt)) {
3565      case 0: comp[0] = VFCOMP_STORE_0; FALLTHROUGH;
3566      case 1: comp[1] = VFCOMP_STORE_0; FALLTHROUGH;
3567      case 2: comp[2] = VFCOMP_STORE_0; FALLTHROUGH;
3568      case 3:
3569         comp[3] = isl_format_has_int_channel(fmt.fmt) ? VFCOMP_STORE_1_INT
3570                                                       : VFCOMP_STORE_1_FP;
3571         break;
3572      }
3573      iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
3574         ve.EdgeFlagEnable = false;
3575         ve.VertexBufferIndex = state[i].vertex_buffer_index;
3576         ve.Valid = true;
3577         ve.SourceElementOffset = state[i].src_offset;
3578         ve.SourceElementFormat = fmt.fmt;
3579         ve.Component0Control = comp[0];
3580         ve.Component1Control = comp[1];
3581         ve.Component2Control = comp[2];
3582         ve.Component3Control = comp[3];
3583      }
3584
3585      iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
3586         vi.VertexElementIndex = i;
3587         vi.InstancingEnable = state[i].instance_divisor > 0;
3588         vi.InstanceDataStepRate = state[i].instance_divisor;
3589      }
3590
3591      ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
3592      vfi_pack_dest += GENX(3DSTATE_VF_INSTANCING_length);
3593   }
3594
3595   /* An alternative version of the last VE and VFI is stored so it
3596    * can be used at draw time in case Vertex Shader uses EdgeFlag
3597    */
3598   if (count) {
3599      const unsigned edgeflag_index = count - 1;
3600      const struct iris_format_info fmt =
3601         iris_format_for_usage(devinfo, state[edgeflag_index].src_format, 0);
3602      iris_pack_state(GENX(VERTEX_ELEMENT_STATE), cso->edgeflag_ve, ve) {
3603         ve.EdgeFlagEnable = true ;
3604         ve.VertexBufferIndex = state[edgeflag_index].vertex_buffer_index;
3605         ve.Valid = true;
3606         ve.SourceElementOffset = state[edgeflag_index].src_offset;
3607         ve.SourceElementFormat = fmt.fmt;
3608         ve.Component0Control = VFCOMP_STORE_SRC;
3609         ve.Component1Control = VFCOMP_STORE_0;
3610         ve.Component2Control = VFCOMP_STORE_0;
3611         ve.Component3Control = VFCOMP_STORE_0;
3612      }
3613      iris_pack_command(GENX(3DSTATE_VF_INSTANCING), cso->edgeflag_vfi, vi) {
3614         /* The vi.VertexElementIndex of the EdgeFlag Vertex Element is filled
3615          * at draw time, as it should change if SGVs are emitted.
3616          */
3617         vi.InstancingEnable = state[edgeflag_index].instance_divisor > 0;
3618         vi.InstanceDataStepRate = state[edgeflag_index].instance_divisor;
3619      }
3620   }
3621
3622   return cso;
3623}
3624
3625/**
3626 * The pipe->bind_vertex_elements_state() driver hook.
3627 */
3628static void
3629iris_bind_vertex_elements_state(struct pipe_context *ctx, void *state)
3630{
3631   struct iris_context *ice = (struct iris_context *) ctx;
3632   struct iris_vertex_element_state *old_cso = ice->state.cso_vertex_elements;
3633   struct iris_vertex_element_state *new_cso = state;
3634
3635   /* 3DSTATE_VF_SGVs overrides the last VE, so if the count is changing,
3636    * we need to re-emit it to ensure we're overriding the right one.
3637    */
3638   if (new_cso && cso_changed(count))
3639      ice->state.dirty |= IRIS_DIRTY_VF_SGVS;
3640
3641   ice->state.cso_vertex_elements = state;
3642   ice->state.dirty |= IRIS_DIRTY_VERTEX_ELEMENTS;
3643}
3644
3645/**
3646 * The pipe->create_stream_output_target() driver hook.
3647 *
3648 * "Target" here refers to a destination buffer.  We translate this into
3649 * a 3DSTATE_SO_BUFFER packet.  We can handle most fields, but don't yet
3650 * know which buffer this represents, or whether we ought to zero the
3651 * write-offsets, or append.  Those are handled in the set() hook.
3652 */
3653static struct pipe_stream_output_target *
3654iris_create_stream_output_target(struct pipe_context *ctx,
3655                                 struct pipe_resource *p_res,
3656                                 unsigned buffer_offset,
3657                                 unsigned buffer_size)
3658{
3659   struct iris_resource *res = (void *) p_res;
3660   struct iris_stream_output_target *cso = calloc(1, sizeof(*cso));
3661   if (!cso)
3662      return NULL;
3663
3664   res->bind_history |= PIPE_BIND_STREAM_OUTPUT;
3665
3666   pipe_reference_init(&cso->base.reference, 1);
3667   pipe_resource_reference(&cso->base.buffer, p_res);
3668   cso->base.buffer_offset = buffer_offset;
3669   cso->base.buffer_size = buffer_size;
3670   cso->base.context = ctx;
3671
3672   util_range_add(&res->base.b, &res->valid_buffer_range, buffer_offset,
3673                  buffer_offset + buffer_size);
3674
3675   return &cso->base;
3676}
3677
3678static void
3679iris_stream_output_target_destroy(struct pipe_context *ctx,
3680                                  struct pipe_stream_output_target *state)
3681{
3682   struct iris_stream_output_target *cso = (void *) state;
3683
3684   pipe_resource_reference(&cso->base.buffer, NULL);
3685   pipe_resource_reference(&cso->offset.res, NULL);
3686
3687   free(cso);
3688}
3689
3690/**
3691 * The pipe->set_stream_output_targets() driver hook.
3692 *
3693 * At this point, we know which targets are bound to a particular index,
3694 * and also whether we want to append or start over.  We can finish the
3695 * 3DSTATE_SO_BUFFER packets we started earlier.
3696 */
3697static void
3698iris_set_stream_output_targets(struct pipe_context *ctx,
3699                               unsigned num_targets,
3700                               struct pipe_stream_output_target **targets,
3701                               const unsigned *offsets)
3702{
3703   struct iris_context *ice = (struct iris_context *) ctx;
3704   struct iris_genx_state *genx = ice->state.genx;
3705   uint32_t *so_buffers = genx->so_buffers;
3706   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
3707
3708   const bool active = num_targets > 0;
3709   if (ice->state.streamout_active != active) {
3710      ice->state.streamout_active = active;
3711      ice->state.dirty |= IRIS_DIRTY_STREAMOUT;
3712
3713      /* We only emit 3DSTATE_SO_DECL_LIST when streamout is active, because
3714       * it's a non-pipelined command.  If we're switching streamout on, we
3715       * may have missed emitting it earlier, so do so now.  (We're already
3716       * taking a stall to update 3DSTATE_SO_BUFFERS anyway...)
3717       */
3718      if (active) {
3719         ice->state.dirty |= IRIS_DIRTY_SO_DECL_LIST;
3720      } else {
3721         uint32_t flush = 0;
3722         for (int i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
3723            struct iris_stream_output_target *tgt =
3724               (void *) ice->state.so_target[i];
3725            if (tgt) {
3726               struct iris_resource *res = (void *) tgt->base.buffer;
3727
3728               flush |= iris_flush_bits_for_history(ice, res);
3729               iris_dirty_for_history(ice, res);
3730            }
3731         }
3732#if GFX_VER >= 12
3733         /* SO draws require flushing of const cache to make SO data
3734          * observable when VB/IB are cached in L3.
3735          */
3736         if (flush & PIPE_CONTROL_VF_CACHE_INVALIDATE)
3737            flush |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3738#endif
3739         iris_emit_pipe_control_flush(&ice->batches[IRIS_BATCH_RENDER],
3740                                      "make streamout results visible", flush);
3741      }
3742   }
3743
3744   for (int i = 0; i < 4; i++) {
3745      pipe_so_target_reference(&ice->state.so_target[i],
3746                               i < num_targets ? targets[i] : NULL);
3747   }
3748
3749   /* No need to update 3DSTATE_SO_BUFFER unless SOL is active. */
3750   if (!active)
3751      return;
3752
3753   for (unsigned i = 0; i < 4; i++,
3754        so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
3755
3756      struct iris_stream_output_target *tgt = (void *) ice->state.so_target[i];
3757      unsigned offset = offsets[i];
3758
3759      if (!tgt) {
3760         iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
3761#if GFX_VER < 12
3762            sob.SOBufferIndex = i;
3763#else
3764            sob._3DCommandOpcode = 0;
3765            sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
3766#endif
3767         }
3768         continue;
3769      }
3770
3771      if (!tgt->offset.res)
3772         upload_state(ctx->const_uploader, &tgt->offset, sizeof(uint32_t), 4);
3773
3774      struct iris_resource *res = (void *) tgt->base.buffer;
3775
3776      /* Note that offsets[i] will either be 0, causing us to zero
3777       * the value in the buffer, or 0xFFFFFFFF, which happens to mean
3778       * "continue appending at the existing offset."
3779       */
3780      assert(offset == 0 || offset == 0xFFFFFFFF);
3781
3782      /* When we're first called with an offset of 0, we want the next
3783       * 3DSTATE_SO_BUFFER packets to reset the offset to the beginning.
3784       * Any further times we emit those packets, we want to use 0xFFFFFFFF
3785       * to continue appending from the current offset.
3786       *
3787       * Note that we might be called by Begin (offset = 0), Pause, then
3788       * Resume (offset = 0xFFFFFFFF) before ever drawing (where these
3789       * commands will actually be sent to the GPU).  In this case, we
3790       * don't want to append - we still want to do our initial zeroing.
3791       */
3792      if (offset == 0)
3793         tgt->zero_offset = true;
3794
3795      iris_pack_command(GENX(3DSTATE_SO_BUFFER), so_buffers, sob) {
3796#if GFX_VER < 12
3797         sob.SOBufferIndex = i;
3798#else
3799         sob._3DCommandOpcode = 0;
3800         sob._3DCommandSubOpcode = SO_BUFFER_INDEX_0_CMD + i;
3801#endif
3802         sob.SurfaceBaseAddress =
3803            rw_bo(NULL, res->bo->address + tgt->base.buffer_offset,
3804                  IRIS_DOMAIN_OTHER_WRITE);
3805         sob.SOBufferEnable = true;
3806         sob.StreamOffsetWriteEnable = true;
3807         sob.StreamOutputBufferOffsetAddressEnable = true;
3808         sob.MOCS = iris_mocs(res->bo, &screen->isl_dev, 0);
3809
3810         sob.SurfaceSize = MAX2(tgt->base.buffer_size / 4, 1) - 1;
3811         sob.StreamOutputBufferOffsetAddress =
3812            rw_bo(NULL, iris_resource_bo(tgt->offset.res)->address +
3813                        tgt->offset.offset, IRIS_DOMAIN_OTHER_WRITE);
3814         sob.StreamOffset = 0xFFFFFFFF; /* not offset, see above */
3815      }
3816   }
3817
3818   ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
3819}
3820
3821/**
3822 * An iris-vtable helper for encoding the 3DSTATE_SO_DECL_LIST and
3823 * 3DSTATE_STREAMOUT packets.
3824 *
3825 * 3DSTATE_SO_DECL_LIST is a list of shader outputs we want the streamout
3826 * hardware to record.  We can create it entirely based on the shader, with
3827 * no dynamic state dependencies.
3828 *
3829 * 3DSTATE_STREAMOUT is an annoying mix of shader-based information and
3830 * state-based settings.  We capture the shader-related ones here, and merge
3831 * the rest in at draw time.
3832 */
3833static uint32_t *
3834iris_create_so_decl_list(const struct pipe_stream_output_info *info,
3835                         const struct brw_vue_map *vue_map)
3836{
3837   struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
3838   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3839   int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3840   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3841   int max_decls = 0;
3842   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
3843
3844   memset(so_decl, 0, sizeof(so_decl));
3845
3846   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
3847    * command feels strange -- each dword pair contains a SO_DECL per stream.
3848    */
3849   for (unsigned i = 0; i < info->num_outputs; i++) {
3850      const struct pipe_stream_output *output = &info->output[i];
3851      const int buffer = output->output_buffer;
3852      const int varying = output->register_index;
3853      const unsigned stream_id = output->stream;
3854      assert(stream_id < MAX_VERTEX_STREAMS);
3855
3856      buffer_mask[stream_id] |= 1 << buffer;
3857
3858      assert(vue_map->varying_to_slot[varying] >= 0);
3859
3860      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
3861       * array.  Instead, it simply increments DstOffset for the following
3862       * input by the number of components that should be skipped.
3863       *
3864       * Our hardware is unusual in that it requires us to program SO_DECLs
3865       * for fake "hole" components, rather than simply taking the offset
3866       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
3867       * program as many size = 4 holes as we can, then a final hole to
3868       * accommodate the final 1, 2, or 3 remaining.
3869       */
3870      int skip_components = output->dst_offset - next_offset[buffer];
3871
3872      while (skip_components > 0) {
3873         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3874            .HoleFlag = 1,
3875            .OutputBufferSlot = output->output_buffer,
3876            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
3877         };
3878         skip_components -= 4;
3879      }
3880
3881      next_offset[buffer] = output->dst_offset + output->num_components;
3882
3883      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3884         .OutputBufferSlot = output->output_buffer,
3885         .RegisterIndex = vue_map->varying_to_slot[varying],
3886         .ComponentMask =
3887            ((1 << output->num_components) - 1) << output->start_component,
3888      };
3889
3890      if (decls[stream_id] > max_decls)
3891         max_decls = decls[stream_id];
3892   }
3893
3894   unsigned dwords = GENX(3DSTATE_STREAMOUT_length) + (3 + 2 * max_decls);
3895   uint32_t *map = ralloc_size(NULL, sizeof(uint32_t) * dwords);
3896   uint32_t *so_decl_map = map + GENX(3DSTATE_STREAMOUT_length);
3897
3898   iris_pack_command(GENX(3DSTATE_STREAMOUT), map, sol) {
3899      int urb_entry_read_offset = 0;
3900      int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
3901         urb_entry_read_offset;
3902
3903      /* We always read the whole vertex.  This could be reduced at some
3904       * point by reading less and offsetting the register index in the
3905       * SO_DECLs.
3906       */
3907      sol.Stream0VertexReadOffset = urb_entry_read_offset;
3908      sol.Stream0VertexReadLength = urb_entry_read_length - 1;
3909      sol.Stream1VertexReadOffset = urb_entry_read_offset;
3910      sol.Stream1VertexReadLength = urb_entry_read_length - 1;
3911      sol.Stream2VertexReadOffset = urb_entry_read_offset;
3912      sol.Stream2VertexReadLength = urb_entry_read_length - 1;
3913      sol.Stream3VertexReadOffset = urb_entry_read_offset;
3914      sol.Stream3VertexReadLength = urb_entry_read_length - 1;
3915
3916      /* Set buffer pitches; 0 means unbound. */
3917      sol.Buffer0SurfacePitch = 4 * info->stride[0];
3918      sol.Buffer1SurfacePitch = 4 * info->stride[1];
3919      sol.Buffer2SurfacePitch = 4 * info->stride[2];
3920      sol.Buffer3SurfacePitch = 4 * info->stride[3];
3921   }
3922
3923   iris_pack_command(GENX(3DSTATE_SO_DECL_LIST), so_decl_map, list) {
3924      list.DWordLength = 3 + 2 * max_decls - 2;
3925      list.StreamtoBufferSelects0 = buffer_mask[0];
3926      list.StreamtoBufferSelects1 = buffer_mask[1];
3927      list.StreamtoBufferSelects2 = buffer_mask[2];
3928      list.StreamtoBufferSelects3 = buffer_mask[3];
3929      list.NumEntries0 = decls[0];
3930      list.NumEntries1 = decls[1];
3931      list.NumEntries2 = decls[2];
3932      list.NumEntries3 = decls[3];
3933   }
3934
3935   for (int i = 0; i < max_decls; i++) {
3936      iris_pack_state(GENX(SO_DECL_ENTRY), so_decl_map + 3 + i * 2, entry) {
3937         entry.Stream0Decl = so_decl[0][i];
3938         entry.Stream1Decl = so_decl[1][i];
3939         entry.Stream2Decl = so_decl[2][i];
3940         entry.Stream3Decl = so_decl[3][i];
3941      }
3942   }
3943
3944   return map;
3945}
3946
3947static void
3948iris_compute_sbe_urb_read_interval(uint64_t fs_input_slots,
3949                                   const struct brw_vue_map *last_vue_map,
3950                                   bool two_sided_color,
3951                                   unsigned *out_offset,
3952                                   unsigned *out_length)
3953{
3954   /* The compiler computes the first URB slot without considering COL/BFC
3955    * swizzling (because it doesn't know whether it's enabled), so we need
3956    * to do that here too.  This may result in a smaller offset, which
3957    * should be safe.
3958    */
3959   const unsigned first_slot =
3960      brw_compute_first_urb_slot_required(fs_input_slots, last_vue_map);
3961
3962   /* This becomes the URB read offset (counted in pairs of slots). */
3963   assert(first_slot % 2 == 0);
3964   *out_offset = first_slot / 2;
3965
3966   /* We need to adjust the inputs read to account for front/back color
3967    * swizzling, as it can make the URB length longer.
3968    */
3969   for (int c = 0; c <= 1; c++) {
3970      if (fs_input_slots & (VARYING_BIT_COL0 << c)) {
3971         /* If two sided color is enabled, the fragment shader's gl_Color
3972          * (COL0) input comes from either the gl_FrontColor (COL0) or
3973          * gl_BackColor (BFC0) input varyings.  Mark BFC as used, too.
3974          */
3975         if (two_sided_color)
3976            fs_input_slots |= (VARYING_BIT_BFC0 << c);
3977
3978         /* If front color isn't written, we opt to give them back color
3979          * instead of an undefined value.  Switch from COL to BFC.
3980          */
3981         if (last_vue_map->varying_to_slot[VARYING_SLOT_COL0 + c] == -1) {
3982            fs_input_slots &= ~(VARYING_BIT_COL0 << c);
3983            fs_input_slots |= (VARYING_BIT_BFC0 << c);
3984         }
3985      }
3986   }
3987
3988   /* Compute the minimum URB Read Length necessary for the FS inputs.
3989    *
3990    * From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
3991    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
3992    *
3993    * "This field should be set to the minimum length required to read the
3994    *  maximum source attribute.  The maximum source attribute is indicated
3995    *  by the maximum value of the enabled Attribute # Source Attribute if
3996    *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
3997    *  enable is not set.
3998    *  read_length = ceiling((max_source_attr + 1) / 2)
3999    *
4000    *  [errata] Corruption/Hang possible if length programmed larger than
4001    *  recommended"
4002    *
4003    * Similar text exists for Ivy Bridge.
4004    *
4005    * We find the last URB slot that's actually read by the FS.
4006    */
4007   unsigned last_read_slot = last_vue_map->num_slots - 1;
4008   while (last_read_slot > first_slot && !(fs_input_slots &
4009          (1ull << last_vue_map->slot_to_varying[last_read_slot])))
4010      --last_read_slot;
4011
4012   /* The URB read length is the difference of the two, counted in pairs. */
4013   *out_length = DIV_ROUND_UP(last_read_slot - first_slot + 1, 2);
4014}
4015
4016static void
4017iris_emit_sbe_swiz(struct iris_batch *batch,
4018                   const struct iris_context *ice,
4019                   const struct brw_vue_map *vue_map,
4020                   unsigned urb_read_offset,
4021                   unsigned sprite_coord_enables)
4022{
4023   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = {};
4024   const struct brw_wm_prog_data *wm_prog_data = (void *)
4025      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
4026   const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4027
4028   /* XXX: this should be generated when putting programs in place */
4029
4030   for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
4031      const uint8_t fs_attr = wm_prog_data->urb_setup_attribs[idx];
4032      const int input_index = wm_prog_data->urb_setup[fs_attr];
4033      if (input_index < 0 || input_index >= 16)
4034         continue;
4035
4036      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr =
4037         &attr_overrides[input_index];
4038      int slot = vue_map->varying_to_slot[fs_attr];
4039
4040      /* Viewport and Layer are stored in the VUE header.  We need to override
4041       * them to zero if earlier stages didn't write them, as GL requires that
4042       * they read back as zero when not explicitly set.
4043       */
4044      switch (fs_attr) {
4045      case VARYING_SLOT_VIEWPORT:
4046      case VARYING_SLOT_LAYER:
4047         attr->ComponentOverrideX = true;
4048         attr->ComponentOverrideW = true;
4049         attr->ConstantSource = CONST_0000;
4050
4051         if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
4052            attr->ComponentOverrideY = true;
4053         if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
4054            attr->ComponentOverrideZ = true;
4055         continue;
4056
4057      case VARYING_SLOT_PRIMITIVE_ID:
4058         /* Override if the previous shader stage didn't write gl_PrimitiveID. */
4059         if (slot == -1) {
4060            attr->ComponentOverrideX = true;
4061            attr->ComponentOverrideY = true;
4062            attr->ComponentOverrideZ = true;
4063            attr->ComponentOverrideW = true;
4064            attr->ConstantSource = PRIM_ID;
4065            continue;
4066         }
4067         break;
4068
4069      default:
4070         break;
4071      }
4072
4073      if (sprite_coord_enables & (1 << input_index))
4074         continue;
4075
4076      /* If there was only a back color written but not front, use back
4077       * as the color instead of undefined.
4078       */
4079      if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
4080         slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
4081      if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
4082         slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
4083
4084      /* Not written by the previous stage - undefined. */
4085      if (slot == -1) {
4086         attr->ComponentOverrideX = true;
4087         attr->ComponentOverrideY = true;
4088         attr->ComponentOverrideZ = true;
4089         attr->ComponentOverrideW = true;
4090         attr->ConstantSource = CONST_0001_FLOAT;
4091         continue;
4092      }
4093
4094      /* Compute the location of the attribute relative to the read offset,
4095       * which is counted in 256-bit increments (two 128-bit VUE slots).
4096       */
4097      const int source_attr = slot - 2 * urb_read_offset;
4098      assert(source_attr >= 0 && source_attr <= 32);
4099      attr->SourceAttribute = source_attr;
4100
4101      /* If we are doing two-sided color, and the VUE slot following this one
4102       * represents a back-facing color, then we need to instruct the SF unit
4103       * to do back-facing swizzling.
4104       */
4105      if (cso_rast->light_twoside &&
4106          ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
4107            vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
4108           (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
4109            vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1)))
4110         attr->SwizzleSelect = INPUTATTR_FACING;
4111   }
4112
4113   iris_emit_cmd(batch, GENX(3DSTATE_SBE_SWIZ), sbes) {
4114      for (int i = 0; i < 16; i++)
4115         sbes.Attribute[i] = attr_overrides[i];
4116   }
4117}
4118
4119static bool
4120iris_is_drawing_points(const struct iris_context *ice)
4121{
4122   const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4123
4124   if (cso_rast->fill_mode_point) {
4125      return true;
4126   }
4127
4128   if (ice->shaders.prog[MESA_SHADER_GEOMETRY]) {
4129      const struct brw_gs_prog_data *gs_prog_data =
4130         (void *) ice->shaders.prog[MESA_SHADER_GEOMETRY]->prog_data;
4131      return gs_prog_data->output_topology == _3DPRIM_POINTLIST;
4132   } else if (ice->shaders.prog[MESA_SHADER_TESS_EVAL]) {
4133      const struct brw_tes_prog_data *tes_data =
4134         (void *) ice->shaders.prog[MESA_SHADER_TESS_EVAL]->prog_data;
4135      return tes_data->output_topology == BRW_TESS_OUTPUT_TOPOLOGY_POINT;
4136   } else {
4137      return ice->state.prim_mode == PIPE_PRIM_POINTS;
4138   }
4139}
4140
4141static unsigned
4142iris_calculate_point_sprite_overrides(const struct brw_wm_prog_data *prog_data,
4143                                      const struct iris_rasterizer_state *cso)
4144{
4145   unsigned overrides = 0;
4146
4147   if (prog_data->urb_setup[VARYING_SLOT_PNTC] != -1)
4148      overrides |= 1 << prog_data->urb_setup[VARYING_SLOT_PNTC];
4149
4150   for (int i = 0; i < 8; i++) {
4151      if ((cso->sprite_coord_enable & (1 << i)) &&
4152          prog_data->urb_setup[VARYING_SLOT_TEX0 + i] != -1)
4153         overrides |= 1 << prog_data->urb_setup[VARYING_SLOT_TEX0 + i];
4154   }
4155
4156   return overrides;
4157}
4158
4159static void
4160iris_emit_sbe(struct iris_batch *batch, const struct iris_context *ice)
4161{
4162   const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4163   const struct brw_wm_prog_data *wm_prog_data = (void *)
4164      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
4165   const struct shader_info *fs_info =
4166      iris_get_shader_info(ice, MESA_SHADER_FRAGMENT);
4167   const struct brw_vue_map *last_vue_map =
4168      &brw_vue_prog_data(ice->shaders.last_vue_shader->prog_data)->vue_map;
4169
4170   unsigned urb_read_offset, urb_read_length;
4171   iris_compute_sbe_urb_read_interval(fs_info->inputs_read,
4172                                      last_vue_map,
4173                                      cso_rast->light_twoside,
4174                                      &urb_read_offset, &urb_read_length);
4175
4176   unsigned sprite_coord_overrides =
4177      iris_is_drawing_points(ice) ?
4178      iris_calculate_point_sprite_overrides(wm_prog_data, cso_rast) : 0;
4179
4180   iris_emit_cmd(batch, GENX(3DSTATE_SBE), sbe) {
4181      sbe.AttributeSwizzleEnable = true;
4182      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
4183      sbe.PointSpriteTextureCoordinateOrigin = cso_rast->sprite_coord_mode;
4184      sbe.VertexURBEntryReadOffset = urb_read_offset;
4185      sbe.VertexURBEntryReadLength = urb_read_length;
4186      sbe.ForceVertexURBEntryReadOffset = true;
4187      sbe.ForceVertexURBEntryReadLength = true;
4188      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
4189      sbe.PointSpriteTextureCoordinateEnable = sprite_coord_overrides;
4190#if GFX_VER >= 9
4191      for (int i = 0; i < 32; i++) {
4192         sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
4193      }
4194#endif
4195   }
4196
4197   iris_emit_sbe_swiz(batch, ice, last_vue_map, urb_read_offset,
4198                      sprite_coord_overrides);
4199}
4200
4201/* ------------------------------------------------------------------- */
4202
4203/**
4204 * Populate VS program key fields based on the current state.
4205 */
4206static void
4207iris_populate_vs_key(const struct iris_context *ice,
4208                     const struct shader_info *info,
4209                     gl_shader_stage last_stage,
4210                     struct iris_vs_prog_key *key)
4211{
4212   const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4213
4214   if (info->clip_distance_array_size == 0 &&
4215       (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4216       last_stage == MESA_SHADER_VERTEX)
4217      key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4218}
4219
/**
 * Fill in the state-dependent fields of a tessellation control shader
 * program key.
 */
static void
iris_populate_tcs_key(const struct iris_context *ice,
                      struct iris_tcs_prog_key *key)
{
   /* Intentionally empty: no TCS key field is set from context state here. */
}
4228
4229/**
4230 * Populate TES program key fields based on the current state.
4231 */
4232static void
4233iris_populate_tes_key(const struct iris_context *ice,
4234                      const struct shader_info *info,
4235                      gl_shader_stage last_stage,
4236                      struct iris_tes_prog_key *key)
4237{
4238   const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4239
4240   if (info->clip_distance_array_size == 0 &&
4241       (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4242       last_stage == MESA_SHADER_TESS_EVAL)
4243      key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4244}
4245
4246/**
4247 * Populate GS program key fields based on the current state.
4248 */
4249static void
4250iris_populate_gs_key(const struct iris_context *ice,
4251                     const struct shader_info *info,
4252                     gl_shader_stage last_stage,
4253                     struct iris_gs_prog_key *key)
4254{
4255   const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
4256
4257   if (info->clip_distance_array_size == 0 &&
4258       (info->outputs_written & (VARYING_BIT_POS | VARYING_BIT_CLIP_VERTEX)) &&
4259       last_stage == MESA_SHADER_GEOMETRY)
4260      key->vue.nr_userclip_plane_consts = cso_rast->num_clip_plane_consts;
4261}
4262
4263/**
4264 * Populate FS program key fields based on the current state.
4265 */
4266static void
4267iris_populate_fs_key(const struct iris_context *ice,
4268                     const struct shader_info *info,
4269                     struct iris_fs_prog_key *key)
4270{
4271   struct iris_screen *screen = (void *) ice->ctx.screen;
4272   const struct pipe_framebuffer_state *fb = &ice->state.framebuffer;
4273   const struct iris_depth_stencil_alpha_state *zsa = ice->state.cso_zsa;
4274   const struct iris_rasterizer_state *rast = ice->state.cso_rast;
4275   const struct iris_blend_state *blend = ice->state.cso_blend;
4276
4277   key->nr_color_regions = fb->nr_cbufs;
4278
4279   key->clamp_fragment_color = rast->clamp_fragment_color;
4280
4281   key->alpha_to_coverage = blend->alpha_to_coverage;
4282
4283   key->alpha_test_replicate_alpha = fb->nr_cbufs > 1 && zsa->alpha_enabled;
4284
4285   key->flat_shade = rast->flatshade &&
4286      (info->inputs_read & (VARYING_BIT_COL0 | VARYING_BIT_COL1));
4287
4288   key->persample_interp = rast->force_persample_interp;
4289   key->multisample_fbo = rast->multisample && fb->samples > 1;
4290
4291   key->coherent_fb_fetch = GFX_VER >= 9;
4292
4293   key->force_dual_color_blend =
4294      screen->driconf.dual_color_blend_by_location &&
4295      (blend->blend_enables & 1) && blend->dual_color_blending;
4296
4297   /* TODO: Respect glHint for key->high_quality_derivatives */
4298}
4299
/**
 * Fill in the state-dependent fields of a compute shader program key.
 */
static void
iris_populate_cs_key(const struct iris_context *ice,
                     struct iris_cs_prog_key *key)
{
   /* Intentionally empty: no CS key field is set from context state here. */
}
4305
4306static uint64_t
4307KSP(const struct iris_compiled_shader *shader)
4308{
4309   struct iris_resource *res = (void *) shader->assembly.res;
4310   return iris_bo_offset_from_base_address(res->bo) + shader->assembly.offset;
4311}
4312
/**
 * Fill in the thread-dispatch fields of the packet struct \p pkt.
 *
 * The expansion is a sequence of statements and reads `shader`, `prog_data`
 * and `vue_prog_data` from the scope it is expanded in, so those names must
 * exist at the use site.  \p prefix is token-pasted in front of
 * URBEntryReadLength / URBEntryReadOffset to form the field names.
 * \p stage is not referenced by the expansion.
 *
 * When the shader uses scratch space, INIT_THREAD_SCRATCH_SIZE() supplies
 * the generation-specific scratch size programming (it also supplies its own
 * statement terminator, hence no ';' after it here).
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix, stage)                   \
   pkt.KernelStartPointer = KSP(shader);                                  \
   pkt.BindingTableEntryCount = shader->bt.size_bytes / 4;                \
   pkt.FloatingPointMode = prog_data->use_alt_mode;                       \
                                                                          \
   pkt.DispatchGRFStartRegisterForURBData =                               \
      prog_data->dispatch_grf_start_reg;                                  \
   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
   pkt.prefix##URBEntryReadOffset = 0;                                    \
                                                                          \
   pkt.StatisticsEnable = true;                                           \
   pkt.Enable           = true;                                           \
                                                                          \
   if (prog_data->total_scratch) {                                        \
      INIT_THREAD_SCRATCH_SIZE(pkt)                                       \
   }
4329
/* Generation-specific scratch space programming.
 *
 * INIT_THREAD_SCRATCH_SIZE(pkt) sets the per-thread scratch size field of
 * the packet struct `pkt`; it reads `prog_data` from the expansion scope.
 *
 * MERGE_SCRATCH_ADDR(name) packs a second, otherwise zeroed copy of packet
 * `name` carrying only the scratch address, and emits it merged with the
 * dwords in `pkt` via iris_emit_merge().  It reads `batch`, `pkt` and
 * `scratch_addr` from the expansion scope.
 */
#if GFX_VERx10 >= 125
/* Here no scratch size is programmed in the packet (the macro expands to
 * nothing) and the scratch location is given as ScratchSpaceBuffer, set to
 * scratch_addr >> 4.
 */
#define INIT_THREAD_SCRATCH_SIZE(pkt)
#define MERGE_SCRATCH_ADDR(name)                                          \
{                                                                         \
   uint32_t pkt2[GENX(name##_length)] = {0};                              \
   _iris_pack_command(batch, GENX(name), pkt2, p) {                       \
      p.ScratchSpaceBuffer = scratch_addr >> 4;                           \
   }                                                                      \
   iris_emit_merge(batch, pkt, pkt2, GENX(name##_length));                \
}
#else
/* Here the size is encoded as ffs(total_scratch) - 11 and the location is
 * given as ScratchSpaceBasePointer.
 * NOTE(review): the ffs() encoding presumably relies on total_scratch being
 * a power of two - confirm against the compiler.
 */
#define INIT_THREAD_SCRATCH_SIZE(pkt)                                     \
   pkt.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
#define MERGE_SCRATCH_ADDR(name)                                          \
{                                                                         \
   uint32_t pkt2[GENX(name##_length)] = {0};                              \
   _iris_pack_command(batch, GENX(name), pkt2, p) {                       \
      p.ScratchSpaceBasePointer =                                         \
         rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);                     \
   }                                                                      \
   iris_emit_merge(batch, pkt, pkt2, GENX(name##_length));                \
}
#endif
4353
4354
4355/**
4356 * Encode most of 3DSTATE_VS based on the compiled shader.
4357 */
4358static void
4359iris_store_vs_state(const struct intel_device_info *devinfo,
4360                    struct iris_compiled_shader *shader)
4361{
4362   struct brw_stage_prog_data *prog_data = shader->prog_data;
4363   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
4364
4365   iris_pack_command(GENX(3DSTATE_VS), shader->derived_data, vs) {
4366      INIT_THREAD_DISPATCH_FIELDS(vs, Vertex, MESA_SHADER_VERTEX);
4367      vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
4368      vs.SIMD8DispatchEnable = true;
4369      vs.UserClipDistanceCullTestEnableBitmask =
4370         vue_prog_data->cull_distance_mask;
4371   }
4372}
4373
4374/**
4375 * Encode most of 3DSTATE_HS based on the compiled shader.
4376 */
4377static void
4378iris_store_tcs_state(const struct intel_device_info *devinfo,
4379                     struct iris_compiled_shader *shader)
4380{
4381   struct brw_stage_prog_data *prog_data = shader->prog_data;
4382   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
4383   struct brw_tcs_prog_data *tcs_prog_data = (void *) prog_data;
4384
4385   iris_pack_command(GENX(3DSTATE_HS), shader->derived_data, hs) {
4386      INIT_THREAD_DISPATCH_FIELDS(hs, Vertex, MESA_SHADER_TESS_CTRL);
4387
4388#if GFX_VER >= 12
4389      /* Wa_1604578095:
4390       *
4391       *    Hang occurs when the number of max threads is less than 2 times
4392       *    the number of instance count. The number of max threads must be
4393       *    more than 2 times the number of instance count.
4394       */
4395      assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
4396      hs.DispatchGRFStartRegisterForURBData = prog_data->dispatch_grf_start_reg & 0x1f;
4397      hs.DispatchGRFStartRegisterForURBData5 = prog_data->dispatch_grf_start_reg >> 5;
4398#endif
4399
4400      hs.InstanceCount = tcs_prog_data->instances - 1;
4401      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
4402      hs.IncludeVertexHandles = true;
4403
4404#if GFX_VER == 12
4405      /* Patch Count threshold specifies the maximum number of patches that
4406       * will be accumulated before a thread dispatch is forced.
4407       */
4408      hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
4409#endif
4410
4411#if GFX_VER >= 9
4412      hs.DispatchMode = vue_prog_data->dispatch_mode;
4413      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
4414#endif
4415   }
4416}
4417
4418/**
4419 * Encode 3DSTATE_TE and most of 3DSTATE_DS based on the compiled shader.
4420 */
4421static void
4422iris_store_tes_state(const struct intel_device_info *devinfo,
4423                     struct iris_compiled_shader *shader)
4424{
4425   struct brw_stage_prog_data *prog_data = shader->prog_data;
4426   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
4427   struct brw_tes_prog_data *tes_prog_data = (void *) prog_data;
4428
4429   uint32_t *ds_state = (void *) shader->derived_data;
4430   uint32_t *te_state = ds_state + GENX(3DSTATE_DS_length);
4431
4432   iris_pack_command(GENX(3DSTATE_DS), ds_state, ds) {
4433      INIT_THREAD_DISPATCH_FIELDS(ds, Patch, MESA_SHADER_TESS_EVAL);
4434
4435      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
4436      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
4437      ds.ComputeWCoordinateEnable =
4438         tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
4439
4440      ds.UserClipDistanceCullTestEnableBitmask =
4441         vue_prog_data->cull_distance_mask;
4442   }
4443
4444   iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
4445      te.Partitioning = tes_prog_data->partitioning;
4446      te.OutputTopology = tes_prog_data->output_topology;
4447      te.TEDomain = tes_prog_data->domain;
4448      te.TEEnable = true;
4449      te.MaximumTessellationFactorOdd = 63.0;
4450      te.MaximumTessellationFactorNotOdd = 64.0;
4451   }
4452}
4453
4454/**
4455 * Encode most of 3DSTATE_GS based on the compiled shader.
4456 */
4457static void
4458iris_store_gs_state(const struct intel_device_info *devinfo,
4459                    struct iris_compiled_shader *shader)
4460{
4461   struct brw_stage_prog_data *prog_data = shader->prog_data;
4462   struct brw_vue_prog_data *vue_prog_data = (void *) prog_data;
4463   struct brw_gs_prog_data *gs_prog_data = (void *) prog_data;
4464
4465   iris_pack_command(GENX(3DSTATE_GS), shader->derived_data, gs) {
4466      INIT_THREAD_DISPATCH_FIELDS(gs, Vertex, MESA_SHADER_GEOMETRY);
4467
4468      gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
4469      gs.OutputTopology = gs_prog_data->output_topology;
4470      gs.ControlDataHeaderSize =
4471         gs_prog_data->control_data_header_size_hwords;
4472      gs.InstanceControl = gs_prog_data->invocations - 1;
4473      gs.DispatchMode = DISPATCH_MODE_SIMD8;
4474      gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;
4475      gs.ControlDataFormat = gs_prog_data->control_data_format;
4476      gs.ReorderMode = TRAILING;
4477      gs.ExpectedVertexCount = gs_prog_data->vertices_in;
4478      gs.MaximumNumberofThreads =
4479         GFX_VER == 8 ? (devinfo->max_gs_threads / 2 - 1)
4480                      : (devinfo->max_gs_threads - 1);
4481
4482      if (gs_prog_data->static_vertex_count != -1) {
4483         gs.StaticOutput = true;
4484         gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
4485      }
4486      gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;
4487
4488      gs.UserClipDistanceCullTestEnableBitmask =
4489         vue_prog_data->cull_distance_mask;
4490
4491      const int urb_entry_write_offset = 1;
4492      const uint32_t urb_entry_output_length =
4493         DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
4494         urb_entry_write_offset;
4495
4496      gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
4497      gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
4498   }
4499}
4500
4501/**
4502 * Encode most of 3DSTATE_PS and 3DSTATE_PS_EXTRA based on the shader.
4503 */
4504static void
4505iris_store_fs_state(const struct intel_device_info *devinfo,
4506                    struct iris_compiled_shader *shader)
4507{
4508   struct brw_stage_prog_data *prog_data = shader->prog_data;
4509   struct brw_wm_prog_data *wm_prog_data = (void *) shader->prog_data;
4510
4511   uint32_t *ps_state = (void *) shader->derived_data;
4512   uint32_t *psx_state = ps_state + GENX(3DSTATE_PS_length);
4513
4514   iris_pack_command(GENX(3DSTATE_PS), ps_state, ps) {
4515      ps.VectorMaskEnable = true;
4516      ps.BindingTableEntryCount = shader->bt.size_bytes / 4;
4517      ps.FloatingPointMode = prog_data->use_alt_mode;
4518      ps.MaximumNumberofThreadsPerPSD = 64 - (GFX_VER == 8 ? 2 : 1);
4519
4520      ps.PushConstantEnable = prog_data->ubo_ranges[0].length > 0;
4521
4522      /* From the documentation for this packet:
4523       * "If the PS kernel does not need the Position XY Offsets to
4524       *  compute a Position Value, then this field should be programmed
4525       *  to POSOFFSET_NONE."
4526       *
4527       * "SW Recommendation: If the PS kernel needs the Position Offsets
4528       *  to compute a Position XY value, this field should match Position
4529       *  ZW Interpolation Mode to ensure a consistent position.xyzw
4530       *  computation."
4531       *
4532       * We only require XY sample offsets. So, this recommendation doesn't
4533       * look useful at the moment.  We might need this in future.
4534       */
4535      ps.PositionXYOffsetSelect =
4536         wm_prog_data->uses_pos_offset ? POSOFFSET_SAMPLE : POSOFFSET_NONE;
4537
4538      if (prog_data->total_scratch) {
4539         INIT_THREAD_SCRATCH_SIZE(ps);
4540      }
4541   }
4542
4543   iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
4544      psx.PixelShaderValid = true;
4545      psx.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
4546      psx.PixelShaderKillsPixel = wm_prog_data->uses_kill;
4547      psx.AttributeEnable = wm_prog_data->num_varying_inputs != 0;
4548      psx.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
4549      psx.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
4550      psx.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
4551      psx.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
4552
4553#if GFX_VER >= 9
4554      psx.PixelShaderPullsBary = wm_prog_data->pulls_bary;
4555      psx.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
4556#endif
4557   }
4558}
4559
4560/**
4561 * Compute the size of the derived data (shader command packets).
4562 *
4563 * This must match the data written by the iris_store_xs_state() functions.
4564 */
4565static void
4566iris_store_cs_state(const struct intel_device_info *devinfo,
4567                    struct iris_compiled_shader *shader)
4568{
4569   struct brw_cs_prog_data *cs_prog_data = (void *) shader->prog_data;
4570   void *map = shader->derived_data;
4571
4572   iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), map, desc) {
4573#if GFX_VERx10 < 125
4574      desc.ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs;
4575      desc.CrossThreadConstantDataReadLength =
4576         cs_prog_data->push.cross_thread.regs;
4577#else
4578      assert(cs_prog_data->push.per_thread.regs == 0);
4579      assert(cs_prog_data->push.cross_thread.regs == 0);
4580#endif
4581      desc.BarrierEnable = cs_prog_data->uses_barrier;
4582#if GFX_VER >= 12
4583      /* TODO: Check if we are missing workarounds and enable mid-thread
4584       * preemption.
4585       *
4586       * We still have issues with mid-thread preemption (it was already
4587       * disabled by the kernel on gfx11, due to missing workarounds). It's
4588       * possible that we are just missing some workarounds, and could enable
4589       * it later, but for now let's disable it to fix a GPU in compute in Car
4590       * Chase (and possibly more).
4591       */
4592      desc.ThreadPreemptionDisable = true;
4593#endif
4594   }
4595}
4596
4597static unsigned
4598iris_derived_program_state_size(enum iris_program_cache_id cache_id)
4599{
4600   assert(cache_id <= IRIS_CACHE_BLORP);
4601
4602   static const unsigned dwords[] = {
4603      [IRIS_CACHE_VS] = GENX(3DSTATE_VS_length),
4604      [IRIS_CACHE_TCS] = GENX(3DSTATE_HS_length),
4605      [IRIS_CACHE_TES] = GENX(3DSTATE_TE_length) + GENX(3DSTATE_DS_length),
4606      [IRIS_CACHE_GS] = GENX(3DSTATE_GS_length),
4607      [IRIS_CACHE_FS] =
4608         GENX(3DSTATE_PS_length) + GENX(3DSTATE_PS_EXTRA_length),
4609      [IRIS_CACHE_CS] = GENX(INTERFACE_DESCRIPTOR_DATA_length),
4610      [IRIS_CACHE_BLORP] = 0,
4611   };
4612
4613   return sizeof(uint32_t) * dwords[cache_id];
4614}
4615
4616/**
4617 * Create any state packets corresponding to the given shader stage
4618 * (i.e. 3DSTATE_VS) and save them as "derived data" in the shader variant.
4619 * This means that we can look up a program in the in-memory cache and
4620 * get most of the state packet without having to reconstruct it.
4621 */
4622static void
4623iris_store_derived_program_state(const struct intel_device_info *devinfo,
4624                                 enum iris_program_cache_id cache_id,
4625                                 struct iris_compiled_shader *shader)
4626{
4627   switch (cache_id) {
4628   case IRIS_CACHE_VS:
4629      iris_store_vs_state(devinfo, shader);
4630      break;
4631   case IRIS_CACHE_TCS:
4632      iris_store_tcs_state(devinfo, shader);
4633      break;
4634   case IRIS_CACHE_TES:
4635      iris_store_tes_state(devinfo, shader);
4636      break;
4637   case IRIS_CACHE_GS:
4638      iris_store_gs_state(devinfo, shader);
4639      break;
4640   case IRIS_CACHE_FS:
4641      iris_store_fs_state(devinfo, shader);
4642      break;
4643   case IRIS_CACHE_CS:
4644      iris_store_cs_state(devinfo, shader);
4645      break;
4646   case IRIS_CACHE_BLORP:
4647      break;
4648   }
4649}
4650
4651/* ------------------------------------------------------------------- */
4652
4653static const uint32_t push_constant_opcodes[] = {
4654   [MESA_SHADER_VERTEX]    = 21,
4655   [MESA_SHADER_TESS_CTRL] = 25, /* HS */
4656   [MESA_SHADER_TESS_EVAL] = 26, /* DS */
4657   [MESA_SHADER_GEOMETRY]  = 22,
4658   [MESA_SHADER_FRAGMENT]  = 23,
4659   [MESA_SHADER_COMPUTE]   = 0,
4660};
4661
4662static uint32_t
4663use_null_surface(struct iris_batch *batch, struct iris_context *ice)
4664{
4665   struct iris_bo *state_bo = iris_resource_bo(ice->state.unbound_tex.res);
4666
4667   iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
4668
4669   return ice->state.unbound_tex.offset;
4670}
4671
4672static uint32_t
4673use_null_fb_surface(struct iris_batch *batch, struct iris_context *ice)
4674{
4675   /* If set_framebuffer_state() was never called, fall back to 1x1x1 */
4676   if (!ice->state.null_fb.res)
4677      return use_null_surface(batch, ice);
4678
4679   struct iris_bo *state_bo = iris_resource_bo(ice->state.null_fb.res);
4680
4681   iris_use_pinned_bo(batch, state_bo, false, IRIS_DOMAIN_NONE);
4682
4683   return ice->state.null_fb.offset;
4684}
4685
4686static uint32_t
4687surf_state_offset_for_aux(struct iris_resource *res,
4688                          unsigned aux_modes,
4689                          enum isl_aux_usage aux_usage)
4690{
4691   assert(aux_modes & (1 << aux_usage));
4692   return SURFACE_STATE_ALIGNMENT *
4693          util_bitcount(aux_modes & ((1 << aux_usage) - 1));
4694}
4695
#if GFX_VER == 9
/**
 * Patch the clear value stored in an already-uploaded surface state with the
 * resource's current clear color, using PIPE_CONTROL immediate writes, and
 * then invalidate the state cache.
 *
 * @param batch      batch the PIPE_CONTROLs are emitted into
 * @param res        resource whose aux.clear_color is written out
 * @param state      reference to the uploaded group of surface states
 * @param aux_modes  mask of aux usages that have a surface state in the group
 * @param aux_usage  which of those surface states to patch
 */
static void
surf_state_update_clear_value(struct iris_batch *batch,
                              struct iris_resource *res,
                              struct iris_state_ref *state,
                              unsigned aux_modes,
                              enum isl_aux_usage aux_usage)
{
   struct isl_device *isl_dev = &batch->screen->isl_dev;
   struct iris_bo *state_bo = iris_resource_bo(state->res);
   uint32_t *color = res->aux.clear_color.u32;

   assert(isl_dev->ss.clear_value_size == 16);

   /* state->offset is relative to the binder memory zone; turn it into an
    * offset within the buffer object holding the surface states.
    */
   uint64_t real_offset = state->offset + IRIS_MEMZONE_BINDER_START;
   uint32_t offset_into_bo = real_offset - state_bo->address;

   /* Locate the clear value inside the surface state for this aux usage. */
   uint32_t clear_offset = offset_into_bo +
      isl_dev->ss.clear_value_offset +
      surf_state_offset_for_aux(res, aux_modes, aux_usage);

   if (aux_usage != ISL_AUX_USAGE_HIZ) {
      /* Color: all four channels, written as two 64-bit immediates. */
      iris_emit_pipe_control_write(batch, "update fast clear color (RG__)",
                                   PIPE_CONTROL_WRITE_IMMEDIATE,
                                   state_bo, clear_offset,
                                   (uint64_t) color[0] |
                                   (uint64_t) color[1] << 32);
      iris_emit_pipe_control_write(batch, "update fast clear color (__BA)",
                                   PIPE_CONTROL_WRITE_IMMEDIATE,
                                   state_bo, clear_offset + 8,
                                   (uint64_t) color[2] |
                                   (uint64_t) color[3] << 32);
   } else {
      /* HiZ: only the first channel is written. */
      iris_emit_pipe_control_write(batch, "update fast clear value (Z)",
                                   PIPE_CONTROL_WRITE_IMMEDIATE,
                                   state_bo, clear_offset, color[0]);
   }

   iris_emit_pipe_control_flush(batch,
                                "update fast clear: state cache invalidate",
                                PIPE_CONTROL_FLUSH_ENABLE |
                                PIPE_CONTROL_STATE_CACHE_INVALIDATE);
}
#endif
4738
4739static void
4740update_clear_value(struct iris_context *ice,
4741                   struct iris_batch *batch,
4742                   struct iris_resource *res,
4743                   struct iris_surface_state *surf_state,
4744                   unsigned all_aux_modes,
4745                   struct isl_view *view)
4746{
4747   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
4748   UNUSED unsigned aux_modes = all_aux_modes;
4749
4750   /* We only need to update the clear color in the surface state for gfx8 and
4751    * gfx9. Newer gens can read it directly from the clear color state buffer.
4752    */
4753#if GFX_VER == 9
4754   /* Skip updating the ISL_AUX_USAGE_NONE surface state */
4755   aux_modes &= ~(1 << ISL_AUX_USAGE_NONE);
4756
4757   while (aux_modes) {
4758      enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
4759
4760      surf_state_update_clear_value(batch, res, &surf_state->ref,
4761                                    all_aux_modes, aux_usage);
4762   }
4763#elif GFX_VER == 8
4764   /* TODO: Could update rather than re-filling */
4765   alloc_surface_states(surf_state, all_aux_modes);
4766
4767   void *map = surf_state->cpu;
4768
4769   while (aux_modes) {
4770      enum isl_aux_usage aux_usage = u_bit_scan(&aux_modes);
4771      fill_surface_state(isl_dev, map, res, &res->surf, view, aux_usage,
4772                         0, 0, 0);
4773      map += SURFACE_STATE_ALIGNMENT;
4774   }
4775
4776   upload_surface_states(ice->state.surface_uploader, surf_state);
4777#endif
4778}
4779
4780/**
4781 * Add a surface to the validation list, as well as the buffer containing
4782 * the corresponding SURFACE_STATE.
4783 *
4784 * Returns the binding table entry (offset to SURFACE_STATE).
4785 */
4786static uint32_t
4787use_surface(struct iris_context *ice,
4788            struct iris_batch *batch,
4789            struct pipe_surface *p_surf,
4790            bool writeable,
4791            enum isl_aux_usage aux_usage,
4792            bool is_read_surface,
4793            enum iris_domain access)
4794{
4795   struct iris_surface *surf = (void *) p_surf;
4796   struct iris_resource *res = (void *) p_surf->texture;
4797   uint32_t offset = 0;
4798
4799   if (GFX_VER == 8 && is_read_surface && !surf->surface_state_read.ref.res) {
4800      upload_surface_states(ice->state.surface_uploader,
4801                            &surf->surface_state_read);
4802   }
4803
4804   if (!surf->surface_state.ref.res) {
4805      upload_surface_states(ice->state.surface_uploader,
4806                            &surf->surface_state);
4807   }
4808
4809   if (memcmp(&res->aux.clear_color, &surf->clear_color,
4810              sizeof(surf->clear_color)) != 0) {
4811      update_clear_value(ice, batch, res, &surf->surface_state,
4812                         res->aux.possible_usages, &surf->view);
4813      if (GFX_VER == 8) {
4814         update_clear_value(ice, batch, res, &surf->surface_state_read,
4815                            res->aux.possible_usages, &surf->read_view);
4816      }
4817      surf->clear_color = res->aux.clear_color;
4818   }
4819
4820   if (res->aux.clear_color_bo)
4821      iris_use_pinned_bo(batch, res->aux.clear_color_bo, false, access);
4822
4823   if (res->aux.bo)
4824      iris_use_pinned_bo(batch, res->aux.bo, writeable, access);
4825
4826   iris_use_pinned_bo(batch, res->bo, writeable, access);
4827
4828   if (GFX_VER == 8 && is_read_surface) {
4829      iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state_read.ref.res), false,
4830                         IRIS_DOMAIN_NONE);
4831   } else {
4832      iris_use_pinned_bo(batch, iris_resource_bo(surf->surface_state.ref.res), false,
4833                         IRIS_DOMAIN_NONE);
4834   }
4835
4836   offset = (GFX_VER == 8 && is_read_surface)
4837               ? surf->surface_state_read.ref.offset
4838               : surf->surface_state.ref.offset;
4839
4840   return offset +
4841          surf_state_offset_for_aux(res, res->aux.possible_usages, aux_usage);
4842}
4843
4844static uint32_t
4845use_sampler_view(struct iris_context *ice,
4846                 struct iris_batch *batch,
4847                 struct iris_sampler_view *isv)
4848{
4849   enum isl_aux_usage aux_usage =
4850      iris_resource_texture_aux_usage(ice, isv->res, isv->view.format);
4851
4852   if (!isv->surface_state.ref.res)
4853      upload_surface_states(ice->state.surface_uploader, &isv->surface_state);
4854
4855   if (memcmp(&isv->res->aux.clear_color, &isv->clear_color,
4856              sizeof(isv->clear_color)) != 0) {
4857      update_clear_value(ice, batch, isv->res, &isv->surface_state,
4858                         isv->res->aux.sampler_usages, &isv->view);
4859      isv->clear_color = isv->res->aux.clear_color;
4860   }
4861
4862   if (isv->res->aux.clear_color_bo) {
4863      iris_use_pinned_bo(batch, isv->res->aux.clear_color_bo,
4864                         false, IRIS_DOMAIN_OTHER_READ);
4865   }
4866
4867   if (isv->res->aux.bo) {
4868      iris_use_pinned_bo(batch, isv->res->aux.bo,
4869                         false, IRIS_DOMAIN_OTHER_READ);
4870   }
4871
4872   iris_use_pinned_bo(batch, isv->res->bo, false, IRIS_DOMAIN_OTHER_READ);
4873   iris_use_pinned_bo(batch, iris_resource_bo(isv->surface_state.ref.res), false,
4874                      IRIS_DOMAIN_NONE);
4875
4876   return isv->surface_state.ref.offset +
4877          surf_state_offset_for_aux(isv->res, isv->res->aux.sampler_usages,
4878                                    aux_usage);
4879}
4880
4881static uint32_t
4882use_ubo_ssbo(struct iris_batch *batch,
4883             struct iris_context *ice,
4884             struct pipe_shader_buffer *buf,
4885             struct iris_state_ref *surf_state,
4886             bool writable, enum iris_domain access)
4887{
4888   if (!buf->buffer || !surf_state->res)
4889      return use_null_surface(batch, ice);
4890
4891   iris_use_pinned_bo(batch, iris_resource_bo(buf->buffer), writable, access);
4892   iris_use_pinned_bo(batch, iris_resource_bo(surf_state->res), false,
4893                      IRIS_DOMAIN_NONE);
4894
4895   return surf_state->offset;
4896}
4897
4898static uint32_t
4899use_image(struct iris_batch *batch, struct iris_context *ice,
4900          struct iris_shader_state *shs, const struct shader_info *info,
4901          int i)
4902{
4903   struct iris_image_view *iv = &shs->image[i];
4904   struct iris_resource *res = (void *) iv->base.resource;
4905
4906   if (!res)
4907      return use_null_surface(batch, ice);
4908
4909   bool write = iv->base.shader_access & PIPE_IMAGE_ACCESS_WRITE;
4910
4911   iris_use_pinned_bo(batch, res->bo, write, IRIS_DOMAIN_NONE);
4912   iris_use_pinned_bo(batch, iris_resource_bo(iv->surface_state.ref.res),
4913                      false, IRIS_DOMAIN_NONE);
4914
4915   if (res->aux.bo)
4916      iris_use_pinned_bo(batch, res->aux.bo, write, IRIS_DOMAIN_NONE);
4917
4918   enum isl_aux_usage aux_usage =
4919      iris_image_view_aux_usage(ice, &iv->base, info);
4920
4921   return iv->surface_state.ref.offset +
4922      surf_state_offset_for_aux(res, res->aux.possible_usages, aux_usage);
4923}
4924
/**
 * Append one entry to the binding table being built.
 *
 * Relies on these names being in scope at the expansion site: binder_addr,
 * s, shader, pin_only and bt_map.  The entry stored is \p addr relative to
 * binder_addr; nothing is written (and s is not advanced) when pin_only is
 * set.  \p addr is evaluated more than once, so it must be free of side
 * effects.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and stays correct under an unbraced if/else.
 */
#define push_bt_entry(addr) \
   do { \
      assert((addr) >= binder_addr); \
      assert(s < shader->bt.size_bytes / sizeof(uint32_t)); \
      if (!pin_only) \
         bt_map[s++] = (addr) - binder_addr; \
   } while (0)

/**
 * Check that a used binding table section starts at the current index s.
 *
 * Skipped when pin_only is set (s is never advanced in that mode) and for
 * sections with an empty used_mask.  Expands to a single statement.
 */
#define bt_assert(section) \
   do { \
      if (!pin_only && shader->bt.used_mask[section] != 0) \
         assert(shader->bt.offsets[section] == s); \
   } while (0)
4933
4934/**
4935 * Populate the binding table for a given shader stage.
4936 *
4937 * This fills out the table of pointers to surfaces required by the shader,
4938 * and also adds those buffers to the validation list so the kernel can make
4939 * resident before running our batch.
4940 */
4941static void
4942iris_populate_binding_table(struct iris_context *ice,
4943                            struct iris_batch *batch,
4944                            gl_shader_stage stage,
4945                            bool pin_only)
4946{
4947   const struct iris_binder *binder = &ice->state.binder;
4948   struct iris_compiled_shader *shader = ice->shaders.prog[stage];
4949   if (!shader)
4950      return;
4951
4952   struct iris_binding_table *bt = &shader->bt;
4953   UNUSED struct brw_stage_prog_data *prog_data = shader->prog_data;
4954   struct iris_shader_state *shs = &ice->state.shaders[stage];
4955   uint32_t binder_addr = binder->bo->address;
4956
4957   uint32_t *bt_map = binder->map + binder->bt_offset[stage];
4958   int s = 0;
4959
4960   const struct shader_info *info = iris_get_shader_info(ice, stage);
4961   if (!info) {
4962      /* TCS passthrough doesn't need a binding table. */
4963      assert(stage == MESA_SHADER_TESS_CTRL);
4964      return;
4965   }
4966
4967   if (stage == MESA_SHADER_COMPUTE &&
4968       shader->bt.used_mask[IRIS_SURFACE_GROUP_CS_WORK_GROUPS]) {
4969      /* surface for gl_NumWorkGroups */
4970      struct iris_state_ref *grid_data = &ice->state.grid_size;
4971      struct iris_state_ref *grid_state = &ice->state.grid_surf_state;
4972      iris_use_pinned_bo(batch, iris_resource_bo(grid_data->res), false,
4973                         IRIS_DOMAIN_OTHER_READ);
4974      iris_use_pinned_bo(batch, iris_resource_bo(grid_state->res), false,
4975                         IRIS_DOMAIN_NONE);
4976      push_bt_entry(grid_state->offset);
4977   }
4978
4979   if (stage == MESA_SHADER_FRAGMENT) {
4980      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
4981      /* Note that cso_fb->nr_cbufs == fs_key->nr_color_regions. */
4982      if (cso_fb->nr_cbufs) {
4983         for (unsigned i = 0; i < cso_fb->nr_cbufs; i++) {
4984            uint32_t addr;
4985            if (cso_fb->cbufs[i]) {
4986               addr = use_surface(ice, batch, cso_fb->cbufs[i], true,
4987                                  ice->state.draw_aux_usage[i], false,
4988                                  IRIS_DOMAIN_RENDER_WRITE);
4989            } else {
4990               addr = use_null_fb_surface(batch, ice);
4991            }
4992            push_bt_entry(addr);
4993         }
4994      } else if (GFX_VER < 11) {
4995         uint32_t addr = use_null_fb_surface(batch, ice);
4996         push_bt_entry(addr);
4997      }
4998   }
4999
5000#define foreach_surface_used(index, group) \
5001   bt_assert(group); \
5002   for (int index = 0; index < bt->sizes[group]; index++) \
5003      if (iris_group_index_to_bti(bt, group, index) != \
5004          IRIS_SURFACE_NOT_USED)
5005
5006   foreach_surface_used(i, IRIS_SURFACE_GROUP_RENDER_TARGET_READ) {
5007      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5008      uint32_t addr;
5009      if (cso_fb->cbufs[i]) {
5010         addr = use_surface(ice, batch, cso_fb->cbufs[i],
5011                            false, ice->state.draw_aux_usage[i], true,
5012                            IRIS_DOMAIN_OTHER_READ);
5013         push_bt_entry(addr);
5014      }
5015   }
5016
5017   foreach_surface_used(i, IRIS_SURFACE_GROUP_TEXTURE) {
5018      struct iris_sampler_view *view = shs->textures[i];
5019      uint32_t addr = view ? use_sampler_view(ice, batch, view)
5020                           : use_null_surface(batch, ice);
5021      push_bt_entry(addr);
5022   }
5023
5024   foreach_surface_used(i, IRIS_SURFACE_GROUP_IMAGE) {
5025      uint32_t addr = use_image(batch, ice, shs, info, i);
5026      push_bt_entry(addr);
5027   }
5028
5029   foreach_surface_used(i, IRIS_SURFACE_GROUP_UBO) {
5030      uint32_t addr = use_ubo_ssbo(batch, ice, &shs->constbuf[i],
5031                                   &shs->constbuf_surf_state[i], false,
5032                                   IRIS_DOMAIN_OTHER_READ);
5033      push_bt_entry(addr);
5034   }
5035
5036   foreach_surface_used(i, IRIS_SURFACE_GROUP_SSBO) {
5037      uint32_t addr =
5038         use_ubo_ssbo(batch, ice, &shs->ssbo[i], &shs->ssbo_surf_state[i],
5039                      shs->writable_ssbos & (1u << i), IRIS_DOMAIN_NONE);
5040      push_bt_entry(addr);
5041   }
5042
5043#if 0
5044      /* XXX: YUV surfaces not implemented yet */
5045      bt_assert(plane_start[1], ...);
5046      bt_assert(plane_start[2], ...);
5047#endif
5048}
5049
5050static void
5051iris_use_optional_res(struct iris_batch *batch,
5052                      struct pipe_resource *res,
5053                      bool writeable,
5054                      enum iris_domain access)
5055{
5056   if (res) {
5057      struct iris_bo *bo = iris_resource_bo(res);
5058      iris_use_pinned_bo(batch, bo, writeable, access);
5059   }
5060}
5061
5062static void
5063pin_depth_and_stencil_buffers(struct iris_batch *batch,
5064                              struct pipe_surface *zsbuf,
5065                              struct iris_depth_stencil_alpha_state *cso_zsa)
5066{
5067   if (!zsbuf)
5068      return;
5069
5070   struct iris_resource *zres, *sres;
5071   iris_get_depth_stencil_resources(zsbuf->texture, &zres, &sres);
5072
5073   if (zres) {
5074      const enum iris_domain access = cso_zsa->depth_writes_enabled ?
5075         IRIS_DOMAIN_DEPTH_WRITE : IRIS_DOMAIN_OTHER_READ;
5076      iris_use_pinned_bo(batch, zres->bo, cso_zsa->depth_writes_enabled,
5077                         access);
5078      if (zres->aux.bo) {
5079         iris_use_pinned_bo(batch, zres->aux.bo,
5080                            cso_zsa->depth_writes_enabled, access);
5081      }
5082   }
5083
5084   if (sres) {
5085      const enum iris_domain access = cso_zsa->stencil_writes_enabled ?
5086         IRIS_DOMAIN_DEPTH_WRITE : IRIS_DOMAIN_OTHER_READ;
5087      iris_use_pinned_bo(batch, sres->bo, cso_zsa->stencil_writes_enabled,
5088                         access);
5089   }
5090}
5091
5092static uint32_t
5093pin_scratch_space(struct iris_context *ice,
5094                  struct iris_batch *batch,
5095                  const struct brw_stage_prog_data *prog_data,
5096                  gl_shader_stage stage)
5097{
5098   uint32_t scratch_addr = 0;
5099
5100   if (prog_data->total_scratch > 0) {
5101      struct iris_bo *scratch_bo =
5102         iris_get_scratch_space(ice, prog_data->total_scratch, stage);
5103      iris_use_pinned_bo(batch, scratch_bo, true, IRIS_DOMAIN_NONE);
5104
5105#if GFX_VERx10 >= 125
5106      const struct iris_state_ref *ref =
5107         iris_get_scratch_surf(ice, prog_data->total_scratch);
5108      iris_use_pinned_bo(batch, iris_resource_bo(ref->res),
5109                         false, IRIS_DOMAIN_NONE);
5110      scratch_addr = ref->offset +
5111                     iris_resource_bo(ref->res)->address -
5112                     IRIS_MEMZONE_BINDLESS_START;
5113      assert((scratch_addr & 0x3f) == 0 && scratch_addr < (1 << 26));
5114#else
5115      scratch_addr = scratch_bo->address;
5116#endif
5117   }
5118
5119   return scratch_addr;
5120}
5121
5122/* ------------------------------------------------------------------- */
5123
5124/**
5125 * Pin any BOs which were installed by a previous batch, and restored
5126 * via the hardware logical context mechanism.
5127 *
5128 * We don't need to re-emit all state every batch - the hardware context
5129 * mechanism will save and restore it for us.  This includes pointers to
5130 * various BOs...which won't exist unless we ask the kernel to pin them
5131 * by adding them to the validation list.
5132 *
5133 * We can skip buffers if we've re-emitted those packets, as we're
5134 * overwriting those stale pointers with new ones, and don't actually
5135 * refer to the old BOs.
5136 */
5137static void
5138iris_restore_render_saved_bos(struct iris_context *ice,
5139                              struct iris_batch *batch,
5140                              const struct pipe_draw_info *draw)
5141{
5142   struct iris_genx_state *genx = ice->state.genx;
5143
5144   const uint64_t clean = ~ice->state.dirty;
5145   const uint64_t stage_clean = ~ice->state.stage_dirty;
5146
5147   if (clean & IRIS_DIRTY_CC_VIEWPORT) {
5148      iris_use_optional_res(batch, ice->state.last_res.cc_vp, false,
5149                            IRIS_DOMAIN_NONE);
5150   }
5151
5152   if (clean & IRIS_DIRTY_SF_CL_VIEWPORT) {
5153      iris_use_optional_res(batch, ice->state.last_res.sf_cl_vp, false,
5154                            IRIS_DOMAIN_NONE);
5155   }
5156
5157   if (clean & IRIS_DIRTY_BLEND_STATE) {
5158      iris_use_optional_res(batch, ice->state.last_res.blend, false,
5159                            IRIS_DOMAIN_NONE);
5160   }
5161
5162   if (clean & IRIS_DIRTY_COLOR_CALC_STATE) {
5163      iris_use_optional_res(batch, ice->state.last_res.color_calc, false,
5164                            IRIS_DOMAIN_NONE);
5165   }
5166
5167   if (clean & IRIS_DIRTY_SCISSOR_RECT) {
5168      iris_use_optional_res(batch, ice->state.last_res.scissor, false,
5169                            IRIS_DOMAIN_NONE);
5170   }
5171
5172   if (ice->state.streamout_active && (clean & IRIS_DIRTY_SO_BUFFERS)) {
5173      for (int i = 0; i < 4; i++) {
5174         struct iris_stream_output_target *tgt =
5175            (void *) ice->state.so_target[i];
5176         if (tgt) {
5177            iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
5178                               true, IRIS_DOMAIN_OTHER_WRITE);
5179            iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
5180                               true, IRIS_DOMAIN_OTHER_WRITE);
5181         }
5182      }
5183   }
5184
5185   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5186      if (!(stage_clean & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)))
5187         continue;
5188
5189      struct iris_shader_state *shs = &ice->state.shaders[stage];
5190      struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5191
5192      if (!shader)
5193         continue;
5194
5195      struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
5196
5197      for (int i = 0; i < 4; i++) {
5198         const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
5199
5200         if (range->length == 0)
5201            continue;
5202
5203         /* Range block is a binding table index, map back to UBO index. */
5204         unsigned block_index = iris_bti_to_group_index(
5205            &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
5206         assert(block_index != IRIS_SURFACE_NOT_USED);
5207
5208         struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
5209         struct iris_resource *res = (void *) cbuf->buffer;
5210
5211         if (res)
5212            iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_OTHER_READ);
5213         else
5214            iris_use_pinned_bo(batch, batch->screen->workaround_bo, false,
5215                               IRIS_DOMAIN_OTHER_READ);
5216      }
5217   }
5218
5219   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5220      if (stage_clean & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
5221         /* Re-pin any buffers referred to by the binding table. */
5222         iris_populate_binding_table(ice, batch, stage, true);
5223      }
5224   }
5225
5226   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5227      struct iris_shader_state *shs = &ice->state.shaders[stage];
5228      struct pipe_resource *res = shs->sampler_table.res;
5229      if (res)
5230         iris_use_pinned_bo(batch, iris_resource_bo(res), false,
5231                            IRIS_DOMAIN_NONE);
5232   }
5233
5234   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5235      if (stage_clean & (IRIS_STAGE_DIRTY_VS << stage)) {
5236         struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5237
5238         if (shader) {
5239            struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
5240            iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
5241
5242            pin_scratch_space(ice, batch, shader->prog_data, stage);
5243         }
5244      }
5245   }
5246
5247   if ((clean & IRIS_DIRTY_DEPTH_BUFFER) &&
5248       (clean & IRIS_DIRTY_WM_DEPTH_STENCIL)) {
5249      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5250      pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
5251   }
5252
5253   iris_use_optional_res(batch, ice->state.last_res.index_buffer, false,
5254                         IRIS_DOMAIN_VF_READ);
5255
5256   if (clean & IRIS_DIRTY_VERTEX_BUFFERS) {
5257      uint64_t bound = ice->state.bound_vertex_buffers;
5258      while (bound) {
5259         const int i = u_bit_scan64(&bound);
5260         struct pipe_resource *res = genx->vertex_buffers[i].resource;
5261         iris_use_pinned_bo(batch, iris_resource_bo(res), false,
5262                            IRIS_DOMAIN_VF_READ);
5263      }
5264   }
5265}
5266
5267static void
5268iris_restore_compute_saved_bos(struct iris_context *ice,
5269                               struct iris_batch *batch,
5270                               const struct pipe_grid_info *grid)
5271{
5272   const uint64_t stage_clean = ~ice->state.stage_dirty;
5273
5274   const int stage = MESA_SHADER_COMPUTE;
5275   struct iris_shader_state *shs = &ice->state.shaders[stage];
5276
5277   if (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) {
5278      /* Re-pin any buffers referred to by the binding table. */
5279      iris_populate_binding_table(ice, batch, stage, true);
5280   }
5281
5282   struct pipe_resource *sampler_res = shs->sampler_table.res;
5283   if (sampler_res)
5284      iris_use_pinned_bo(batch, iris_resource_bo(sampler_res), false,
5285                         IRIS_DOMAIN_NONE);
5286
5287   if ((stage_clean & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS) &&
5288       (stage_clean & IRIS_STAGE_DIRTY_BINDINGS_CS) &&
5289       (stage_clean & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
5290       (stage_clean & IRIS_STAGE_DIRTY_CS)) {
5291      iris_use_optional_res(batch, ice->state.last_res.cs_desc, false,
5292                            IRIS_DOMAIN_NONE);
5293   }
5294
5295   if (stage_clean & IRIS_STAGE_DIRTY_CS) {
5296      struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5297
5298      if (shader) {
5299         struct iris_bo *bo = iris_resource_bo(shader->assembly.res);
5300         iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_NONE);
5301
5302         if (GFX_VERx10 < 125) {
5303            struct iris_bo *curbe_bo =
5304               iris_resource_bo(ice->state.last_res.cs_thread_ids);
5305            iris_use_pinned_bo(batch, curbe_bo, false, IRIS_DOMAIN_NONE);
5306         }
5307
5308         pin_scratch_space(ice, batch, shader->prog_data, stage);
5309      }
5310   }
5311}
5312
5313/**
5314 * Possibly emit STATE_BASE_ADDRESS to update Surface State Base Address.
5315 */
5316static void
5317iris_update_surface_base_address(struct iris_batch *batch,
5318                                 struct iris_binder *binder)
5319{
5320   if (batch->last_surface_base_address == binder->bo->address)
5321      return;
5322
5323   struct isl_device *isl_dev = &batch->screen->isl_dev;
5324   uint32_t mocs = isl_mocs(isl_dev, 0, false);
5325
5326   iris_batch_sync_region_start(batch);
5327
5328   flush_before_state_base_change(batch);
5329
5330#if GFX_VER == 12
5331   /* Wa_1607854226:
5332    *
5333    *  Workaround the non pipelined state not applying in MEDIA/GPGPU pipeline
5334    *  mode by putting the pipeline temporarily in 3D mode..
5335    */
5336   if (batch->name == IRIS_BATCH_COMPUTE)
5337      emit_pipeline_select(batch, _3D);
5338#endif
5339
5340   iris_emit_cmd(batch, GENX(STATE_BASE_ADDRESS), sba) {
5341      sba.SurfaceStateBaseAddressModifyEnable = true;
5342      sba.SurfaceStateBaseAddress = ro_bo(binder->bo, 0);
5343
5344      /* The hardware appears to pay attention to the MOCS fields even
5345       * if you don't set the "Address Modify Enable" bit for the base.
5346       */
5347      sba.GeneralStateMOCS            = mocs;
5348      sba.StatelessDataPortAccessMOCS = mocs;
5349      sba.DynamicStateMOCS            = mocs;
5350      sba.IndirectObjectMOCS          = mocs;
5351      sba.InstructionMOCS             = mocs;
5352      sba.SurfaceStateMOCS            = mocs;
5353#if GFX_VER >= 9
5354      sba.BindlessSurfaceStateMOCS    = mocs;
5355#endif
5356   }
5357
5358#if GFX_VER == 12
5359   /* Wa_1607854226:
5360    *
5361    *  Put the pipeline back into compute mode.
5362    */
5363   if (batch->name == IRIS_BATCH_COMPUTE)
5364      emit_pipeline_select(batch, GPGPU);
5365#endif
5366
5367   flush_after_state_base_change(batch);
5368   iris_batch_sync_region_end(batch);
5369
5370   batch->last_surface_base_address = binder->bo->address;
5371}
5372
/**
 * Compute the depth range of a viewport.
 *
 * With window-space positions the range is always [0, 1]; otherwise it is
 * whatever util_viewport_zmin_zmax() derives from the viewport state.
 */
static inline void
iris_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
                        bool window_space_position, float *zmin, float *zmax)
{
   if (!window_space_position) {
      util_viewport_zmin_zmax(vp, halfz, zmin, zmax);
      return;
   }

   *zmin = 0.f;
   *zmax = 1.f;
}
5384
5385#if GFX_VER >= 12
5386void
5387genX(invalidate_aux_map_state)(struct iris_batch *batch)
5388{
5389   struct iris_screen *screen = batch->screen;
5390   void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
5391   if (!aux_map_ctx)
5392      return;
5393   uint32_t aux_map_state_num = intel_aux_map_get_state_num(aux_map_ctx);
5394   if (batch->last_aux_map_state != aux_map_state_num) {
5395      /* HSD 1209978178: docs say that before programming the aux table:
5396       *
5397       *    "Driver must ensure that the engine is IDLE but ensure it doesn't
5398       *    add extra flushes in the case it knows that the engine is already
5399       *    IDLE."
5400       *
5401       * An end of pipe sync is needed here, otherwise we see GPU hangs in
5402       * dEQP-GLES31.functional.copy_image.* tests.
5403       */
5404      iris_emit_end_of_pipe_sync(batch, "Invalidate aux map table",
5405                                 PIPE_CONTROL_CS_STALL);
5406
5407      /* If the aux-map state number increased, then we need to rewrite the
5408       * register. Rewriting the register is used to both set the aux-map
5409       * translation table address, and also to invalidate any previously
5410       * cached translations.
5411       */
5412      iris_load_register_imm32(batch, GENX(GFX_CCS_AUX_INV_num), 1);
5413      batch->last_aux_map_state = aux_map_state_num;
5414   }
5415}
5416
5417static void
5418init_aux_map_state(struct iris_batch *batch)
5419{
5420   struct iris_screen *screen = batch->screen;
5421   void *aux_map_ctx = iris_bufmgr_get_aux_map_context(screen->bufmgr);
5422   if (!aux_map_ctx)
5423      return;
5424
5425   uint64_t base_addr = intel_aux_map_get_base(aux_map_ctx);
5426   assert(base_addr != 0 && align64(base_addr, 32 * 1024) == base_addr);
5427   iris_load_register_imm64(batch, GENX(GFX_AUX_TABLE_BASE_ADDR_num),
5428                            base_addr);
5429}
5430#endif
5431
/**
 * Push constant buffer ranges gathered for one shader stage.
 *
 * Filled in by setup_constant_buffers() and consumed by the
 * 3DSTATE_CONSTANT_* emitters.
 */
struct push_bos {
   /** The ranges to push; only the first buffer_count entries are valid. */
   struct {
      /** Address the range is read from. */
      struct iris_address addr;
      /** Read length, copied from brw_ubo_range::length
       *  (presumably in 32-byte units -- confirm).
       */
      uint32_t length;
   } buffers[4];
   /** Number of valid entries in buffers[]. */
   int buffer_count;
   /** Largest length seen so far; setup_constant_buffers() only ever
    *  raises it, so the caller is expected to initialize it.
    */
   uint32_t max_length;
};
5440
5441static void
5442setup_constant_buffers(struct iris_context *ice,
5443                       struct iris_batch *batch,
5444                       int stage,
5445                       struct push_bos *push_bos)
5446{
5447   struct iris_shader_state *shs = &ice->state.shaders[stage];
5448   struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5449   struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
5450
5451   uint32_t push_range_sum = 0;
5452
5453   int n = 0;
5454   for (int i = 0; i < 4; i++) {
5455      const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
5456
5457      if (range->length == 0)
5458         continue;
5459
5460      push_range_sum += range->length;
5461
5462      if (range->length > push_bos->max_length)
5463         push_bos->max_length = range->length;
5464
5465      /* Range block is a binding table index, map back to UBO index. */
5466      unsigned block_index = iris_bti_to_group_index(
5467         &shader->bt, IRIS_SURFACE_GROUP_UBO, range->block);
5468      assert(block_index != IRIS_SURFACE_NOT_USED);
5469
5470      struct pipe_shader_buffer *cbuf = &shs->constbuf[block_index];
5471      struct iris_resource *res = (void *) cbuf->buffer;
5472
5473      assert(cbuf->buffer_offset % 32 == 0);
5474
5475      push_bos->buffers[n].length = range->length;
5476      push_bos->buffers[n].addr =
5477         res ? ro_bo(res->bo, range->start * 32 + cbuf->buffer_offset)
5478         : batch->screen->workaround_address;
5479      n++;
5480   }
5481
5482   /* From the 3DSTATE_CONSTANT_XS and 3DSTATE_CONSTANT_ALL programming notes:
5483    *
5484    *    "The sum of all four read length fields must be less than or
5485    *    equal to the size of 64."
5486    */
5487   assert(push_range_sum <= 64);
5488
5489   push_bos->buffer_count = n;
5490}
5491
5492static void
5493emit_push_constant_packets(struct iris_context *ice,
5494                           struct iris_batch *batch,
5495                           int stage,
5496                           const struct push_bos *push_bos)
5497{
5498   UNUSED struct isl_device *isl_dev = &batch->screen->isl_dev;
5499   struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5500   struct brw_stage_prog_data *prog_data = (void *) shader->prog_data;
5501
5502   iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_VS), pkt) {
5503      pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
5504#if GFX_VER >= 12
5505      pkt.MOCS = isl_mocs(isl_dev, 0, false);
5506#endif
5507      if (prog_data) {
5508         /* The Skylake PRM contains the following restriction:
5509          *
5510          *    "The driver must ensure The following case does not occur
5511          *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
5512          *     buffer 3 read length equal to zero committed followed by a
5513          *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
5514          *     zero committed."
5515          *
5516          * To avoid this, we program the buffers in the highest slots.
5517          * This way, slot 0 is only used if slot 3 is also used.
5518          */
5519         int n = push_bos->buffer_count;
5520         assert(n <= 4);
5521         const unsigned shift = 4 - n;
5522         for (int i = 0; i < n; i++) {
5523            pkt.ConstantBody.ReadLength[i + shift] =
5524               push_bos->buffers[i].length;
5525            pkt.ConstantBody.Buffer[i + shift] = push_bos->buffers[i].addr;
5526         }
5527      }
5528   }
5529}
5530
5531#if GFX_VER >= 12
5532static void
5533emit_push_constant_packet_all(struct iris_context *ice,
5534                              struct iris_batch *batch,
5535                              uint32_t shader_mask,
5536                              const struct push_bos *push_bos)
5537{
5538   struct isl_device *isl_dev = &batch->screen->isl_dev;
5539
5540   if (!push_bos) {
5541      iris_emit_cmd(batch, GENX(3DSTATE_CONSTANT_ALL), pc) {
5542         pc.ShaderUpdateEnable = shader_mask;
5543      }
5544      return;
5545   }
5546
5547   const uint32_t n = push_bos->buffer_count;
5548   const uint32_t max_pointers = 4;
5549   const uint32_t num_dwords = 2 + 2 * n;
5550   uint32_t const_all[2 + 2 * max_pointers];
5551   uint32_t *dw = &const_all[0];
5552
5553   assert(n <= max_pointers);
5554   iris_pack_command(GENX(3DSTATE_CONSTANT_ALL), dw, all) {
5555      all.DWordLength = num_dwords - 2;
5556      all.MOCS = isl_mocs(isl_dev, 0, false);
5557      all.ShaderUpdateEnable = shader_mask;
5558      all.PointerBufferMask = (1 << n) - 1;
5559   }
5560   dw += 2;
5561
5562   for (int i = 0; i < n; i++) {
5563      _iris_pack_state(batch, GENX(3DSTATE_CONSTANT_ALL_DATA),
5564                       dw + i * 2, data) {
5565         data.PointerToConstantBuffer = push_bos->buffers[i].addr;
5566         data.ConstantBufferReadLength = push_bos->buffers[i].length;
5567      }
5568   }
5569   iris_batch_emit(batch, const_all, sizeof(uint32_t) * num_dwords);
5570}
5571#endif
5572
5573void
5574genX(emit_depth_state_workarounds)(struct iris_context *ice,
5575                                   struct iris_batch *batch,
5576                                   const struct isl_surf *surf)
5577{
5578#if GFX_VERx10 == 120
5579   const bool fmt_is_d16 = surf->format == ISL_FORMAT_R16_UNORM;
5580
5581   switch (ice->state.genx->depth_reg_mode) {
5582   case IRIS_DEPTH_REG_MODE_HW_DEFAULT:
5583      if (!fmt_is_d16)
5584         return;
5585      break;
5586   case IRIS_DEPTH_REG_MODE_D16:
5587      if (fmt_is_d16)
5588         return;
5589      break;
5590   case IRIS_DEPTH_REG_MODE_UNKNOWN:
5591      break;
5592   }
5593
5594   /* We'll change some CHICKEN registers depending on the depth surface
5595    * format. Do a depth flush and stall so the pipeline is not using these
5596    * settings while we change the registers.
5597    */
5598   iris_emit_end_of_pipe_sync(batch,
5599                              "Workaround: Stop pipeline for 14010455700",
5600                              PIPE_CONTROL_DEPTH_STALL |
5601                              PIPE_CONTROL_DEPTH_CACHE_FLUSH);
5602
5603   /* Wa_14010455700
5604    *
5605    * To avoid sporadic corruptions “Set 0x7010[9] when Depth Buffer
5606    * Surface Format is D16_UNORM , surface type is not NULL & 1X_MSAA”.
5607    */
5608   iris_emit_reg(batch, GENX(COMMON_SLICE_CHICKEN1), reg) {
5609      reg.HIZPlaneOptimizationdisablebit = fmt_is_d16 && surf->samples == 1;
5610      reg.HIZPlaneOptimizationdisablebitMask = true;
5611   }
5612
5613   /* Wa_1806527549
5614    *
5615    * Set HIZ_CHICKEN (7018h) bit 13 = 1 when depth buffer is D16_UNORM.
5616    */
5617   iris_emit_reg(batch, GENX(HIZ_CHICKEN), reg) {
5618      reg.HZDepthTestLEGEOptimizationDisable = fmt_is_d16;
5619      reg.HZDepthTestLEGEOptimizationDisableMask = true;
5620   }
5621
5622   ice->state.genx->depth_reg_mode =
5623      fmt_is_d16 ? IRIS_DEPTH_REG_MODE_D16 : IRIS_DEPTH_REG_MODE_HW_DEFAULT;
5624#endif
5625}
5626
5627static void
5628iris_upload_dirty_render_state(struct iris_context *ice,
5629                               struct iris_batch *batch,
5630                               const struct pipe_draw_info *draw)
5631{
5632   const uint64_t dirty = ice->state.dirty;
5633   const uint64_t stage_dirty = ice->state.stage_dirty;
5634
5635   if (!(dirty & IRIS_ALL_DIRTY_FOR_RENDER) &&
5636       !(stage_dirty & IRIS_ALL_STAGE_DIRTY_FOR_RENDER))
5637      return;
5638
5639   struct iris_genx_state *genx = ice->state.genx;
5640   struct iris_binder *binder = &ice->state.binder;
5641   struct brw_wm_prog_data *wm_prog_data = (void *)
5642      ice->shaders.prog[MESA_SHADER_FRAGMENT]->prog_data;
5643
5644   if (dirty & IRIS_DIRTY_CC_VIEWPORT) {
5645      const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
5646      uint32_t cc_vp_address;
5647
5648      /* XXX: could avoid streaming for depth_clip [0,1] case. */
5649      uint32_t *cc_vp_map =
5650         stream_state(batch, ice->state.dynamic_uploader,
5651                      &ice->state.last_res.cc_vp,
5652                      4 * ice->state.num_viewports *
5653                      GENX(CC_VIEWPORT_length), 32, &cc_vp_address);
5654      for (int i = 0; i < ice->state.num_viewports; i++) {
5655         float zmin, zmax;
5656         iris_viewport_zmin_zmax(&ice->state.viewports[i], cso_rast->clip_halfz,
5657                                 ice->state.window_space_position,
5658                                 &zmin, &zmax);
5659         if (cso_rast->depth_clip_near)
5660            zmin = 0.0;
5661         if (cso_rast->depth_clip_far)
5662            zmax = 1.0;
5663
5664         iris_pack_state(GENX(CC_VIEWPORT), cc_vp_map, ccv) {
5665            ccv.MinimumDepth = zmin;
5666            ccv.MaximumDepth = zmax;
5667         }
5668
5669         cc_vp_map += GENX(CC_VIEWPORT_length);
5670      }
5671
5672      iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
5673         ptr.CCViewportPointer = cc_vp_address;
5674      }
5675   }
5676
5677   if (dirty & IRIS_DIRTY_SF_CL_VIEWPORT) {
5678      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5679      uint32_t sf_cl_vp_address;
5680      uint32_t *vp_map =
5681         stream_state(batch, ice->state.dynamic_uploader,
5682                      &ice->state.last_res.sf_cl_vp,
5683                      4 * ice->state.num_viewports *
5684                      GENX(SF_CLIP_VIEWPORT_length), 64, &sf_cl_vp_address);
5685
5686      for (unsigned i = 0; i < ice->state.num_viewports; i++) {
5687         const struct pipe_viewport_state *state = &ice->state.viewports[i];
5688         float gb_xmin, gb_xmax, gb_ymin, gb_ymax;
5689
5690         float vp_xmin = viewport_extent(state, 0, -1.0f);
5691         float vp_xmax = viewport_extent(state, 0,  1.0f);
5692         float vp_ymin = viewport_extent(state, 1, -1.0f);
5693         float vp_ymax = viewport_extent(state, 1,  1.0f);
5694
5695         intel_calculate_guardband_size(cso_fb->width, cso_fb->height,
5696                                        state->scale[0], state->scale[1],
5697                                        state->translate[0], state->translate[1],
5698                                        &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
5699
5700         iris_pack_state(GENX(SF_CLIP_VIEWPORT), vp_map, vp) {
5701            vp.ViewportMatrixElementm00 = state->scale[0];
5702            vp.ViewportMatrixElementm11 = state->scale[1];
5703            vp.ViewportMatrixElementm22 = state->scale[2];
5704            vp.ViewportMatrixElementm30 = state->translate[0];
5705            vp.ViewportMatrixElementm31 = state->translate[1];
5706            vp.ViewportMatrixElementm32 = state->translate[2];
5707            vp.XMinClipGuardband = gb_xmin;
5708            vp.XMaxClipGuardband = gb_xmax;
5709            vp.YMinClipGuardband = gb_ymin;
5710            vp.YMaxClipGuardband = gb_ymax;
5711            vp.XMinViewPort = MAX2(vp_xmin, 0);
5712            vp.XMaxViewPort = MIN2(vp_xmax, cso_fb->width) - 1;
5713            vp.YMinViewPort = MAX2(vp_ymin, 0);
5714            vp.YMaxViewPort = MIN2(vp_ymax, cso_fb->height) - 1;
5715         }
5716
5717         vp_map += GENX(SF_CLIP_VIEWPORT_length);
5718      }
5719
5720      iris_emit_cmd(batch, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
5721         ptr.SFClipViewportPointer = sf_cl_vp_address;
5722      }
5723   }
5724
5725   if (dirty & IRIS_DIRTY_URB) {
5726      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
5727         if (!ice->shaders.prog[i]) {
5728            ice->shaders.urb.size[i] = 1;
5729         } else {
5730            struct brw_vue_prog_data *vue_prog_data =
5731               (void *) ice->shaders.prog[i]->prog_data;
5732            ice->shaders.urb.size[i] = vue_prog_data->urb_entry_size;
5733         }
5734         assert(ice->shaders.urb.size[i] != 0);
5735      }
5736
5737      intel_get_urb_config(&batch->screen->devinfo,
5738                           batch->screen->l3_config_3d,
5739                           ice->shaders.prog[MESA_SHADER_TESS_EVAL] != NULL,
5740                           ice->shaders.prog[MESA_SHADER_GEOMETRY] != NULL,
5741                           ice->shaders.urb.size,
5742                           ice->shaders.urb.entries,
5743                           ice->shaders.urb.start,
5744                           &ice->state.urb_deref_block_size,
5745                           &ice->shaders.urb.constrained);
5746
5747      for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
5748         iris_emit_cmd(batch, GENX(3DSTATE_URB_VS), urb) {
5749            urb._3DCommandSubOpcode += i;
5750            urb.VSURBStartingAddress     = ice->shaders.urb.start[i];
5751            urb.VSURBEntryAllocationSize = ice->shaders.urb.size[i] - 1;
5752            urb.VSNumberofURBEntries     = ice->shaders.urb.entries[i];
5753         }
5754      }
5755   }
5756
5757   if (dirty & IRIS_DIRTY_BLEND_STATE) {
5758      struct iris_blend_state *cso_blend = ice->state.cso_blend;
5759      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5760      struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
5761      const int header_dwords = GENX(BLEND_STATE_length);
5762
5763      /* Always write at least one BLEND_STATE - the final RT message will
5764       * reference BLEND_STATE[0] even if there aren't color writes.  There
5765       * may still be alpha testing, computed depth, and so on.
5766       */
5767      const int rt_dwords =
5768         MAX2(cso_fb->nr_cbufs, 1) * GENX(BLEND_STATE_ENTRY_length);
5769
5770      uint32_t blend_offset;
5771      uint32_t *blend_map =
5772         stream_state(batch, ice->state.dynamic_uploader,
5773                      &ice->state.last_res.blend,
5774                      4 * (header_dwords + rt_dwords), 64, &blend_offset);
5775
5776      uint32_t blend_state_header;
5777      iris_pack_state(GENX(BLEND_STATE), &blend_state_header, bs) {
5778         bs.AlphaTestEnable = cso_zsa->alpha_enabled;
5779         bs.AlphaTestFunction = translate_compare_func(cso_zsa->alpha_func);
5780      }
5781
5782      blend_map[0] = blend_state_header | cso_blend->blend_state[0];
5783      memcpy(&blend_map[1], &cso_blend->blend_state[1], 4 * rt_dwords);
5784
5785      iris_emit_cmd(batch, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
5786         ptr.BlendStatePointer = blend_offset;
5787         ptr.BlendStatePointerValid = true;
5788      }
5789   }
5790
5791   if (dirty & IRIS_DIRTY_COLOR_CALC_STATE) {
5792      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
5793#if GFX_VER == 8
5794      struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
5795#endif
5796      uint32_t cc_offset;
5797      void *cc_map =
5798         stream_state(batch, ice->state.dynamic_uploader,
5799                      &ice->state.last_res.color_calc,
5800                      sizeof(uint32_t) * GENX(COLOR_CALC_STATE_length),
5801                      64, &cc_offset);
5802      iris_pack_state(GENX(COLOR_CALC_STATE), cc_map, cc) {
5803         cc.AlphaTestFormat = ALPHATEST_FLOAT32;
5804         cc.AlphaReferenceValueAsFLOAT32 = cso->alpha_ref_value;
5805         cc.BlendConstantColorRed   = ice->state.blend_color.color[0];
5806         cc.BlendConstantColorGreen = ice->state.blend_color.color[1];
5807         cc.BlendConstantColorBlue  = ice->state.blend_color.color[2];
5808         cc.BlendConstantColorAlpha = ice->state.blend_color.color[3];
5809#if GFX_VER == 8
5810	 cc.StencilReferenceValue = p_stencil_refs->ref_value[0];
5811	 cc.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
5812#endif
5813      }
5814      iris_emit_cmd(batch, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
5815         ptr.ColorCalcStatePointer = cc_offset;
5816         ptr.ColorCalcStatePointerValid = true;
5817      }
5818   }
5819
5820   /* Wa_1604061319
5821    *
5822    *    3DSTATE_CONSTANT_* needs to be programmed before BTP_*
5823    *
5824    * Testing shows that all the 3DSTATE_CONSTANT_XS need to be emitted if
5825    * any stage has a dirty binding table.
5826    */
5827   const bool emit_const_wa = GFX_VER >= 11 &&
5828      ((dirty & IRIS_DIRTY_RENDER_BUFFER) ||
5829       (stage_dirty & IRIS_ALL_STAGE_DIRTY_BINDINGS_FOR_RENDER));
5830
5831#if GFX_VER >= 12
5832   uint32_t nobuffer_stages = 0;
5833#endif
5834
5835   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5836      if (!(stage_dirty & (IRIS_STAGE_DIRTY_CONSTANTS_VS << stage)) &&
5837          !emit_const_wa)
5838         continue;
5839
5840      struct iris_shader_state *shs = &ice->state.shaders[stage];
5841      struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5842
5843      if (!shader)
5844         continue;
5845
5846      if (shs->sysvals_need_upload)
5847         upload_sysvals(ice, stage, NULL);
5848
5849      struct push_bos push_bos = {};
5850      setup_constant_buffers(ice, batch, stage, &push_bos);
5851
5852#if GFX_VER >= 12
5853      /* If this stage doesn't have any push constants, emit it later in a
5854       * single CONSTANT_ALL packet with all the other stages.
5855       */
5856      if (push_bos.buffer_count == 0) {
5857         nobuffer_stages |= 1 << stage;
5858         continue;
5859      }
5860
5861      /* The Constant Buffer Read Length field from 3DSTATE_CONSTANT_ALL
5862       * contains only 5 bits, so we can only use it for buffers smaller than
5863       * 32.
5864       */
5865      if (push_bos.max_length < 32) {
5866         emit_push_constant_packet_all(ice, batch, 1 << stage, &push_bos);
5867         continue;
5868      }
5869#endif
5870      emit_push_constant_packets(ice, batch, stage, &push_bos);
5871   }
5872
5873#if GFX_VER >= 12
5874   if (nobuffer_stages)
5875      emit_push_constant_packet_all(ice, batch, nobuffer_stages, NULL);
5876#endif
5877
5878   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5879      /* Gfx9 requires 3DSTATE_BINDING_TABLE_POINTERS_XS to be re-emitted
5880       * in order to commit constants.  TODO: Investigate "Disable Gather
5881       * at Set Shader" to go back to legacy mode...
5882       */
5883      if (stage_dirty & ((IRIS_STAGE_DIRTY_BINDINGS_VS |
5884                          (GFX_VER == 9 ? IRIS_STAGE_DIRTY_CONSTANTS_VS : 0))
5885                            << stage)) {
5886         iris_emit_cmd(batch, GENX(3DSTATE_BINDING_TABLE_POINTERS_VS), ptr) {
5887            ptr._3DCommandSubOpcode = 38 + stage;
5888            ptr.PointertoVSBindingTable = binder->bt_offset[stage];
5889         }
5890      }
5891   }
5892
5893   if (GFX_VER >= 11 && (dirty & IRIS_DIRTY_RENDER_BUFFER)) {
5894      // XXX: we may want to flag IRIS_DIRTY_MULTISAMPLE (or SAMPLE_MASK?)
5895      // XXX: see commit 979fc1bc9bcc64027ff2cfafd285676f31b930a6
5896
5897      /* The PIPE_CONTROL command description says:
5898       *
5899       *   "Whenever a Binding Table Index (BTI) used by a Render Target
5900       *    Message points to a different RENDER_SURFACE_STATE, SW must issue a
5901       *    Render Target Cache Flush by enabling this bit. When render target
5902       *    flush is set due to new association of BTI, PS Scoreboard Stall bit
5903       *    must be set in this packet."
5904       */
5905      // XXX: does this need to happen at 3DSTATE_BTP_PS time?
5906      iris_emit_pipe_control_flush(batch, "workaround: RT BTI change [draw]",
5907                                   PIPE_CONTROL_RENDER_TARGET_FLUSH |
5908                                   PIPE_CONTROL_STALL_AT_SCOREBOARD);
5909   }
5910
5911   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5912      if (stage_dirty & (IRIS_STAGE_DIRTY_BINDINGS_VS << stage)) {
5913         iris_populate_binding_table(ice, batch, stage, false);
5914      }
5915   }
5916
5917   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5918      if (!(stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_VS << stage)) ||
5919          !ice->shaders.prog[stage])
5920         continue;
5921
5922      iris_upload_sampler_states(ice, stage);
5923
5924      struct iris_shader_state *shs = &ice->state.shaders[stage];
5925      struct pipe_resource *res = shs->sampler_table.res;
5926      if (res)
5927         iris_use_pinned_bo(batch, iris_resource_bo(res), false,
5928                            IRIS_DOMAIN_NONE);
5929
5930      iris_emit_cmd(batch, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
5931         ptr._3DCommandSubOpcode = 43 + stage;
5932         ptr.PointertoVSSamplerState = shs->sampler_table.offset;
5933      }
5934   }
5935
5936   if (ice->state.need_border_colors)
5937      iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false,
5938                         IRIS_DOMAIN_NONE);
5939
5940   if (dirty & IRIS_DIRTY_MULTISAMPLE) {
5941      iris_emit_cmd(batch, GENX(3DSTATE_MULTISAMPLE), ms) {
5942         ms.PixelLocation =
5943            ice->state.cso_rast->half_pixel_center ? CENTER : UL_CORNER;
5944         if (ice->state.framebuffer.samples > 0)
5945            ms.NumberofMultisamples = ffs(ice->state.framebuffer.samples) - 1;
5946      }
5947   }
5948
5949   if (dirty & IRIS_DIRTY_SAMPLE_MASK) {
5950      iris_emit_cmd(batch, GENX(3DSTATE_SAMPLE_MASK), ms) {
5951         ms.SampleMask = ice->state.sample_mask;
5952      }
5953   }
5954
5955   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
5956      if (!(stage_dirty & (IRIS_STAGE_DIRTY_VS << stage)))
5957         continue;
5958
5959      struct iris_compiled_shader *shader = ice->shaders.prog[stage];
5960
5961      if (shader) {
5962         struct brw_stage_prog_data *prog_data = shader->prog_data;
5963         struct iris_resource *cache = (void *) shader->assembly.res;
5964         iris_use_pinned_bo(batch, cache->bo, false, IRIS_DOMAIN_NONE);
5965
5966         uint32_t scratch_addr =
5967            pin_scratch_space(ice, batch, prog_data, stage);
5968
5969         if (stage == MESA_SHADER_FRAGMENT) {
5970            UNUSED struct iris_rasterizer_state *cso = ice->state.cso_rast;
5971            struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
5972
5973            uint32_t ps_state[GENX(3DSTATE_PS_length)] = {0};
5974            _iris_pack_command(batch, GENX(3DSTATE_PS), ps_state, ps) {
5975               ps._8PixelDispatchEnable = wm_prog_data->dispatch_8;
5976               ps._16PixelDispatchEnable = wm_prog_data->dispatch_16;
5977               ps._32PixelDispatchEnable = wm_prog_data->dispatch_32;
5978
5979              /* The docs for 3DSTATE_PS::32 Pixel Dispatch Enable say:
5980               *
5981               *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16,
5982               *     SIMD32 Dispatch must not be enabled for PER_PIXEL dispatch
5983               *     mode."
5984               *
5985               * 16x MSAA only exists on Gfx9+, so we can skip this on Gfx8.
5986               */
5987               if (GFX_VER >= 9 && cso_fb->samples == 16 &&
5988                   !wm_prog_data->persample_dispatch) {
5989                  assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
5990                  ps._32PixelDispatchEnable = false;
5991               }
5992
5993               ps.DispatchGRFStartRegisterForConstantSetupData0 =
5994                  brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
5995               ps.DispatchGRFStartRegisterForConstantSetupData1 =
5996                  brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
5997               ps.DispatchGRFStartRegisterForConstantSetupData2 =
5998                  brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
5999
6000               ps.KernelStartPointer0 = KSP(shader) +
6001                  brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
6002               ps.KernelStartPointer1 = KSP(shader) +
6003                  brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
6004               ps.KernelStartPointer2 = KSP(shader) +
6005                  brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
6006
6007#if GFX_VERx10 >= 125
6008               ps.ScratchSpaceBuffer = scratch_addr >> 4;
6009#else
6010               ps.ScratchSpaceBasePointer =
6011                  rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
6012#endif
6013            }
6014
6015            uint32_t psx_state[GENX(3DSTATE_PS_EXTRA_length)] = {0};
6016            iris_pack_command(GENX(3DSTATE_PS_EXTRA), psx_state, psx) {
6017#if GFX_VER >= 9
6018               if (!wm_prog_data->uses_sample_mask)
6019                  psx.InputCoverageMaskState  = ICMS_NONE;
6020               else if (wm_prog_data->post_depth_coverage)
6021                  psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
6022               else if (wm_prog_data->inner_coverage &&
6023                        cso->conservative_rasterization)
6024                  psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
6025               else
6026                  psx.InputCoverageMaskState = ICMS_NORMAL;
6027#else
6028               psx.PixelShaderUsesInputCoverageMask =
6029                  wm_prog_data->uses_sample_mask;
6030#endif
6031            }
6032
6033            uint32_t *shader_ps = (uint32_t *) shader->derived_data;
6034            uint32_t *shader_psx = shader_ps + GENX(3DSTATE_PS_length);
6035            iris_emit_merge(batch, shader_ps, ps_state,
6036                            GENX(3DSTATE_PS_length));
6037            iris_emit_merge(batch, shader_psx, psx_state,
6038                            GENX(3DSTATE_PS_EXTRA_length));
6039         } else if (scratch_addr) {
6040            uint32_t *pkt = (uint32_t *) shader->derived_data;
6041            switch (stage) {
6042            case MESA_SHADER_VERTEX:    MERGE_SCRATCH_ADDR(3DSTATE_VS); break;
6043            case MESA_SHADER_TESS_CTRL: MERGE_SCRATCH_ADDR(3DSTATE_HS); break;
6044            case MESA_SHADER_TESS_EVAL: MERGE_SCRATCH_ADDR(3DSTATE_DS); break;
6045            case MESA_SHADER_GEOMETRY:  MERGE_SCRATCH_ADDR(3DSTATE_GS); break;
6046            }
6047         } else {
6048            iris_batch_emit(batch, shader->derived_data,
6049                            iris_derived_program_state_size(stage));
6050         }
6051      } else {
6052         if (stage == MESA_SHADER_TESS_EVAL) {
6053            iris_emit_cmd(batch, GENX(3DSTATE_HS), hs);
6054            iris_emit_cmd(batch, GENX(3DSTATE_TE), te);
6055            iris_emit_cmd(batch, GENX(3DSTATE_DS), ds);
6056         } else if (stage == MESA_SHADER_GEOMETRY) {
6057            iris_emit_cmd(batch, GENX(3DSTATE_GS), gs);
6058         }
6059      }
6060   }
6061
6062   if (ice->state.streamout_active) {
6063      if (dirty & IRIS_DIRTY_SO_BUFFERS) {
6064         for (int i = 0; i < 4; i++) {
6065            struct iris_stream_output_target *tgt =
6066               (void *) ice->state.so_target[i];
6067            const uint32_t dwords = GENX(3DSTATE_SO_BUFFER_length);
6068            uint32_t *so_buffers = genx->so_buffers + i * dwords;
6069            bool zero_offset = false;
6070
6071            if (tgt) {
6072               zero_offset = tgt->zero_offset;
6073               iris_use_pinned_bo(batch, iris_resource_bo(tgt->base.buffer),
6074                                  true, IRIS_DOMAIN_OTHER_WRITE);
6075               iris_use_pinned_bo(batch, iris_resource_bo(tgt->offset.res),
6076                                  true, IRIS_DOMAIN_OTHER_WRITE);
6077            }
6078
6079            if (zero_offset) {
6080               /* Skip the last DWord which contains "Stream Offset" of
6081                * 0xFFFFFFFF and instead emit a dword of zero directly.
6082                */
6083               STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_StreamOffset_start) ==
6084                             32 * (dwords - 1));
6085               const uint32_t zero = 0;
6086               iris_batch_emit(batch, so_buffers, 4 * (dwords - 1));
6087               iris_batch_emit(batch, &zero, sizeof(zero));
6088               tgt->zero_offset = false;
6089            } else {
6090               iris_batch_emit(batch, so_buffers, 4 * dwords);
6091            }
6092         }
6093      }
6094
6095      if ((dirty & IRIS_DIRTY_SO_DECL_LIST) && ice->state.streamout) {
6096         uint32_t *decl_list =
6097            ice->state.streamout + GENX(3DSTATE_STREAMOUT_length);
6098         iris_batch_emit(batch, decl_list, 4 * ((decl_list[0] & 0xff) + 2));
6099      }
6100
6101      if (dirty & IRIS_DIRTY_STREAMOUT) {
6102         const struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
6103
6104         uint32_t dynamic_sol[GENX(3DSTATE_STREAMOUT_length)];
6105         iris_pack_command(GENX(3DSTATE_STREAMOUT), dynamic_sol, sol) {
6106            sol.SOFunctionEnable = true;
6107            sol.SOStatisticsEnable = true;
6108
6109            sol.RenderingDisable = cso_rast->rasterizer_discard &&
6110                                   !ice->state.prims_generated_query_active;
6111            sol.ReorderMode = cso_rast->flatshade_first ? LEADING : TRAILING;
6112         }
6113
6114         assert(ice->state.streamout);
6115
6116         iris_emit_merge(batch, ice->state.streamout, dynamic_sol,
6117                         GENX(3DSTATE_STREAMOUT_length));
6118      }
6119   } else {
6120      if (dirty & IRIS_DIRTY_STREAMOUT) {
6121         iris_emit_cmd(batch, GENX(3DSTATE_STREAMOUT), sol);
6122      }
6123   }
6124
6125   if (dirty & IRIS_DIRTY_CLIP) {
6126      struct iris_rasterizer_state *cso_rast = ice->state.cso_rast;
6127      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6128
6129      bool gs_or_tes = ice->shaders.prog[MESA_SHADER_GEOMETRY] ||
6130                       ice->shaders.prog[MESA_SHADER_TESS_EVAL];
6131      bool points_or_lines = cso_rast->fill_mode_point_or_line ||
6132         (gs_or_tes ? ice->shaders.output_topology_is_points_or_lines
6133                    : ice->state.prim_is_points_or_lines);
6134
6135      uint32_t dynamic_clip[GENX(3DSTATE_CLIP_length)];
6136      iris_pack_command(GENX(3DSTATE_CLIP), &dynamic_clip, cl) {
6137         cl.StatisticsEnable = ice->state.statistics_counters_enabled;
6138         if (cso_rast->rasterizer_discard)
6139            cl.ClipMode = CLIPMODE_REJECT_ALL;
6140         else if (ice->state.window_space_position)
6141            cl.ClipMode = CLIPMODE_ACCEPT_ALL;
6142         else
6143            cl.ClipMode = CLIPMODE_NORMAL;
6144
6145         cl.PerspectiveDivideDisable = ice->state.window_space_position;
6146         cl.ViewportXYClipTestEnable = !points_or_lines;
6147
6148         if (wm_prog_data->barycentric_interp_modes &
6149             BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
6150            cl.NonPerspectiveBarycentricEnable = true;
6151
6152         cl.ForceZeroRTAIndexEnable = cso_fb->layers <= 1;
6153         cl.MaximumVPIndex = ice->state.num_viewports - 1;
6154      }
6155      iris_emit_merge(batch, cso_rast->clip, dynamic_clip,
6156                      ARRAY_SIZE(cso_rast->clip));
6157   }
6158
6159   if (dirty & (IRIS_DIRTY_RASTER | IRIS_DIRTY_URB)) {
6160      struct iris_rasterizer_state *cso = ice->state.cso_rast;
6161      iris_batch_emit(batch, cso->raster, sizeof(cso->raster));
6162
6163      uint32_t dynamic_sf[GENX(3DSTATE_SF_length)];
6164      iris_pack_command(GENX(3DSTATE_SF), &dynamic_sf, sf) {
6165         sf.ViewportTransformEnable = !ice->state.window_space_position;
6166
6167#if GFX_VER >= 12
6168         sf.DerefBlockSize = ice->state.urb_deref_block_size;
6169#endif
6170      }
6171      iris_emit_merge(batch, cso->sf, dynamic_sf,
6172                      ARRAY_SIZE(dynamic_sf));
6173   }
6174
6175   if (dirty & IRIS_DIRTY_WM) {
6176      struct iris_rasterizer_state *cso = ice->state.cso_rast;
6177      uint32_t dynamic_wm[GENX(3DSTATE_WM_length)];
6178
6179      iris_pack_command(GENX(3DSTATE_WM), &dynamic_wm, wm) {
6180         wm.StatisticsEnable = ice->state.statistics_counters_enabled;
6181
6182         wm.BarycentricInterpolationMode =
6183            wm_prog_data->barycentric_interp_modes;
6184
6185         if (wm_prog_data->early_fragment_tests)
6186            wm.EarlyDepthStencilControl = EDSC_PREPS;
6187         else if (wm_prog_data->has_side_effects)
6188            wm.EarlyDepthStencilControl = EDSC_PSEXEC;
6189
6190         /* We could skip this bit if color writes are enabled. */
6191         if (wm_prog_data->has_side_effects || wm_prog_data->uses_kill)
6192            wm.ForceThreadDispatchEnable = ForceON;
6193      }
6194      iris_emit_merge(batch, cso->wm, dynamic_wm, ARRAY_SIZE(cso->wm));
6195   }
6196
6197   if (dirty & IRIS_DIRTY_SBE) {
6198      iris_emit_sbe(batch, ice);
6199   }
6200
6201   if (dirty & IRIS_DIRTY_PS_BLEND) {
6202      struct iris_blend_state *cso_blend = ice->state.cso_blend;
6203      struct iris_depth_stencil_alpha_state *cso_zsa = ice->state.cso_zsa;
6204      const struct shader_info *fs_info =
6205         iris_get_shader_info(ice, MESA_SHADER_FRAGMENT);
6206
6207      uint32_t dynamic_pb[GENX(3DSTATE_PS_BLEND_length)];
6208      iris_pack_command(GENX(3DSTATE_PS_BLEND), &dynamic_pb, pb) {
6209         pb.HasWriteableRT = has_writeable_rt(cso_blend, fs_info);
6210         pb.AlphaTestEnable = cso_zsa->alpha_enabled;
6211
6212         /* The dual source blending docs caution against using SRC1 factors
6213          * when the shader doesn't use a dual source render target write.
6214          * Empirically, this can lead to GPU hangs, and the results are
6215          * undefined anyway, so simply disable blending to avoid the hang.
6216          */
6217         pb.ColorBufferBlendEnable = (cso_blend->blend_enables & 1) &&
6218            (!cso_blend->dual_color_blending || wm_prog_data->dual_src_blend);
6219      }
6220
6221      iris_emit_merge(batch, cso_blend->ps_blend, dynamic_pb,
6222                      ARRAY_SIZE(cso_blend->ps_blend));
6223   }
6224
6225   if (dirty & IRIS_DIRTY_WM_DEPTH_STENCIL) {
6226      struct iris_depth_stencil_alpha_state *cso = ice->state.cso_zsa;
6227#if GFX_VER >= 9 && GFX_VER < 12
6228      struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
6229      uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
6230      iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
6231         wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
6232         wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
6233      }
6234      iris_emit_merge(batch, cso->wmds, stencil_refs, ARRAY_SIZE(cso->wmds));
6235#else
6236      /* Use modify disable fields which allow us to emit packets
6237       * directly instead of merging them later.
6238       */
6239      iris_batch_emit(batch, cso->wmds, sizeof(cso->wmds));
6240#endif
6241
6242#if GFX_VER >= 12
6243      iris_batch_emit(batch, cso->depth_bounds, sizeof(cso->depth_bounds));
6244#endif
6245   }
6246
6247   if (dirty & IRIS_DIRTY_STENCIL_REF) {
6248#if GFX_VER >= 12
6249      /* Use modify disable fields which allow us to emit packets
6250       * directly instead of merging them later.
6251       */
6252      struct pipe_stencil_ref *p_stencil_refs = &ice->state.stencil_ref;
6253      uint32_t stencil_refs[GENX(3DSTATE_WM_DEPTH_STENCIL_length)];
6254      iris_pack_command(GENX(3DSTATE_WM_DEPTH_STENCIL), &stencil_refs, wmds) {
6255         wmds.StencilReferenceValue = p_stencil_refs->ref_value[0];
6256         wmds.BackfaceStencilReferenceValue = p_stencil_refs->ref_value[1];
6257         wmds.StencilTestMaskModifyDisable = true;
6258         wmds.StencilWriteMaskModifyDisable = true;
6259         wmds.StencilStateModifyDisable = true;
6260         wmds.DepthStateModifyDisable = true;
6261      }
6262      iris_batch_emit(batch, stencil_refs, sizeof(stencil_refs));
6263#endif
6264   }
6265
6266   if (dirty & IRIS_DIRTY_SCISSOR_RECT) {
6267      /* Wa_1409725701:
6268       *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
6269       *    stored as an array of up to 16 elements. The location of first
6270       *    element of the array, as specified by Pointer to SCISSOR_RECT,
6271       *    should be aligned to a 64-byte boundary.
6272       */
6273      uint32_t alignment = 64;
6274      uint32_t scissor_offset =
6275         emit_state(batch, ice->state.dynamic_uploader,
6276                    &ice->state.last_res.scissor,
6277                    ice->state.scissors,
6278                    sizeof(struct pipe_scissor_state) *
6279                    ice->state.num_viewports, alignment);
6280
6281      iris_emit_cmd(batch, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
6282         ptr.ScissorRectPointer = scissor_offset;
6283      }
6284   }
6285
6286   if (dirty & IRIS_DIRTY_DEPTH_BUFFER) {
6287      struct iris_depth_buffer_state *cso_z = &ice->state.genx->depth_buffer;
6288
6289      /* Do not emit the cso yet. We may need to update clear params first. */
6290      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6291      struct iris_resource *zres = NULL, *sres = NULL;
6292      if (cso_fb->zsbuf) {
6293         iris_get_depth_stencil_resources(cso_fb->zsbuf->texture,
6294                                          &zres, &sres);
6295      }
6296
6297      if (zres && ice->state.hiz_usage != ISL_AUX_USAGE_NONE) {
6298         uint32_t *clear_params =
6299            cso_z->packets + ARRAY_SIZE(cso_z->packets) -
6300            GENX(3DSTATE_CLEAR_PARAMS_length);
6301
6302         iris_pack_command(GENX(3DSTATE_CLEAR_PARAMS), clear_params, clear) {
6303            clear.DepthClearValueValid = true;
6304            clear.DepthClearValue = zres->aux.clear_color.f32[0];
6305         }
6306      }
6307
6308      iris_batch_emit(batch, cso_z->packets, sizeof(cso_z->packets));
6309
6310      if (zres)
6311         genX(emit_depth_state_workarounds)(ice, batch, &zres->surf);
6312
6313      if (GFX_VER >= 12) {
6314         /* Wa_1408224581
6315          *
6316          * Workaround: Gfx12LP Astep only An additional pipe control with
6317          * post-sync = store dword operation would be required.( w/a is to
6318          * have an additional pipe control after the stencil state whenever
6319          * the surface state bits of this state is changing).
6320          */
6321         iris_emit_pipe_control_write(batch, "WA for stencil state",
6322                                      PIPE_CONTROL_WRITE_IMMEDIATE,
6323                                      batch->screen->workaround_address.bo,
6324                                      batch->screen->workaround_address.offset, 0);
6325      }
6326   }
6327
6328   if (dirty & (IRIS_DIRTY_DEPTH_BUFFER | IRIS_DIRTY_WM_DEPTH_STENCIL)) {
6329      /* Listen for buffer changes, and also write enable changes. */
6330      struct pipe_framebuffer_state *cso_fb = &ice->state.framebuffer;
6331      pin_depth_and_stencil_buffers(batch, cso_fb->zsbuf, ice->state.cso_zsa);
6332   }
6333
6334   if (dirty & IRIS_DIRTY_POLYGON_STIPPLE) {
6335      iris_emit_cmd(batch, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
6336         for (int i = 0; i < 32; i++) {
6337            poly.PatternRow[i] = ice->state.poly_stipple.stipple[i];
6338         }
6339      }
6340   }
6341
6342   if (dirty & IRIS_DIRTY_LINE_STIPPLE) {
6343      struct iris_rasterizer_state *cso = ice->state.cso_rast;
6344      iris_batch_emit(batch, cso->line_stipple, sizeof(cso->line_stipple));
6345   }
6346
6347   if (dirty & IRIS_DIRTY_VF_TOPOLOGY) {
6348      iris_emit_cmd(batch, GENX(3DSTATE_VF_TOPOLOGY), topo) {
6349         topo.PrimitiveTopologyType =
6350            translate_prim_type(draw->mode, ice->state.vertices_per_patch);
6351      }
6352   }
6353
6354   if (dirty & IRIS_DIRTY_VERTEX_BUFFERS) {
6355      int count = util_bitcount64(ice->state.bound_vertex_buffers);
6356      uint64_t dynamic_bound = ice->state.bound_vertex_buffers;
6357
6358      if (ice->state.vs_uses_draw_params) {
6359         assert(ice->draw.draw_params.res);
6360
6361         struct iris_vertex_buffer_state *state =
6362            &(ice->state.genx->vertex_buffers[count]);
6363         pipe_resource_reference(&state->resource, ice->draw.draw_params.res);
6364         struct iris_resource *res = (void *) state->resource;
6365
6366         iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
6367            vb.VertexBufferIndex = count;
6368            vb.AddressModifyEnable = true;
6369            vb.BufferPitch = 0;
6370            vb.BufferSize = res->bo->size - ice->draw.draw_params.offset;
6371            vb.BufferStartingAddress =
6372               ro_bo(NULL, res->bo->address +
6373                           (int) ice->draw.draw_params.offset);
6374            vb.MOCS = iris_mocs(res->bo, &batch->screen->isl_dev,
6375                                ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
6376#if GFX_VER >= 12
6377            vb.L3BypassDisable       = true;
6378#endif
6379         }
6380         dynamic_bound |= 1ull << count;
6381         count++;
6382      }
6383
6384      if (ice->state.vs_uses_derived_draw_params) {
6385         struct iris_vertex_buffer_state *state =
6386            &(ice->state.genx->vertex_buffers[count]);
6387         pipe_resource_reference(&state->resource,
6388                                 ice->draw.derived_draw_params.res);
6389         struct iris_resource *res = (void *) ice->draw.derived_draw_params.res;
6390
6391         iris_pack_state(GENX(VERTEX_BUFFER_STATE), state->state, vb) {
6392             vb.VertexBufferIndex = count;
6393            vb.AddressModifyEnable = true;
6394            vb.BufferPitch = 0;
6395            vb.BufferSize =
6396               res->bo->size - ice->draw.derived_draw_params.offset;
6397            vb.BufferStartingAddress =
6398               ro_bo(NULL, res->bo->address +
6399                           (int) ice->draw.derived_draw_params.offset);
6400            vb.MOCS = iris_mocs(res->bo, &batch->screen->isl_dev,
6401                                ISL_SURF_USAGE_VERTEX_BUFFER_BIT);
6402#if GFX_VER >= 12
6403            vb.L3BypassDisable       = true;
6404#endif
6405         }
6406         dynamic_bound |= 1ull << count;
6407         count++;
6408      }
6409
6410      if (count) {
6411#if GFX_VER >= 11
6412         /* Gfx11+ doesn't need the cache workaround below */
6413         uint64_t bound = dynamic_bound;
6414         while (bound) {
6415            const int i = u_bit_scan64(&bound);
6416            iris_use_optional_res(batch, genx->vertex_buffers[i].resource,
6417                                  false, IRIS_DOMAIN_VF_READ);
6418         }
6419#else
6420         /* The VF cache designers cut corners, and made the cache key's
6421          * <VertexBufferIndex, Memory Address> tuple only consider the bottom
6422          * 32 bits of the address.  If you have two vertex buffers which get
6423          * placed exactly 4 GiB apart and use them in back-to-back draw calls,
6424          * you can get collisions (even within a single batch).
6425          *
6426          * So, we need to do a VF cache invalidate if the buffer for a VB
          * slot changes [48:32] address bits from the previous time.
6428          */
6429         unsigned flush_flags = 0;
6430
6431         uint64_t bound = dynamic_bound;
6432         while (bound) {
6433            const int i = u_bit_scan64(&bound);
6434            uint16_t high_bits = 0;
6435
6436            struct iris_resource *res =
6437               (void *) genx->vertex_buffers[i].resource;
6438            if (res) {
6439               iris_use_pinned_bo(batch, res->bo, false, IRIS_DOMAIN_VF_READ);
6440
6441               high_bits = res->bo->address >> 32ull;
6442               if (high_bits != ice->state.last_vbo_high_bits[i]) {
6443                  flush_flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE |
6444                                 PIPE_CONTROL_CS_STALL;
6445                  ice->state.last_vbo_high_bits[i] = high_bits;
6446               }
6447            }
6448         }
6449
6450         if (flush_flags) {
6451            iris_emit_pipe_control_flush(batch,
6452                                         "workaround: VF cache 32-bit key [VB]",
6453                                         flush_flags);
6454         }
6455#endif
6456
6457         const unsigned vb_dwords = GENX(VERTEX_BUFFER_STATE_length);
6458
6459         uint32_t *map =
6460            iris_get_command_space(batch, 4 * (1 + vb_dwords * count));
6461         _iris_pack_command(batch, GENX(3DSTATE_VERTEX_BUFFERS), map, vb) {
6462            vb.DWordLength = (vb_dwords * count + 1) - 2;
6463         }
6464         map += 1;
6465
6466         bound = dynamic_bound;
6467         while (bound) {
6468            const int i = u_bit_scan64(&bound);
6469            memcpy(map, genx->vertex_buffers[i].state,
6470                   sizeof(uint32_t) * vb_dwords);
6471            map += vb_dwords;
6472         }
6473      }
6474   }
6475
6476   if (dirty & IRIS_DIRTY_VERTEX_ELEMENTS) {
6477      struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
6478      const unsigned entries = MAX2(cso->count, 1);
6479      if (!(ice->state.vs_needs_sgvs_element ||
6480            ice->state.vs_uses_derived_draw_params ||
6481            ice->state.vs_needs_edge_flag)) {
6482         iris_batch_emit(batch, cso->vertex_elements, sizeof(uint32_t) *
6483                         (1 + entries * GENX(VERTEX_ELEMENT_STATE_length)));
6484      } else {
6485         uint32_t dynamic_ves[1 + 33 * GENX(VERTEX_ELEMENT_STATE_length)];
6486         const unsigned dyn_count = cso->count +
6487            ice->state.vs_needs_sgvs_element +
6488            ice->state.vs_uses_derived_draw_params;
6489
6490         iris_pack_command(GENX(3DSTATE_VERTEX_ELEMENTS),
6491                           &dynamic_ves, ve) {
6492            ve.DWordLength =
6493               1 + GENX(VERTEX_ELEMENT_STATE_length) * dyn_count - 2;
6494         }
6495         memcpy(&dynamic_ves[1], &cso->vertex_elements[1],
6496                (cso->count - ice->state.vs_needs_edge_flag) *
6497                GENX(VERTEX_ELEMENT_STATE_length) * sizeof(uint32_t));
6498         uint32_t *ve_pack_dest =
6499            &dynamic_ves[1 + (cso->count - ice->state.vs_needs_edge_flag) *
6500                         GENX(VERTEX_ELEMENT_STATE_length)];
6501
6502         if (ice->state.vs_needs_sgvs_element) {
6503            uint32_t base_ctrl = ice->state.vs_uses_draw_params ?
6504                                 VFCOMP_STORE_SRC : VFCOMP_STORE_0;
6505            iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
6506               ve.Valid = true;
6507               ve.VertexBufferIndex =
6508                  util_bitcount64(ice->state.bound_vertex_buffers);
6509               ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
6510               ve.Component0Control = base_ctrl;
6511               ve.Component1Control = base_ctrl;
6512               ve.Component2Control = VFCOMP_STORE_0;
6513               ve.Component3Control = VFCOMP_STORE_0;
6514            }
6515            ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
6516         }
6517         if (ice->state.vs_uses_derived_draw_params) {
6518            iris_pack_state(GENX(VERTEX_ELEMENT_STATE), ve_pack_dest, ve) {
6519               ve.Valid = true;
6520               ve.VertexBufferIndex =
6521                  util_bitcount64(ice->state.bound_vertex_buffers) +
6522                  ice->state.vs_uses_draw_params;
6523               ve.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
6524               ve.Component0Control = VFCOMP_STORE_SRC;
6525               ve.Component1Control = VFCOMP_STORE_SRC;
6526               ve.Component2Control = VFCOMP_STORE_0;
6527               ve.Component3Control = VFCOMP_STORE_0;
6528            }
6529            ve_pack_dest += GENX(VERTEX_ELEMENT_STATE_length);
6530         }
6531         if (ice->state.vs_needs_edge_flag) {
6532            for (int i = 0; i < GENX(VERTEX_ELEMENT_STATE_length);  i++)
6533               ve_pack_dest[i] = cso->edgeflag_ve[i];
6534         }
6535
6536         iris_batch_emit(batch, &dynamic_ves, sizeof(uint32_t) *
6537                         (1 + dyn_count * GENX(VERTEX_ELEMENT_STATE_length)));
6538      }
6539
6540      if (!ice->state.vs_needs_edge_flag) {
6541         iris_batch_emit(batch, cso->vf_instancing, sizeof(uint32_t) *
6542                         entries * GENX(3DSTATE_VF_INSTANCING_length));
6543      } else {
6544         assert(cso->count > 0);
6545         const unsigned edgeflag_index = cso->count - 1;
6546         uint32_t dynamic_vfi[33 * GENX(3DSTATE_VF_INSTANCING_length)];
6547         memcpy(&dynamic_vfi[0], cso->vf_instancing, edgeflag_index *
6548                GENX(3DSTATE_VF_INSTANCING_length) * sizeof(uint32_t));
6549
6550         uint32_t *vfi_pack_dest = &dynamic_vfi[0] +
6551            edgeflag_index * GENX(3DSTATE_VF_INSTANCING_length);
6552         iris_pack_command(GENX(3DSTATE_VF_INSTANCING), vfi_pack_dest, vi) {
6553            vi.VertexElementIndex = edgeflag_index +
6554               ice->state.vs_needs_sgvs_element +
6555               ice->state.vs_uses_derived_draw_params;
6556         }
6557         for (int i = 0; i < GENX(3DSTATE_VF_INSTANCING_length);  i++)
6558            vfi_pack_dest[i] |= cso->edgeflag_vfi[i];
6559
6560         iris_batch_emit(batch, &dynamic_vfi[0], sizeof(uint32_t) *
6561                         entries * GENX(3DSTATE_VF_INSTANCING_length));
6562      }
6563   }
6564
6565   if (dirty & IRIS_DIRTY_VF_SGVS) {
6566      const struct brw_vs_prog_data *vs_prog_data = (void *)
6567         ice->shaders.prog[MESA_SHADER_VERTEX]->prog_data;
6568      struct iris_vertex_element_state *cso = ice->state.cso_vertex_elements;
6569
6570      iris_emit_cmd(batch, GENX(3DSTATE_VF_SGVS), sgv) {
6571         if (vs_prog_data->uses_vertexid) {
6572            sgv.VertexIDEnable = true;
6573            sgv.VertexIDComponentNumber = 2;
6574            sgv.VertexIDElementOffset =
6575               cso->count - ice->state.vs_needs_edge_flag;
6576         }
6577
6578         if (vs_prog_data->uses_instanceid) {
6579            sgv.InstanceIDEnable = true;
6580            sgv.InstanceIDComponentNumber = 3;
6581            sgv.InstanceIDElementOffset =
6582               cso->count - ice->state.vs_needs_edge_flag;
6583         }
6584      }
6585   }
6586
6587   if (dirty & IRIS_DIRTY_VF) {
6588      iris_emit_cmd(batch, GENX(3DSTATE_VF), vf) {
6589         if (draw->primitive_restart) {
6590            vf.IndexedDrawCutIndexEnable = true;
6591            vf.CutIndex = draw->restart_index;
6592         }
6593      }
6594   }
6595
6596   if (dirty & IRIS_DIRTY_VF_STATISTICS) {
6597      iris_emit_cmd(batch, GENX(3DSTATE_VF_STATISTICS), vf) {
6598         vf.StatisticsEnable = true;
6599      }
6600   }
6601
6602#if GFX_VER == 8
6603   if (dirty & IRIS_DIRTY_PMA_FIX) {
6604      bool enable = want_pma_fix(ice);
6605      genX(update_pma_fix)(ice, batch, enable);
6606   }
6607#endif
6608
6609   if (ice->state.current_hash_scale != 1)
6610      genX(emit_hashing_mode)(ice, batch, UINT_MAX, UINT_MAX, 1);
6611
6612#if GFX_VER >= 12
6613   genX(invalidate_aux_map_state)(batch);
6614#endif
6615}
6616
6617static void
6618flush_vbos(struct iris_context *ice, struct iris_batch *batch)
6619{
6620   struct iris_genx_state *genx = ice->state.genx;
6621   uint64_t bound = ice->state.bound_vertex_buffers;
6622   while (bound) {
6623      const int i = u_bit_scan64(&bound);
6624      struct iris_bo *bo = iris_resource_bo(genx->vertex_buffers[i].resource);
6625      iris_emit_buffer_barrier_for(batch, bo, IRIS_DOMAIN_VF_READ);
6626   }
6627}
6628
6629static void
6630iris_upload_render_state(struct iris_context *ice,
6631                         struct iris_batch *batch,
6632                         const struct pipe_draw_info *draw,
6633                         unsigned drawid_offset,
6634                         const struct pipe_draw_indirect_info *indirect,
6635                         const struct pipe_draw_start_count_bias *sc)
6636{
6637   bool use_predicate = ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT;
6638
6639   if (ice->state.dirty & IRIS_DIRTY_VERTEX_BUFFER_FLUSHES)
6640      flush_vbos(ice, batch);
6641
6642   iris_batch_sync_region_start(batch);
6643
6644   /* Always pin the binder.  If we're emitting new binding table pointers,
6645    * we need it.  If not, we're probably inheriting old tables via the
6646    * context, and need it anyway.  Since true zero-bindings cases are
6647    * practically non-existent, just pin it and avoid last_res tracking.
6648    */
6649   iris_use_pinned_bo(batch, ice->state.binder.bo, false,
6650                      IRIS_DOMAIN_NONE);
6651
6652   if (!batch->contains_draw) {
6653      if (GFX_VER == 12) {
6654         /* Re-emit constants when starting a new batch buffer in order to
6655          * work around push constant corruption on context switch.
6656          *
6657          * XXX - Provide hardware spec quotation when available.
6658          */
6659         ice->state.stage_dirty |= (IRIS_STAGE_DIRTY_CONSTANTS_VS  |
6660                                    IRIS_STAGE_DIRTY_CONSTANTS_TCS |
6661                                    IRIS_STAGE_DIRTY_CONSTANTS_TES |
6662                                    IRIS_STAGE_DIRTY_CONSTANTS_GS  |
6663                                    IRIS_STAGE_DIRTY_CONSTANTS_FS);
6664      }
6665      batch->contains_draw = true;
6666   }
6667
6668   if (!batch->contains_draw_with_next_seqno) {
6669      iris_restore_render_saved_bos(ice, batch, draw);
6670      batch->contains_draw_with_next_seqno = true;
6671   }
6672
6673   iris_upload_dirty_render_state(ice, batch, draw);
6674
6675   if (draw->index_size > 0) {
6676      unsigned offset;
6677
6678      if (draw->has_user_indices) {
6679         unsigned start_offset = draw->index_size * sc->start;
6680
6681         u_upload_data(ice->ctx.const_uploader, start_offset,
6682                       sc->count * draw->index_size, 4,
6683                       (char*)draw->index.user + start_offset,
6684                       &offset, &ice->state.last_res.index_buffer);
6685         offset -= start_offset;
6686      } else {
6687         struct iris_resource *res = (void *) draw->index.resource;
6688         res->bind_history |= PIPE_BIND_INDEX_BUFFER;
6689
6690         pipe_resource_reference(&ice->state.last_res.index_buffer,
6691                                 draw->index.resource);
6692         offset = 0;
6693
6694         iris_emit_buffer_barrier_for(batch, res->bo, IRIS_DOMAIN_VF_READ);
6695      }
6696
6697      struct iris_genx_state *genx = ice->state.genx;
6698      struct iris_bo *bo = iris_resource_bo(ice->state.last_res.index_buffer);
6699
6700      uint32_t ib_packet[GENX(3DSTATE_INDEX_BUFFER_length)];
6701      iris_pack_command(GENX(3DSTATE_INDEX_BUFFER), ib_packet, ib) {
6702         ib.IndexFormat = draw->index_size >> 1;
6703         ib.MOCS = iris_mocs(bo, &batch->screen->isl_dev,
6704                             ISL_SURF_USAGE_INDEX_BUFFER_BIT);
6705         ib.BufferSize = bo->size - offset;
6706         ib.BufferStartingAddress = ro_bo(NULL, bo->address + offset);
6707#if GFX_VER >= 12
6708         ib.L3BypassDisable       = true;
6709#endif
6710      }
6711
6712      if (memcmp(genx->last_index_buffer, ib_packet, sizeof(ib_packet)) != 0) {
6713         memcpy(genx->last_index_buffer, ib_packet, sizeof(ib_packet));
6714         iris_batch_emit(batch, ib_packet, sizeof(ib_packet));
6715         iris_use_pinned_bo(batch, bo, false, IRIS_DOMAIN_VF_READ);
6716      }
6717
6718#if GFX_VER < 11
6719      /* The VF cache key only uses 32-bits, see vertex buffer comment above */
6720      uint16_t high_bits = bo->address >> 32ull;
6721      if (high_bits != ice->state.last_index_bo_high_bits) {
6722         iris_emit_pipe_control_flush(batch,
6723                                      "workaround: VF cache 32-bit key [IB]",
6724                                      PIPE_CONTROL_VF_CACHE_INVALIDATE |
6725                                      PIPE_CONTROL_CS_STALL);
6726         ice->state.last_index_bo_high_bits = high_bits;
6727      }
6728#endif
6729   }
6730
6731#define _3DPRIM_END_OFFSET          0x2420
6732#define _3DPRIM_START_VERTEX        0x2430
6733#define _3DPRIM_VERTEX_COUNT        0x2434
6734#define _3DPRIM_INSTANCE_COUNT      0x2438
6735#define _3DPRIM_START_INSTANCE      0x243C
6736#define _3DPRIM_BASE_VERTEX         0x2440
6737
6738   if (indirect && !indirect->count_from_stream_output) {
6739      if (indirect->indirect_draw_count) {
6740         use_predicate = true;
6741
6742         struct iris_bo *draw_count_bo =
6743            iris_resource_bo(indirect->indirect_draw_count);
6744         unsigned draw_count_offset =
6745            indirect->indirect_draw_count_offset;
6746
6747         if (ice->state.predicate == IRIS_PREDICATE_STATE_USE_BIT) {
6748            struct mi_builder b;
6749            mi_builder_init(&b, &batch->screen->devinfo, batch);
6750
6751            /* comparison = draw id < draw count */
6752            struct mi_value comparison =
6753               mi_ult(&b, mi_imm(drawid_offset),
6754                          mi_mem32(ro_bo(draw_count_bo, draw_count_offset)));
6755
6756            /* predicate = comparison & conditional rendering predicate */
6757            mi_store(&b, mi_reg32(MI_PREDICATE_RESULT),
6758                         mi_iand(&b, comparison, mi_reg32(CS_GPR(15))));
6759         } else {
6760            uint32_t mi_predicate;
6761
6762            /* Upload the id of the current primitive to MI_PREDICATE_SRC1. */
6763            iris_load_register_imm64(batch, MI_PREDICATE_SRC1, drawid_offset);
6764            /* Upload the current draw count from the draw parameters buffer
6765             * to MI_PREDICATE_SRC0.
6766             */
6767            iris_load_register_mem32(batch, MI_PREDICATE_SRC0,
6768                                     draw_count_bo, draw_count_offset);
6769            /* Zero the top 32-bits of MI_PREDICATE_SRC0 */
6770            iris_load_register_imm32(batch, MI_PREDICATE_SRC0 + 4, 0);
6771
6772            if (drawid_offset == 0) {
6773               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOADINV |
6774                              MI_PREDICATE_COMBINEOP_SET |
6775                              MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
6776            } else {
6777               /* While draw_index < draw_count the predicate's result will be
6778                *  (draw_index == draw_count) ^ TRUE = TRUE
6779                * When draw_index == draw_count the result is
6780                *  (TRUE) ^ TRUE = FALSE
6781                * After this all results will be:
6782                *  (FALSE) ^ FALSE = FALSE
6783                */
6784               mi_predicate = MI_PREDICATE | MI_PREDICATE_LOADOP_LOAD |
6785                              MI_PREDICATE_COMBINEOP_XOR |
6786                              MI_PREDICATE_COMPAREOP_SRCS_EQUAL;
6787            }
6788            iris_batch_emit(batch, &mi_predicate, sizeof(uint32_t));
6789         }
6790      }
6791      struct iris_bo *bo = iris_resource_bo(indirect->buffer);
6792      assert(bo);
6793
6794      iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
6795         lrm.RegisterAddress = _3DPRIM_VERTEX_COUNT;
6796         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 0);
6797      }
6798      iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
6799         lrm.RegisterAddress = _3DPRIM_INSTANCE_COUNT;
6800         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 4);
6801      }
6802      iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
6803         lrm.RegisterAddress = _3DPRIM_START_VERTEX;
6804         lrm.MemoryAddress = ro_bo(bo, indirect->offset + 8);
6805      }
6806      if (draw->index_size) {
6807         iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
6808            lrm.RegisterAddress = _3DPRIM_BASE_VERTEX;
6809            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
6810         }
6811         iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
6812            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
6813            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 16);
6814         }
6815      } else {
6816         iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
6817            lrm.RegisterAddress = _3DPRIM_START_INSTANCE;
6818            lrm.MemoryAddress = ro_bo(bo, indirect->offset + 12);
6819         }
6820         iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_IMM), lri) {
6821            lri.RegisterOffset = _3DPRIM_BASE_VERTEX;
6822            lri.DataDWord = 0;
6823         }
6824      }
6825   } else if (indirect && indirect->count_from_stream_output) {
6826      struct iris_stream_output_target *so =
6827         (void *) indirect->count_from_stream_output;
6828
6829      /* XXX: Replace with actual cache tracking */
6830      iris_emit_pipe_control_flush(batch,
6831                                   "draw count from stream output stall",
6832                                   PIPE_CONTROL_CS_STALL);
6833
6834      struct mi_builder b;
6835      mi_builder_init(&b, &batch->screen->devinfo, batch);
6836
6837      struct iris_address addr =
6838         ro_bo(iris_resource_bo(so->offset.res), so->offset.offset);
6839      struct mi_value offset =
6840         mi_iadd_imm(&b, mi_mem32(addr), -so->base.buffer_offset);
6841
6842      mi_store(&b, mi_reg32(_3DPRIM_VERTEX_COUNT),
6843                   mi_udiv32_imm(&b, offset, so->stride));
6844
6845      _iris_emit_lri(batch, _3DPRIM_START_VERTEX, 0);
6846      _iris_emit_lri(batch, _3DPRIM_BASE_VERTEX, 0);
6847      _iris_emit_lri(batch, _3DPRIM_START_INSTANCE, 0);
6848      _iris_emit_lri(batch, _3DPRIM_INSTANCE_COUNT, draw->instance_count);
6849   }
6850
6851   iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_DRAW, draw, indirect, sc);
6852
6853   iris_emit_cmd(batch, GENX(3DPRIMITIVE), prim) {
6854      prim.VertexAccessType = draw->index_size > 0 ? RANDOM : SEQUENTIAL;
6855      prim.PredicateEnable = use_predicate;
6856
6857      if (indirect) {
6858         prim.IndirectParameterEnable = true;
6859      } else {
6860         prim.StartInstanceLocation = draw->start_instance;
6861         prim.InstanceCount = draw->instance_count;
6862         prim.VertexCountPerInstance = sc->count;
6863
6864         prim.StartVertexLocation = sc->start;
6865
6866         if (draw->index_size) {
6867            prim.BaseVertexLocation += sc->index_bias;
6868         }
6869      }
6870   }
6871
6872   iris_batch_sync_region_end(batch);
6873}
6874
6875static void
6876iris_load_indirect_location(struct iris_context *ice,
6877                            struct iris_batch *batch,
6878                            const struct pipe_grid_info *grid)
6879{
6880#define GPGPU_DISPATCHDIMX 0x2500
6881#define GPGPU_DISPATCHDIMY 0x2504
6882#define GPGPU_DISPATCHDIMZ 0x2508
6883
6884   assert(grid->indirect);
6885
6886   struct iris_state_ref *grid_size = &ice->state.grid_size;
6887   struct iris_bo *bo = iris_resource_bo(grid_size->res);
6888   iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
6889      lrm.RegisterAddress = GPGPU_DISPATCHDIMX;
6890      lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 0);
6891   }
6892   iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
6893      lrm.RegisterAddress = GPGPU_DISPATCHDIMY;
6894      lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 4);
6895   }
6896   iris_emit_cmd(batch, GENX(MI_LOAD_REGISTER_MEM), lrm) {
6897      lrm.RegisterAddress = GPGPU_DISPATCHDIMZ;
6898      lrm.MemoryAddress = ro_bo(bo, grid_size->offset + 8);
6899   }
6900}
6901
6902#if GFX_VERx10 >= 125
6903
6904static void
6905iris_upload_compute_walker(struct iris_context *ice,
6906                           struct iris_batch *batch,
6907                           const struct pipe_grid_info *grid)
6908{
6909   const uint64_t stage_dirty = ice->state.stage_dirty;
6910   struct iris_screen *screen = batch->screen;
6911   const struct intel_device_info *devinfo = &screen->devinfo;
6912   struct iris_binder *binder = &ice->state.binder;
6913   struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
6914   struct iris_compiled_shader *shader =
6915      ice->shaders.prog[MESA_SHADER_COMPUTE];
6916   struct brw_stage_prog_data *prog_data = shader->prog_data;
6917   struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
6918   const struct brw_cs_dispatch_info dispatch =
6919      brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);
6920
6921   if (stage_dirty & IRIS_STAGE_DIRTY_CS) {
6922      iris_emit_cmd(batch, GENX(CFE_STATE), cfe) {
6923         cfe.MaximumNumberofThreads =
6924            devinfo->max_cs_threads * devinfo->subslice_total - 1;
6925         if (prog_data->total_scratch > 0) {
6926            cfe.ScratchSpaceBuffer =
6927               iris_get_scratch_surf(ice, prog_data->total_scratch)->offset >> 4;
6928         }
6929      }
6930   }
6931
6932   if (grid->indirect)
6933      iris_load_indirect_location(ice, batch, grid);
6934
6935   iris_emit_cmd(batch, GENX(COMPUTE_WALKER), cw) {
6936      cw.IndirectParameterEnable        = grid->indirect;
6937      cw.SIMDSize                       = dispatch.simd_size / 16;
6938      cw.LocalXMaximum                  = grid->block[0] - 1;
6939      cw.LocalYMaximum                  = grid->block[1] - 1;
6940      cw.LocalZMaximum                  = grid->block[2] - 1;
6941      cw.ThreadGroupIDXDimension        = grid->grid[0];
6942      cw.ThreadGroupIDYDimension        = grid->grid[1];
6943      cw.ThreadGroupIDZDimension        = grid->grid[2];
6944      cw.ExecutionMask                  = dispatch.right_mask;
6945
6946      cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) {
6947         .KernelStartPointer = KSP(shader),
6948         .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
6949         .SharedLocalMemorySize =
6950            encode_slm_size(GFX_VER, prog_data->total_shared),
6951         .NumberOfBarriers = cs_prog_data->uses_barrier,
6952         .SamplerStatePointer = shs->sampler_table.offset,
6953         .BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE],
6954      };
6955
6956      assert(brw_cs_push_const_total_size(cs_prog_data, dispatch.threads) == 0);
6957   }
6958
6959}
6960
6961#else /* #if GFX_VERx10 >= 125 */
6962
6963static void
6964iris_upload_gpgpu_walker(struct iris_context *ice,
6965                         struct iris_batch *batch,
6966                         const struct pipe_grid_info *grid)
6967{
6968   const uint64_t stage_dirty = ice->state.stage_dirty;
6969   struct iris_screen *screen = batch->screen;
6970   const struct intel_device_info *devinfo = &screen->devinfo;
6971   struct iris_binder *binder = &ice->state.binder;
6972   struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
6973   struct iris_uncompiled_shader *ish =
6974      ice->shaders.uncompiled[MESA_SHADER_COMPUTE];
6975   struct iris_compiled_shader *shader =
6976      ice->shaders.prog[MESA_SHADER_COMPUTE];
6977   struct brw_stage_prog_data *prog_data = shader->prog_data;
6978   struct brw_cs_prog_data *cs_prog_data = (void *) prog_data;
6979   const struct brw_cs_dispatch_info dispatch =
6980      brw_cs_get_dispatch_info(devinfo, cs_prog_data, grid->block);
6981
6982   if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
6983       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
6984      /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
6985       *
6986       *   "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
6987       *    the only bits that are changed are scoreboard related: Scoreboard
6988       *    Enable, Scoreboard Type, Scoreboard Mask, Scoreboard Delta.  For
6989       *    these scoreboard related states, a MEDIA_STATE_FLUSH is
6990       *    sufficient."
6991       */
6992      iris_emit_pipe_control_flush(batch,
6993                                   "workaround: stall before MEDIA_VFE_STATE",
6994                                   PIPE_CONTROL_CS_STALL);
6995
6996      iris_emit_cmd(batch, GENX(MEDIA_VFE_STATE), vfe) {
6997         if (prog_data->total_scratch) {
6998            uint32_t scratch_addr =
6999               pin_scratch_space(ice, batch, prog_data, MESA_SHADER_COMPUTE);
7000
7001            vfe.PerThreadScratchSpace = ffs(prog_data->total_scratch) - 11;
7002            vfe.ScratchSpaceBasePointer =
7003               rw_bo(NULL, scratch_addr, IRIS_DOMAIN_NONE);
7004         }
7005
7006         vfe.MaximumNumberofThreads =
7007            devinfo->max_cs_threads * devinfo->subslice_total - 1;
7008#if GFX_VER < 11
7009         vfe.ResetGatewayTimer =
7010            Resettingrelativetimerandlatchingtheglobaltimestamp;
7011#endif
7012#if GFX_VER == 8
7013         vfe.BypassGatewayControl = true;
7014#endif
7015         vfe.NumberofURBEntries = 2;
7016         vfe.URBEntryAllocationSize = 2;
7017
7018         vfe.CURBEAllocationSize =
7019            ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
7020                  cs_prog_data->push.cross_thread.regs, 2);
7021      }
7022   }
7023
7024   /* TODO: Combine subgroup-id with cbuf0 so we can push regular uniforms */
7025   if ((stage_dirty & IRIS_STAGE_DIRTY_CS) ||
7026       cs_prog_data->local_size[0] == 0 /* Variable local group size */) {
7027      uint32_t curbe_data_offset = 0;
7028      assert(cs_prog_data->push.cross_thread.dwords == 0 &&
7029             cs_prog_data->push.per_thread.dwords == 1 &&
7030             cs_prog_data->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID);
7031      const unsigned push_const_size =
7032         brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
7033      uint32_t *curbe_data_map =
7034         stream_state(batch, ice->state.dynamic_uploader,
7035                      &ice->state.last_res.cs_thread_ids,
7036                      ALIGN(push_const_size, 64), 64,
7037                      &curbe_data_offset);
7038      assert(curbe_data_map);
7039      memset(curbe_data_map, 0x5a, ALIGN(push_const_size, 64));
7040      iris_fill_cs_push_const_buffer(cs_prog_data, dispatch.threads,
7041                                     curbe_data_map);
7042
7043      iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) {
7044         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
7045         curbe.CURBEDataStartAddress = curbe_data_offset;
7046      }
7047   }
7048
7049   for (unsigned i = 0; i < IRIS_MAX_GLOBAL_BINDINGS; i++) {
7050      struct pipe_resource *res = ice->state.global_bindings[i];
7051      if (!res)
7052         continue;
7053
7054      iris_use_pinned_bo(batch, iris_resource_bo(res),
7055                         true, IRIS_DOMAIN_NONE);
7056   }
7057
7058   if (stage_dirty & (IRIS_STAGE_DIRTY_SAMPLER_STATES_CS |
7059                      IRIS_STAGE_DIRTY_BINDINGS_CS |
7060                      IRIS_STAGE_DIRTY_CONSTANTS_CS |
7061                      IRIS_STAGE_DIRTY_CS)) {
7062      uint32_t desc[GENX(INTERFACE_DESCRIPTOR_DATA_length)];
7063
7064      iris_pack_state(GENX(INTERFACE_DESCRIPTOR_DATA), desc, idd) {
7065         idd.SharedLocalMemorySize =
7066            encode_slm_size(GFX_VER, ish->kernel_shared_size);
7067         idd.KernelStartPointer =
7068            KSP(shader) + brw_cs_prog_data_prog_offset(cs_prog_data,
7069                                                       dispatch.simd_size);
7070         idd.SamplerStatePointer = shs->sampler_table.offset;
7071         idd.BindingTablePointer = binder->bt_offset[MESA_SHADER_COMPUTE];
7072         idd.NumberofThreadsinGPGPUThreadGroup = dispatch.threads;
7073      }
7074
7075      for (int i = 0; i < GENX(INTERFACE_DESCRIPTOR_DATA_length); i++)
7076         desc[i] |= ((uint32_t *) shader->derived_data)[i];
7077
7078      iris_emit_cmd(batch, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
7079         load.InterfaceDescriptorTotalLength =
7080            GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
7081         load.InterfaceDescriptorDataStartAddress =
7082            emit_state(batch, ice->state.dynamic_uploader,
7083                       &ice->state.last_res.cs_desc, desc, sizeof(desc), 64);
7084      }
7085   }
7086
7087   if (grid->indirect)
7088      iris_load_indirect_location(ice, batch, grid);
7089
7090   iris_measure_snapshot(ice, batch, INTEL_SNAPSHOT_COMPUTE, NULL, NULL, NULL);
7091
7092   iris_emit_cmd(batch, GENX(GPGPU_WALKER), ggw) {
7093      ggw.IndirectParameterEnable    = grid->indirect != NULL;
7094      ggw.SIMDSize                   = dispatch.simd_size / 16;
7095      ggw.ThreadDepthCounterMaximum  = 0;
7096      ggw.ThreadHeightCounterMaximum = 0;
7097      ggw.ThreadWidthCounterMaximum  = dispatch.threads - 1;
7098      ggw.ThreadGroupIDXDimension    = grid->grid[0];
7099      ggw.ThreadGroupIDYDimension    = grid->grid[1];
7100      ggw.ThreadGroupIDZDimension    = grid->grid[2];
7101      ggw.RightExecutionMask         = dispatch.right_mask;
7102      ggw.BottomExecutionMask        = 0xffffffff;
7103   }
7104
7105   iris_emit_cmd(batch, GENX(MEDIA_STATE_FLUSH), msf);
7106}
7107
7108#endif /* #if GFX_VERx10 >= 125 */
7109
7110static void
7111iris_upload_compute_state(struct iris_context *ice,
7112                          struct iris_batch *batch,
7113                          const struct pipe_grid_info *grid)
7114{
7115   const uint64_t stage_dirty = ice->state.stage_dirty;
7116   struct iris_shader_state *shs = &ice->state.shaders[MESA_SHADER_COMPUTE];
7117   struct iris_compiled_shader *shader =
7118      ice->shaders.prog[MESA_SHADER_COMPUTE];
7119
7120   iris_batch_sync_region_start(batch);
7121
7122   /* Always pin the binder.  If we're emitting new binding table pointers,
7123    * we need it.  If not, we're probably inheriting old tables via the
7124    * context, and need it anyway.  Since true zero-bindings cases are
7125    * practically non-existent, just pin it and avoid last_res tracking.
7126    */
7127   iris_use_pinned_bo(batch, ice->state.binder.bo, false, IRIS_DOMAIN_NONE);
7128
7129   if (((stage_dirty & IRIS_STAGE_DIRTY_CONSTANTS_CS) &&
7130        shs->sysvals_need_upload) ||
7131       shader->kernel_input_size > 0)
7132      upload_sysvals(ice, MESA_SHADER_COMPUTE, grid);
7133
7134   if (stage_dirty & IRIS_STAGE_DIRTY_BINDINGS_CS)
7135      iris_populate_binding_table(ice, batch, MESA_SHADER_COMPUTE, false);
7136
7137   if (stage_dirty & IRIS_STAGE_DIRTY_SAMPLER_STATES_CS)
7138      iris_upload_sampler_states(ice, MESA_SHADER_COMPUTE);
7139
7140   iris_use_optional_res(batch, shs->sampler_table.res, false,
7141                         IRIS_DOMAIN_NONE);
7142   iris_use_pinned_bo(batch, iris_resource_bo(shader->assembly.res), false,
7143                      IRIS_DOMAIN_NONE);
7144
7145   if (ice->state.need_border_colors)
7146      iris_use_pinned_bo(batch, ice->state.border_color_pool.bo, false,
7147                         IRIS_DOMAIN_NONE);
7148
7149#if GFX_VER >= 12
7150   genX(invalidate_aux_map_state)(batch);
7151#endif
7152
7153#if GFX_VERx10 >= 125
7154   iris_upload_compute_walker(ice, batch, grid);
7155#else
7156   iris_upload_gpgpu_walker(ice, batch, grid);
7157#endif
7158
7159   if (!batch->contains_draw_with_next_seqno) {
7160      iris_restore_compute_saved_bos(ice, batch, grid);
7161      batch->contains_draw_with_next_seqno = batch->contains_draw = true;
7162   }
7163
7164   iris_batch_sync_region_end(batch);
7165}
7166
7167/**
7168 * State module teardown.
7169 */
7170static void
7171iris_destroy_state(struct iris_context *ice)
7172{
7173   struct iris_genx_state *genx = ice->state.genx;
7174
7175   pipe_resource_reference(&ice->draw.draw_params.res, NULL);
7176   pipe_resource_reference(&ice->draw.derived_draw_params.res, NULL);
7177
7178   /* Loop over all VBOs, including ones for draw parameters */
7179   for (unsigned i = 0; i < ARRAY_SIZE(genx->vertex_buffers); i++) {
7180      pipe_resource_reference(&genx->vertex_buffers[i].resource, NULL);
7181   }
7182
7183   free(ice->state.genx);
7184
7185   for (int i = 0; i < 4; i++) {
7186      pipe_so_target_reference(&ice->state.so_target[i], NULL);
7187   }
7188
7189   for (unsigned i = 0; i < ice->state.framebuffer.nr_cbufs; i++) {
7190      pipe_surface_reference(&ice->state.framebuffer.cbufs[i], NULL);
7191   }
7192   pipe_surface_reference(&ice->state.framebuffer.zsbuf, NULL);
7193
7194   for (int stage = 0; stage < MESA_SHADER_STAGES; stage++) {
7195      struct iris_shader_state *shs = &ice->state.shaders[stage];
7196      pipe_resource_reference(&shs->sampler_table.res, NULL);
7197      for (int i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
7198         pipe_resource_reference(&shs->constbuf[i].buffer, NULL);
7199         pipe_resource_reference(&shs->constbuf_surf_state[i].res, NULL);
7200      }
7201      for (int i = 0; i < PIPE_MAX_SHADER_IMAGES; i++) {
7202         pipe_resource_reference(&shs->image[i].base.resource, NULL);
7203         pipe_resource_reference(&shs->image[i].surface_state.ref.res, NULL);
7204         free(shs->image[i].surface_state.cpu);
7205      }
7206      for (int i = 0; i < PIPE_MAX_SHADER_BUFFERS; i++) {
7207         pipe_resource_reference(&shs->ssbo[i].buffer, NULL);
7208         pipe_resource_reference(&shs->ssbo_surf_state[i].res, NULL);
7209      }
7210      for (int i = 0; i < IRIS_MAX_TEXTURE_SAMPLERS; i++) {
7211         pipe_sampler_view_reference((struct pipe_sampler_view **)
7212                                     &shs->textures[i], NULL);
7213      }
7214   }
7215
7216   pipe_resource_reference(&ice->state.grid_size.res, NULL);
7217   pipe_resource_reference(&ice->state.grid_surf_state.res, NULL);
7218
7219   pipe_resource_reference(&ice->state.null_fb.res, NULL);
7220   pipe_resource_reference(&ice->state.unbound_tex.res, NULL);
7221
7222   pipe_resource_reference(&ice->state.last_res.cc_vp, NULL);
7223   pipe_resource_reference(&ice->state.last_res.sf_cl_vp, NULL);
7224   pipe_resource_reference(&ice->state.last_res.color_calc, NULL);
7225   pipe_resource_reference(&ice->state.last_res.scissor, NULL);
7226   pipe_resource_reference(&ice->state.last_res.blend, NULL);
7227   pipe_resource_reference(&ice->state.last_res.index_buffer, NULL);
7228   pipe_resource_reference(&ice->state.last_res.cs_thread_ids, NULL);
7229   pipe_resource_reference(&ice->state.last_res.cs_desc, NULL);
7230}
7231
7232/* ------------------------------------------------------------------- */
7233
7234static void
7235iris_rebind_buffer(struct iris_context *ice,
7236                   struct iris_resource *res)
7237{
7238   struct pipe_context *ctx = &ice->ctx;
7239   struct iris_genx_state *genx = ice->state.genx;
7240
7241   assert(res->base.b.target == PIPE_BUFFER);
7242
7243   /* Buffers can't be framebuffer attachments, nor display related,
7244    * and we don't have upstream Clover support.
7245    */
7246   assert(!(res->bind_history & (PIPE_BIND_DEPTH_STENCIL |
7247                                 PIPE_BIND_RENDER_TARGET |
7248                                 PIPE_BIND_BLENDABLE |
7249                                 PIPE_BIND_DISPLAY_TARGET |
7250                                 PIPE_BIND_CURSOR |
7251                                 PIPE_BIND_COMPUTE_RESOURCE |
7252                                 PIPE_BIND_GLOBAL)));
7253
7254   if (res->bind_history & PIPE_BIND_VERTEX_BUFFER) {
7255      uint64_t bound_vbs = ice->state.bound_vertex_buffers;
7256      while (bound_vbs) {
7257         const int i = u_bit_scan64(&bound_vbs);
7258         struct iris_vertex_buffer_state *state = &genx->vertex_buffers[i];
7259
7260         /* Update the CPU struct */
7261         STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_start) == 32);
7262         STATIC_ASSERT(GENX(VERTEX_BUFFER_STATE_BufferStartingAddress_bits) == 64);
7263         uint64_t *addr = (uint64_t *) &state->state[1];
7264         struct iris_bo *bo = iris_resource_bo(state->resource);
7265
7266         if (*addr != bo->address + state->offset) {
7267            *addr = bo->address + state->offset;
7268            ice->state.dirty |= IRIS_DIRTY_VERTEX_BUFFERS |
7269                                IRIS_DIRTY_VERTEX_BUFFER_FLUSHES;
7270         }
7271      }
7272   }
7273
7274   /* We don't need to handle PIPE_BIND_INDEX_BUFFER here: we re-emit
7275    * the 3DSTATE_INDEX_BUFFER packet whenever the address changes.
7276    *
7277    * There is also no need to handle these:
7278    * - PIPE_BIND_COMMAND_ARGS_BUFFER (emitted for every indirect draw)
7279    * - PIPE_BIND_QUERY_BUFFER (no persistent state references)
7280    */
7281
7282   if (res->bind_history & PIPE_BIND_STREAM_OUTPUT) {
7283      uint32_t *so_buffers = genx->so_buffers;
7284      for (unsigned i = 0; i < 4; i++,
7285           so_buffers += GENX(3DSTATE_SO_BUFFER_length)) {
7286
7287         /* There are no other fields in bits 127:64 */
7288         uint64_t *addr = (uint64_t *) &so_buffers[2];
7289         STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_start) == 66);
7290         STATIC_ASSERT(GENX(3DSTATE_SO_BUFFER_SurfaceBaseAddress_bits) == 46);
7291
7292         struct pipe_stream_output_target *tgt = ice->state.so_target[i];
7293         if (tgt) {
7294            struct iris_bo *bo = iris_resource_bo(tgt->buffer);
7295            if (*addr != bo->address + tgt->buffer_offset) {
7296               *addr = bo->address + tgt->buffer_offset;
7297               ice->state.dirty |= IRIS_DIRTY_SO_BUFFERS;
7298            }
7299         }
7300      }
7301   }
7302
7303   for (int s = MESA_SHADER_VERTEX; s < MESA_SHADER_STAGES; s++) {
7304      struct iris_shader_state *shs = &ice->state.shaders[s];
7305      enum pipe_shader_type p_stage = stage_to_pipe(s);
7306
7307      if (!(res->bind_stages & (1 << s)))
7308         continue;
7309
7310      if (res->bind_history & PIPE_BIND_CONSTANT_BUFFER) {
7311         /* Skip constant buffer 0, it's for regular uniforms, not UBOs */
7312         uint32_t bound_cbufs = shs->bound_cbufs & ~1u;
7313         while (bound_cbufs) {
7314            const int i = u_bit_scan(&bound_cbufs);
7315            struct pipe_shader_buffer *cbuf = &shs->constbuf[i];
7316            struct iris_state_ref *surf_state = &shs->constbuf_surf_state[i];
7317
7318            if (res->bo == iris_resource_bo(cbuf->buffer)) {
7319               pipe_resource_reference(&surf_state->res, NULL);
7320               shs->dirty_cbufs |= 1u << i;
7321               ice->state.dirty |= (IRIS_DIRTY_RENDER_MISC_BUFFER_FLUSHES |
7322                                    IRIS_DIRTY_COMPUTE_MISC_BUFFER_FLUSHES);
7323               ice->state.stage_dirty |= IRIS_STAGE_DIRTY_CONSTANTS_VS << s;
7324            }
7325         }
7326      }
7327
7328      if (res->bind_history & PIPE_BIND_SHADER_BUFFER) {
7329         uint32_t bound_ssbos = shs->bound_ssbos;
7330         while (bound_ssbos) {
7331            const int i = u_bit_scan(&bound_ssbos);
7332            struct pipe_shader_buffer *ssbo = &shs->ssbo[i];
7333
7334            if (res->bo == iris_resource_bo(ssbo->buffer)) {
7335               struct pipe_shader_buffer buf = {
7336                  .buffer = &res->base.b,
7337                  .buffer_offset = ssbo->buffer_offset,
7338                  .buffer_size = ssbo->buffer_size,
7339               };
7340               iris_set_shader_buffers(ctx, p_stage, i, 1, &buf,
7341                                       (shs->writable_ssbos >> i) & 1);
7342            }
7343         }
7344      }
7345
7346      if (res->bind_history & PIPE_BIND_SAMPLER_VIEW) {
7347         uint32_t bound_sampler_views = shs->bound_sampler_views;
7348         while (bound_sampler_views) {
7349            const int i = u_bit_scan(&bound_sampler_views);
7350            struct iris_sampler_view *isv = shs->textures[i];
7351            struct iris_bo *bo = isv->res->bo;
7352
7353            if (update_surface_state_addrs(ice->state.surface_uploader,
7354                                           &isv->surface_state, bo)) {
7355               ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
7356            }
7357         }
7358      }
7359
7360      if (res->bind_history & PIPE_BIND_SHADER_IMAGE) {
7361         uint32_t bound_image_views = shs->bound_image_views;
7362         while (bound_image_views) {
7363            const int i = u_bit_scan(&bound_image_views);
7364            struct iris_image_view *iv = &shs->image[i];
7365            struct iris_bo *bo = iris_resource_bo(iv->base.resource);
7366
7367            if (update_surface_state_addrs(ice->state.surface_uploader,
7368                                           &iv->surface_state, bo)) {
7369               ice->state.stage_dirty |= IRIS_STAGE_DIRTY_BINDINGS_VS << s;
7370            }
7371         }
7372      }
7373   }
7374}
7375
7376/* ------------------------------------------------------------------- */
7377
7378/**
7379 * Introduce a batch synchronization boundary, and update its cache coherency
7380 * status to reflect the execution of a PIPE_CONTROL command with the
7381 * specified flags.
7382 */
7383static void
7384batch_mark_sync_for_pipe_control(struct iris_batch *batch, uint32_t flags)
7385{
7386   iris_batch_sync_boundary(batch);
7387
7388   if ((flags & PIPE_CONTROL_CS_STALL)) {
7389      if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
7390         iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
7391
7392      if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
7393         iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
7394
7395      if ((flags & PIPE_CONTROL_DATA_CACHE_FLUSH))
7396         iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_DATA_WRITE);
7397
7398      if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
7399         iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
7400
7401      if ((flags & (PIPE_CONTROL_CACHE_FLUSH_BITS |
7402                    PIPE_CONTROL_STALL_AT_SCOREBOARD))) {
7403         iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_VF_READ);
7404         iris_batch_mark_flush_sync(batch, IRIS_DOMAIN_OTHER_READ);
7405      }
7406   }
7407
7408   if ((flags & PIPE_CONTROL_RENDER_TARGET_FLUSH))
7409      iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_RENDER_WRITE);
7410
7411   if ((flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH))
7412      iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DEPTH_WRITE);
7413
7414   if ((flags & PIPE_CONTROL_DATA_CACHE_FLUSH))
7415      iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_DATA_WRITE);
7416
7417   if ((flags & PIPE_CONTROL_FLUSH_ENABLE))
7418      iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_WRITE);
7419
7420   if ((flags & PIPE_CONTROL_VF_CACHE_INVALIDATE))
7421      iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_VF_READ);
7422
7423   if ((flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) &&
7424       (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE))
7425      iris_batch_mark_invalidate_sync(batch, IRIS_DOMAIN_OTHER_READ);
7426}
7427
7428static unsigned
7429flags_to_post_sync_op(uint32_t flags)
7430{
7431   if (flags & PIPE_CONTROL_WRITE_IMMEDIATE)
7432      return WriteImmediateData;
7433
7434   if (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT)
7435      return WritePSDepthCount;
7436
7437   if (flags & PIPE_CONTROL_WRITE_TIMESTAMP)
7438      return WriteTimestamp;
7439
7440   return 0;
7441}
7442
7443/**
7444 * Do the given flags have a Post Sync or LRI Post Sync operation?
7445 */
7446static enum pipe_control_flags
7447get_post_sync_flags(enum pipe_control_flags flags)
7448{
7449   flags &= PIPE_CONTROL_WRITE_IMMEDIATE |
7450            PIPE_CONTROL_WRITE_DEPTH_COUNT |
7451            PIPE_CONTROL_WRITE_TIMESTAMP |
7452            PIPE_CONTROL_LRI_POST_SYNC_OP;
7453
7454   /* Only one "Post Sync Op" is allowed, and it's mutually exclusive with
7455    * "LRI Post Sync Operation".  So more than one bit set would be illegal.
7456    */
7457   assert(util_bitcount(flags) <= 1);
7458
7459   return flags;
7460}
7461
/* True when the given batch pointer refers to the compute batch.  The
 * argument is parenthesized so that any pointer-valued expression (not
 * just a bare identifier) expands correctly.
 */
#define IS_COMPUTE_PIPELINE(batch) ((batch)->name == IRIS_BATCH_COMPUTE)
7463
7464/**
7465 * Emit a series of PIPE_CONTROL commands, taking into account any
7466 * workarounds necessary to actually accomplish the caller's request.
7467 *
7468 * Unless otherwise noted, spec quotations in this function come from:
7469 *
7470 * Synchronization of the 3D Pipeline > PIPE_CONTROL Command > Programming
7471 * Restrictions for PIPE_CONTROL.
7472 *
7473 * You should not use this function directly.  Use the helpers in
7474 * iris_pipe_control.c instead, which may split the pipe control further.
7475 */
7476static void
7477iris_emit_raw_pipe_control(struct iris_batch *batch,
7478                           const char *reason,
7479                           uint32_t flags,
7480                           struct iris_bo *bo,
7481                           uint32_t offset,
7482                           uint64_t imm)
7483{
7484   UNUSED const struct intel_device_info *devinfo = &batch->screen->devinfo;
7485   enum pipe_control_flags post_sync_flags = get_post_sync_flags(flags);
7486   enum pipe_control_flags non_lri_post_sync_flags =
7487      post_sync_flags & ~PIPE_CONTROL_LRI_POST_SYNC_OP;
7488
7489   /* Recursive PIPE_CONTROL workarounds --------------------------------
7490    * (http://knowyourmeme.com/memes/xzibit-yo-dawg)
7491    *
7492    * We do these first because we want to look at the original operation,
7493    * rather than any workarounds we set.
7494    */
7495   if (GFX_VER == 9 && (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
7496      /* The PIPE_CONTROL "VF Cache Invalidation Enable" bit description
7497       * lists several workarounds:
7498       *
7499       *    "Project: SKL, KBL, BXT
7500       *
7501       *     If the VF Cache Invalidation Enable is set to a 1 in a
7502       *     PIPE_CONTROL, a separate Null PIPE_CONTROL, all bitfields
7503       *     sets to 0, with the VF Cache Invalidation Enable set to 0
7504       *     needs to be sent prior to the PIPE_CONTROL with VF Cache
7505       *     Invalidation Enable set to a 1."
7506       */
7507      iris_emit_raw_pipe_control(batch,
7508                                 "workaround: recursive VF cache invalidate",
7509                                 0, NULL, 0, 0);
7510   }
7511
7512   /* Wa_1409226450, Wait for EU to be idle before pipe control which
7513    * invalidates the instruction cache
7514    */
7515   if (GFX_VER == 12 && (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE)) {
7516      iris_emit_raw_pipe_control(batch,
7517                                 "workaround: CS stall before instruction "
7518                                 "cache invalidate",
7519                                 PIPE_CONTROL_CS_STALL |
7520                                 PIPE_CONTROL_STALL_AT_SCOREBOARD, bo, offset,
7521                                 imm);
7522   }
7523
7524   if ((GFX_VER == 9 || (GFX_VER == 12 && devinfo->revision == 0 /* A0*/)) &&
7525        IS_COMPUTE_PIPELINE(batch) && post_sync_flags) {
7526      /* Project: SKL / Argument: LRI Post Sync Operation [23]
7527       *
7528       * "PIPECONTROL command with “Command Streamer Stall Enable” must be
7529       *  programmed prior to programming a PIPECONTROL command with "LRI
7530       *  Post Sync Operation" in GPGPU mode of operation (i.e when
7531       *  PIPELINE_SELECT command is set to GPGPU mode of operation)."
7532       *
7533       * The same text exists a few rows below for Post Sync Op.
7534       *
7535       * On Gfx12 this is Wa_1607156449.
7536       */
7537      iris_emit_raw_pipe_control(batch,
7538                                 "workaround: CS stall before gpgpu post-sync",
7539                                 PIPE_CONTROL_CS_STALL, bo, offset, imm);
7540   }
7541
7542   /* "Flush Types" workarounds ---------------------------------------------
7543    * We do these now because they may add post-sync operations or CS stalls.
7544    */
7545
7546   if (GFX_VER < 11 && flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) {
7547      /* Project: BDW, SKL+ (stopping at CNL) / Argument: VF Invalidate
7548       *
7549       * "'Post Sync Operation' must be enabled to 'Write Immediate Data' or
7550       *  'Write PS Depth Count' or 'Write Timestamp'."
7551       */
7552      if (!bo) {
7553         flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
7554         post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
7555         non_lri_post_sync_flags |= PIPE_CONTROL_WRITE_IMMEDIATE;
7556         bo = batch->screen->workaround_address.bo;
7557         offset = batch->screen->workaround_address.offset;
7558      }
7559   }
7560
7561   if (flags & PIPE_CONTROL_DEPTH_STALL) {
7562      /* From the PIPE_CONTROL instruction table, bit 13 (Depth Stall Enable):
7563       *
7564       *    "This bit must be DISABLED for operations other than writing
7565       *     PS_DEPTH_COUNT."
7566       *
7567       * This seems like nonsense.  An Ivybridge workaround requires us to
7568       * emit a PIPE_CONTROL with a depth stall and write immediate post-sync
7569       * operation.  Gfx8+ requires us to emit depth stalls and depth cache
7570       * flushes together.  So, it's hard to imagine this means anything other
7571       * than "we originally intended this to be used for PS_DEPTH_COUNT".
7572       *
7573       * We ignore the supposed restriction and do nothing.
7574       */
7575   }
7576
7577   if (flags & (PIPE_CONTROL_RENDER_TARGET_FLUSH |
7578                PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
7579      /* From the PIPE_CONTROL instruction table, bit 12 and bit 1:
7580       *
7581       *    "This bit must be DISABLED for End-of-pipe (Read) fences,
7582       *     PS_DEPTH_COUNT or TIMESTAMP queries."
7583       *
7584       * TODO: Implement end-of-pipe checking.
7585       */
7586      assert(!(post_sync_flags & (PIPE_CONTROL_WRITE_DEPTH_COUNT |
7587                                  PIPE_CONTROL_WRITE_TIMESTAMP)));
7588   }
7589
7590   if (GFX_VER < 11 && (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD)) {
7591      /* From the PIPE_CONTROL instruction table, bit 1:
7592       *
7593       *    "This bit is ignored if Depth Stall Enable is set.
7594       *     Further, the render cache is not flushed even if Write Cache
7595       *     Flush Enable bit is set."
7596       *
7597       * We assert that the caller doesn't do this combination, to try and
7598       * prevent mistakes.  It shouldn't hurt the GPU, though.
7599       *
7600       * We skip this check on Gfx11+ as the "Stall at Pixel Scoreboard"
7601       * and "Render Target Flush" combo is explicitly required for BTI
7602       * update workarounds.
7603       */
7604      assert(!(flags & (PIPE_CONTROL_DEPTH_STALL |
7605                        PIPE_CONTROL_RENDER_TARGET_FLUSH)));
7606   }
7607
7608   /* PIPE_CONTROL page workarounds ------------------------------------- */
7609
7610   if (GFX_VER <= 8 && (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE)) {
7611      /* From the PIPE_CONTROL page itself:
7612       *
7613       *    "IVB, HSW, BDW
7614       *     Restriction: Pipe_control with CS-stall bit set must be issued
7615       *     before a pipe-control command that has the State Cache
7616       *     Invalidate bit set."
7617       */
7618      flags |= PIPE_CONTROL_CS_STALL;
7619   }
7620
7621   if (flags & PIPE_CONTROL_FLUSH_LLC) {
7622      /* From the PIPE_CONTROL instruction table, bit 26 (Flush LLC):
7623       *
7624       *    "Project: ALL
7625       *     SW must always program Post-Sync Operation to "Write Immediate
7626       *     Data" when Flush LLC is set."
7627       *
7628       * For now, we just require the caller to do it.
7629       */
7630      assert(flags & PIPE_CONTROL_WRITE_IMMEDIATE);
7631   }
7632
7633   /* "Post-Sync Operation" workarounds -------------------------------- */
7634
7635   /* Project: All / Argument: Global Snapshot Count Reset [19]
7636    *
7637    * "This bit must not be exercised on any product.
7638    *  Requires stall bit ([20] of DW1) set."
7639    *
7640    * We don't use this, so we just assert that it isn't used.  The
7641    * PIPE_CONTROL instruction page indicates that they intended this
7642    * as a debug feature and don't think it is useful in production,
7643    * but it may actually be usable, should we ever want to.
7644    */
7645   assert((flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) == 0);
7646
7647   if (flags & (PIPE_CONTROL_MEDIA_STATE_CLEAR |
7648                PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE)) {
7649      /* Project: All / Arguments:
7650       *
7651       * - Generic Media State Clear [16]
7652       * - Indirect State Pointers Disable [16]
7653       *
7654       *    "Requires stall bit ([20] of DW1) set."
7655       *
7656       * Also, the PIPE_CONTROL instruction table, bit 16 (Generic Media
7657       * State Clear) says:
7658       *
7659       *    "PIPECONTROL command with “Command Streamer Stall Enable” must be
7660       *     programmed prior to programming a PIPECONTROL command with "Media
7661       *     State Clear" set in GPGPU mode of operation"
7662       *
7663       * This is a subset of the earlier rule, so there's nothing to do.
7664       */
7665      flags |= PIPE_CONTROL_CS_STALL;
7666   }
7667
7668   if (flags & PIPE_CONTROL_STORE_DATA_INDEX) {
7669      /* Project: All / Argument: Store Data Index
7670       *
7671       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
7672       *  than '0'."
7673       *
7674       * For now, we just assert that the caller does this.  We might want to
7675       * automatically add a write to the workaround BO...
7676       */
7677      assert(non_lri_post_sync_flags != 0);
7678   }
7679
7680   if (flags & PIPE_CONTROL_SYNC_GFDT) {
7681      /* Project: All / Argument: Sync GFDT
7682       *
7683       * "Post-Sync Operation ([15:14] of DW1) must be set to something other
7684       *  than '0' or 0x2520[13] must be set."
7685       *
7686       * For now, we just assert that the caller does this.
7687       */
7688      assert(non_lri_post_sync_flags != 0);
7689   }
7690
7691   if (flags & PIPE_CONTROL_TLB_INVALIDATE) {
7692      /* Project: IVB+ / Argument: TLB inv
7693       *
7694       *    "Requires stall bit ([20] of DW1) set."
7695       *
7696       * Also, from the PIPE_CONTROL instruction table:
7697       *
7698       *    "Project: SKL+
7699       *     Post Sync Operation or CS stall must be set to ensure a TLB
7700       *     invalidation occurs.  Otherwise no cycle will occur to the TLB
7701       *     cache to invalidate."
7702       *
7703       * This is not a subset of the earlier rule, so there's nothing to do.
7704       */
7705      flags |= PIPE_CONTROL_CS_STALL;
7706   }
7707
7708   if (GFX_VER == 9 && devinfo->gt == 4) {
7709      /* TODO: The big Skylake GT4 post sync op workaround */
7710   }
7711
7712   /* "GPGPU specific workarounds" (both post-sync and flush) ------------ */
7713
7714   if (IS_COMPUTE_PIPELINE(batch)) {
7715      if (GFX_VER >= 9 && (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE)) {
7716         /* Project: SKL+ / Argument: Tex Invalidate
7717          * "Requires stall bit ([20] of DW) set for all GPGPU Workloads."
7718          */
7719         flags |= PIPE_CONTROL_CS_STALL;
7720      }
7721
7722      if (GFX_VER == 8 && (post_sync_flags ||
7723                           (flags & (PIPE_CONTROL_NOTIFY_ENABLE |
7724                                     PIPE_CONTROL_DEPTH_STALL |
7725                                     PIPE_CONTROL_RENDER_TARGET_FLUSH |
7726                                     PIPE_CONTROL_DEPTH_CACHE_FLUSH |
7727                                     PIPE_CONTROL_DATA_CACHE_FLUSH)))) {
7728         /* Project: BDW / Arguments:
7729          *
7730          * - LRI Post Sync Operation   [23]
7731          * - Post Sync Op              [15:14]
7732          * - Notify En                 [8]
7733          * - Depth Stall               [13]
7734          * - Render Target Cache Flush [12]
7735          * - Depth Cache Flush         [0]
7736          * - DC Flush Enable           [5]
7737          *
7738          *    "Requires stall bit ([20] of DW) set for all GPGPU and Media
7739          *     Workloads."
7740          */
7741         flags |= PIPE_CONTROL_CS_STALL;
7742
7743         /* Also, from the PIPE_CONTROL instruction table, bit 20:
7744          *
7745          *    "Project: BDW
7746          *     This bit must be always set when PIPE_CONTROL command is
7747          *     programmed by GPGPU and MEDIA workloads, except for the cases
7748          *     when only Read Only Cache Invalidation bits are set (State
7749          *     Cache Invalidation Enable, Instruction cache Invalidation
7750          *     Enable, Texture Cache Invalidation Enable, Constant Cache
7751          *     Invalidation Enable). This is to WA FFDOP CG issue, this WA
7752          *     need not implemented when FF_DOP_CG is disable via "Fixed
7753          *     Function DOP Clock Gate Disable" bit in RC_PSMI_CTRL register."
7754          *
7755          * It sounds like we could avoid CS stalls in some cases, but we
7756          * don't currently bother.  This list isn't exactly the list above,
7757          * either...
7758          */
7759      }
7760   }
7761
7762   /* "Stall" workarounds ----------------------------------------------
7763    * These have to come after the earlier ones because we may have added
7764    * some additional CS stalls above.
7765    */
7766
7767   if (GFX_VER < 9 && (flags & PIPE_CONTROL_CS_STALL)) {
7768      /* Project: PRE-SKL, VLV, CHV
7769       *
7770       * "[All Stepping][All SKUs]:
7771       *
7772       *  One of the following must also be set:
7773       *
7774       *  - Render Target Cache Flush Enable ([12] of DW1)
7775       *  - Depth Cache Flush Enable ([0] of DW1)
7776       *  - Stall at Pixel Scoreboard ([1] of DW1)
7777       *  - Depth Stall ([13] of DW1)
7778       *  - Post-Sync Operation ([13] of DW1)
7779       *  - DC Flush Enable ([5] of DW1)"
7780       *
7781       * If we don't already have one of those bits set, we choose to add
7782       * "Stall at Pixel Scoreboard".  Some of the other bits require a
7783       * CS stall as a workaround (see above), which would send us into
7784       * an infinite recursion of PIPE_CONTROLs.  "Stall at Pixel Scoreboard"
7785       * appears to be safe, so we choose that.
7786       */
7787      const uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
7788                               PIPE_CONTROL_DEPTH_CACHE_FLUSH |
7789                               PIPE_CONTROL_WRITE_IMMEDIATE |
7790                               PIPE_CONTROL_WRITE_DEPTH_COUNT |
7791                               PIPE_CONTROL_WRITE_TIMESTAMP |
7792                               PIPE_CONTROL_STALL_AT_SCOREBOARD |
7793                               PIPE_CONTROL_DEPTH_STALL |
7794                               PIPE_CONTROL_DATA_CACHE_FLUSH;
7795      if (!(flags & wa_bits))
7796         flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
7797   }
7798
7799   if (GFX_VER >= 12 && (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH)) {
7800      /* Wa_1409600907:
7801       *
7802       * "PIPE_CONTROL with Depth Stall Enable bit must be set
7803       * with any PIPE_CONTROL with Depth Flush Enable bit set.
7804       */
7805      flags |= PIPE_CONTROL_DEPTH_STALL;
7806   }
7807
7808   /* Emit --------------------------------------------------------------- */
7809
7810   if (INTEL_DEBUG(DEBUG_PIPE_CONTROL)) {
7811      fprintf(stderr,
7812              "  PC [%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%"PRIx64"]: %s\n",
7813              (flags & PIPE_CONTROL_FLUSH_ENABLE) ? "PipeCon " : "",
7814              (flags & PIPE_CONTROL_CS_STALL) ? "CS " : "",
7815              (flags & PIPE_CONTROL_STALL_AT_SCOREBOARD) ? "Scoreboard " : "",
7816              (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE) ? "VF " : "",
7817              (flags & PIPE_CONTROL_RENDER_TARGET_FLUSH) ? "RT " : "",
7818              (flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE) ? "Const " : "",
7819              (flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE) ? "TC " : "",
7820              (flags & PIPE_CONTROL_DATA_CACHE_FLUSH) ? "DC " : "",
7821              (flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH) ? "ZFlush " : "",
7822              (flags & PIPE_CONTROL_TILE_CACHE_FLUSH) ? "Tile " : "",
7823              (flags & PIPE_CONTROL_DEPTH_STALL) ? "ZStall " : "",
7824              (flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE) ? "State " : "",
7825              (flags & PIPE_CONTROL_TLB_INVALIDATE) ? "TLB " : "",
7826              (flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE) ? "Inst " : "",
7827              (flags & PIPE_CONTROL_MEDIA_STATE_CLEAR) ? "MediaClear " : "",
7828              (flags & PIPE_CONTROL_NOTIFY_ENABLE) ? "Notify " : "",
7829              (flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET) ?
7830                 "SnapRes" : "",
7831              (flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE) ?
7832                  "ISPDis" : "",
7833              (flags & PIPE_CONTROL_WRITE_IMMEDIATE) ? "WriteImm " : "",
7834              (flags & PIPE_CONTROL_WRITE_DEPTH_COUNT) ? "WriteZCount " : "",
7835              (flags & PIPE_CONTROL_WRITE_TIMESTAMP) ? "WriteTimestamp " : "",
7836              (flags & PIPE_CONTROL_FLUSH_HDC) ? "HDC " : "",
7837              imm, reason);
7838   }
7839
7840   batch_mark_sync_for_pipe_control(batch, flags);
7841   iris_batch_sync_region_start(batch);
7842
7843   iris_emit_cmd(batch, GENX(PIPE_CONTROL), pc) {
7844#if GFX_VER >= 12
7845      pc.TileCacheFlushEnable = flags & PIPE_CONTROL_TILE_CACHE_FLUSH;
7846#endif
7847#if GFX_VER >= 11
7848      pc.HDCPipelineFlushEnable = flags & PIPE_CONTROL_FLUSH_HDC;
7849#endif
7850      pc.LRIPostSyncOperation = NoLRIOperation;
7851      pc.PipeControlFlushEnable = flags & PIPE_CONTROL_FLUSH_ENABLE;
7852      pc.DCFlushEnable = flags & PIPE_CONTROL_DATA_CACHE_FLUSH;
7853      pc.StoreDataIndex = 0;
7854      pc.CommandStreamerStallEnable = flags & PIPE_CONTROL_CS_STALL;
7855      pc.GlobalSnapshotCountReset =
7856         flags & PIPE_CONTROL_GLOBAL_SNAPSHOT_COUNT_RESET;
7857      pc.TLBInvalidate = flags & PIPE_CONTROL_TLB_INVALIDATE;
7858      pc.GenericMediaStateClear = flags & PIPE_CONTROL_MEDIA_STATE_CLEAR;
7859      pc.StallAtPixelScoreboard = flags & PIPE_CONTROL_STALL_AT_SCOREBOARD;
7860      pc.RenderTargetCacheFlushEnable =
7861         flags & PIPE_CONTROL_RENDER_TARGET_FLUSH;
7862      pc.DepthCacheFlushEnable = flags & PIPE_CONTROL_DEPTH_CACHE_FLUSH;
7863      pc.StateCacheInvalidationEnable =
7864         flags & PIPE_CONTROL_STATE_CACHE_INVALIDATE;
7865#if GFX_VER >= 12
7866      /* Invalidates the L3 cache part in which index & vertex data is loaded
7867       * when VERTEX_BUFFER_STATE::L3BypassDisable is set.
7868       */
7869      pc.L3ReadOnlyCacheInvalidationEnable =
7870         flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
7871#endif
7872      pc.VFCacheInvalidationEnable = flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
7873      pc.ConstantCacheInvalidationEnable =
7874         flags & PIPE_CONTROL_CONST_CACHE_INVALIDATE;
7875      pc.PostSyncOperation = flags_to_post_sync_op(flags);
7876      pc.DepthStallEnable = flags & PIPE_CONTROL_DEPTH_STALL;
7877      pc.InstructionCacheInvalidateEnable =
7878         flags & PIPE_CONTROL_INSTRUCTION_INVALIDATE;
7879      pc.NotifyEnable = flags & PIPE_CONTROL_NOTIFY_ENABLE;
7880      pc.IndirectStatePointersDisable =
7881         flags & PIPE_CONTROL_INDIRECT_STATE_POINTERS_DISABLE;
7882      pc.TextureCacheInvalidationEnable =
7883         flags & PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
7884      pc.Address = rw_bo(bo, offset, IRIS_DOMAIN_OTHER_WRITE);
7885      pc.ImmediateData = imm;
7886   }
7887
7888   iris_batch_sync_region_end(batch);
7889}
7890
7891#if GFX_VER == 9
7892/**
7893 * Preemption on Gfx9 has to be enabled or disabled in various cases.
7894 *
7895 * See these workarounds for preemption:
7896 *  - WaDisableMidObjectPreemptionForGSLineStripAdj
7897 *  - WaDisableMidObjectPreemptionForTrifanOrPolygon
7898 *  - WaDisableMidObjectPreemptionForLineLoop
7899 *  - WA#0798
7900 *
7901 * We don't put this in the vtable because it's only used on Gfx9.
7902 */
7903void
7904gfx9_toggle_preemption(struct iris_context *ice,
7905                       struct iris_batch *batch,
7906                       const struct pipe_draw_info *draw)
7907{
7908   struct iris_genx_state *genx = ice->state.genx;
7909   bool object_preemption = true;
7910
7911   /* WaDisableMidObjectPreemptionForGSLineStripAdj
7912    *
7913    *    "WA: Disable mid-draw preemption when draw-call is a linestrip_adj
7914    *     and GS is enabled."
7915    */
7916   if (draw->mode == PIPE_PRIM_LINE_STRIP_ADJACENCY &&
7917       ice->shaders.prog[MESA_SHADER_GEOMETRY])
7918      object_preemption = false;
7919
7920   /* WaDisableMidObjectPreemptionForTrifanOrPolygon
7921    *
7922    *    "TriFan miscompare in Execlist Preemption test. Cut index that is
7923    *     on a previous context. End the previous, the resume another context
7924    *     with a tri-fan or polygon, and the vertex count is corrupted. If we
7925    *     prempt again we will cause corruption.
7926    *
7927    *     WA: Disable mid-draw preemption when draw-call has a tri-fan."
7928    */
7929   if (draw->mode == PIPE_PRIM_TRIANGLE_FAN)
7930      object_preemption = false;
7931
7932   /* WaDisableMidObjectPreemptionForLineLoop
7933    *
7934    *    "VF Stats Counters Missing a vertex when preemption enabled.
7935    *
7936    *     WA: Disable mid-draw preemption when the draw uses a lineloop
7937    *     topology."
7938    */
7939   if (draw->mode == PIPE_PRIM_LINE_LOOP)
7940      object_preemption = false;
7941
7942   /* WA#0798
7943    *
7944    *    "VF is corrupting GAFS data when preempted on an instance boundary
7945    *     and replayed with instancing enabled.
7946    *
7947    *     WA: Disable preemption when using instanceing."
7948    */
7949   if (draw->instance_count > 1)
7950      object_preemption = false;
7951
7952   if (genx->object_preemption != object_preemption) {
7953      iris_enable_obj_preemption(batch, object_preemption);
7954      genx->object_preemption = object_preemption;
7955   }
7956}
7957#endif
7958
7959static void
7960iris_lost_genx_state(struct iris_context *ice, struct iris_batch *batch)
7961{
7962   struct iris_genx_state *genx = ice->state.genx;
7963
7964#if GFX_VERx10 == 120
7965   genx->depth_reg_mode = IRIS_DEPTH_REG_MODE_UNKNOWN;
7966#endif
7967
7968   memset(genx->last_index_buffer, 0, sizeof(genx->last_index_buffer));
7969}
7970
7971static void
7972iris_emit_mi_report_perf_count(struct iris_batch *batch,
7973                               struct iris_bo *bo,
7974                               uint32_t offset_in_bytes,
7975                               uint32_t report_id)
7976{
7977   iris_batch_sync_region_start(batch);
7978   iris_emit_cmd(batch, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
7979      mi_rpc.MemoryAddress = rw_bo(bo, offset_in_bytes,
7980                                   IRIS_DOMAIN_OTHER_WRITE);
7981      mi_rpc.ReportID = report_id;
7982   }
7983   iris_batch_sync_region_end(batch);
7984}
7985
7986/**
7987 * Update the pixel hashing modes that determine the balancing of PS threads
7988 * across subslices and slices.
7989 *
7990 * \param width Width bound of the rendering area (already scaled down if \p
7991 *              scale is greater than 1).
7992 * \param height Height bound of the rendering area (already scaled down if \p
7993 *               scale is greater than 1).
7994 * \param scale The number of framebuffer samples that could potentially be
7995 *              affected by an individual channel of the PS thread.  This is
7996 *              typically one for single-sampled rendering, but for operations
7997 *              like CCS resolves and fast clears a single PS invocation may
7998 *              update a huge number of pixels, in which case a finer
7999 *              balancing is desirable in order to maximally utilize the
8000 *              bandwidth available.  UINT_MAX can be used as shorthand for
8001 *              "finest hashing mode available".
8002 */
8003void
8004genX(emit_hashing_mode)(struct iris_context *ice, struct iris_batch *batch,
8005                        unsigned width, unsigned height, unsigned scale)
8006{
8007#if GFX_VER == 9
8008   const struct intel_device_info *devinfo = &batch->screen->devinfo;
8009   const unsigned slice_hashing[] = {
8010      /* Because all Gfx9 platforms with more than one slice require
8011       * three-way subslice hashing, a single "normal" 16x16 slice hashing
8012       * block is guaranteed to suffer from substantial imbalance, with one
8013       * subslice receiving twice as much work as the other two in the
8014       * slice.
8015       *
8016       * The performance impact of that would be particularly severe when
8017       * three-way hashing is also in use for slice balancing (which is the
8018       * case for all Gfx9 GT4 platforms), because one of the slices
8019       * receives one every three 16x16 blocks in either direction, which
8020       * is roughly the periodicity of the underlying subslice imbalance
8021       * pattern ("roughly" because in reality the hardware's
8022       * implementation of three-way hashing doesn't do exact modulo 3
8023       * arithmetic, which somewhat decreases the magnitude of this effect
8024       * in practice).  This leads to a systematic subslice imbalance
8025       * within that slice regardless of the size of the primitive.  The
8026       * 32x32 hashing mode guarantees that the subslice imbalance within a
8027       * single slice hashing block is minimal, largely eliminating this
8028       * effect.
8029       */
8030      _32x32,
8031      /* Finest slice hashing mode available. */
8032      NORMAL
8033   };
8034   const unsigned subslice_hashing[] = {
8035      /* 16x16 would provide a slight cache locality benefit especially
8036       * visible in the sampler L1 cache efficiency of low-bandwidth
8037       * non-LLC platforms, but it comes at the cost of greater subslice
8038       * imbalance for primitives of dimensions approximately intermediate
8039       * between 16x4 and 16x16.
8040       */
8041      _16x4,
8042      /* Finest subslice hashing mode available. */
8043      _8x4
8044   };
8045   /* Dimensions of the smallest hashing block of a given hashing mode.  If
8046    * the rendering area is smaller than this there can't possibly be any
8047    * benefit from switching to this mode, so we optimize out the
8048    * transition.
8049    */
8050   const unsigned min_size[][2] = {
8051      { 16, 4 },
8052      { 8, 4 }
8053   };
8054   const unsigned idx = scale > 1;
8055
8056   if (width > min_size[idx][0] || height > min_size[idx][1]) {
8057      iris_emit_raw_pipe_control(batch,
8058                                 "workaround: CS stall before GT_MODE LRI",
8059                                 PIPE_CONTROL_STALL_AT_SCOREBOARD |
8060                                 PIPE_CONTROL_CS_STALL,
8061                                 NULL, 0, 0);
8062
8063      iris_emit_reg(batch, GENX(GT_MODE), reg) {
8064         reg.SliceHashing = (devinfo->num_slices > 1 ? slice_hashing[idx] : 0);
8065         reg.SliceHashingMask = (devinfo->num_slices > 1 ? -1 : 0);
8066         reg.SubsliceHashing = subslice_hashing[idx];
8067         reg.SubsliceHashingMask = -1;
8068      };
8069
8070      ice->state.current_hash_scale = scale;
8071   }
8072#endif
8073}
8074
8075static void
8076iris_set_frontend_noop(struct pipe_context *ctx, bool enable)
8077{
8078   struct iris_context *ice = (struct iris_context *) ctx;
8079
8080   if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_RENDER], enable)) {
8081      ice->state.dirty |= IRIS_ALL_DIRTY_FOR_RENDER;
8082      ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_RENDER;
8083   }
8084
8085   if (iris_batch_prepare_noop(&ice->batches[IRIS_BATCH_COMPUTE], enable)) {
8086      ice->state.dirty |= IRIS_ALL_DIRTY_FOR_COMPUTE;
8087      ice->state.stage_dirty |= IRIS_ALL_STAGE_DIRTY_FOR_COMPUTE;
8088   }
8089}
8090
8091void
8092genX(init_screen_state)(struct iris_screen *screen)
8093{
8094   assert(screen->devinfo.verx10 == GFX_VERx10);
8095   screen->vtbl.destroy_state = iris_destroy_state;
8096   screen->vtbl.init_render_context = iris_init_render_context;
8097   screen->vtbl.init_compute_context = iris_init_compute_context;
8098   screen->vtbl.upload_render_state = iris_upload_render_state;
8099   screen->vtbl.update_surface_base_address = iris_update_surface_base_address;
8100   screen->vtbl.upload_compute_state = iris_upload_compute_state;
8101   screen->vtbl.emit_raw_pipe_control = iris_emit_raw_pipe_control;
8102   screen->vtbl.emit_mi_report_perf_count = iris_emit_mi_report_perf_count;
8103   screen->vtbl.rebind_buffer = iris_rebind_buffer;
8104   screen->vtbl.load_register_reg32 = iris_load_register_reg32;
8105   screen->vtbl.load_register_reg64 = iris_load_register_reg64;
8106   screen->vtbl.load_register_imm32 = iris_load_register_imm32;
8107   screen->vtbl.load_register_imm64 = iris_load_register_imm64;
8108   screen->vtbl.load_register_mem32 = iris_load_register_mem32;
8109   screen->vtbl.load_register_mem64 = iris_load_register_mem64;
8110   screen->vtbl.store_register_mem32 = iris_store_register_mem32;
8111   screen->vtbl.store_register_mem64 = iris_store_register_mem64;
8112   screen->vtbl.store_data_imm32 = iris_store_data_imm32;
8113   screen->vtbl.store_data_imm64 = iris_store_data_imm64;
8114   screen->vtbl.copy_mem_mem = iris_copy_mem_mem;
8115   screen->vtbl.derived_program_state_size = iris_derived_program_state_size;
8116   screen->vtbl.store_derived_program_state = iris_store_derived_program_state;
8117   screen->vtbl.create_so_decl_list = iris_create_so_decl_list;
8118   screen->vtbl.populate_vs_key = iris_populate_vs_key;
8119   screen->vtbl.populate_tcs_key = iris_populate_tcs_key;
8120   screen->vtbl.populate_tes_key = iris_populate_tes_key;
8121   screen->vtbl.populate_gs_key = iris_populate_gs_key;
8122   screen->vtbl.populate_fs_key = iris_populate_fs_key;
8123   screen->vtbl.populate_cs_key = iris_populate_cs_key;
8124   screen->vtbl.lost_genx_state = iris_lost_genx_state;
8125}
8126
8127void
8128genX(init_state)(struct iris_context *ice)
8129{
8130   struct pipe_context *ctx = &ice->ctx;
8131   struct iris_screen *screen = (struct iris_screen *)ctx->screen;
8132
8133   ctx->create_blend_state = iris_create_blend_state;
8134   ctx->create_depth_stencil_alpha_state = iris_create_zsa_state;
8135   ctx->create_rasterizer_state = iris_create_rasterizer_state;
8136   ctx->create_sampler_state = iris_create_sampler_state;
8137   ctx->create_sampler_view = iris_create_sampler_view;
8138   ctx->create_surface = iris_create_surface;
8139   ctx->create_vertex_elements_state = iris_create_vertex_elements;
8140   ctx->bind_blend_state = iris_bind_blend_state;
8141   ctx->bind_depth_stencil_alpha_state = iris_bind_zsa_state;
8142   ctx->bind_sampler_states = iris_bind_sampler_states;
8143   ctx->bind_rasterizer_state = iris_bind_rasterizer_state;
8144   ctx->bind_vertex_elements_state = iris_bind_vertex_elements_state;
8145   ctx->delete_blend_state = iris_delete_state;
8146   ctx->delete_depth_stencil_alpha_state = iris_delete_state;
8147   ctx->delete_rasterizer_state = iris_delete_state;
8148   ctx->delete_sampler_state = iris_delete_state;
8149   ctx->delete_vertex_elements_state = iris_delete_state;
8150   ctx->set_blend_color = iris_set_blend_color;
8151   ctx->set_clip_state = iris_set_clip_state;
8152   ctx->set_constant_buffer = iris_set_constant_buffer;
8153   ctx->set_shader_buffers = iris_set_shader_buffers;
8154   ctx->set_shader_images = iris_set_shader_images;
8155   ctx->set_sampler_views = iris_set_sampler_views;
8156   ctx->set_compute_resources = iris_set_compute_resources;
8157   ctx->set_global_binding = iris_set_global_binding;
8158   ctx->set_tess_state = iris_set_tess_state;
8159   ctx->set_patch_vertices = iris_set_patch_vertices;
8160   ctx->set_framebuffer_state = iris_set_framebuffer_state;
8161   ctx->set_polygon_stipple = iris_set_polygon_stipple;
8162   ctx->set_sample_mask = iris_set_sample_mask;
8163   ctx->set_scissor_states = iris_set_scissor_states;
8164   ctx->set_stencil_ref = iris_set_stencil_ref;
8165   ctx->set_vertex_buffers = iris_set_vertex_buffers;
8166   ctx->set_viewport_states = iris_set_viewport_states;
8167   ctx->sampler_view_destroy = iris_sampler_view_destroy;
8168   ctx->surface_destroy = iris_surface_destroy;
8169   ctx->draw_vbo = iris_draw_vbo;
8170   ctx->launch_grid = iris_launch_grid;
8171   ctx->create_stream_output_target = iris_create_stream_output_target;
8172   ctx->stream_output_target_destroy = iris_stream_output_target_destroy;
8173   ctx->set_stream_output_targets = iris_set_stream_output_targets;
8174   ctx->set_frontend_noop = iris_set_frontend_noop;
8175
8176   ice->state.dirty = ~0ull;
8177   ice->state.stage_dirty = ~0ull;
8178
8179   ice->state.statistics_counters_enabled = true;
8180
8181   ice->state.sample_mask = 0xffff;
8182   ice->state.num_viewports = 1;
8183   ice->state.prim_mode = PIPE_PRIM_MAX;
8184   ice->state.genx = calloc(1, sizeof(struct iris_genx_state));
8185   ice->draw.derived_params.drawid = -1;
8186
8187   /* Make a 1x1x1 null surface for unbound textures */
8188   void *null_surf_map =
8189      upload_state(ice->state.surface_uploader, &ice->state.unbound_tex,
8190                   4 * GENX(RENDER_SURFACE_STATE_length), 64);
8191   isl_null_fill_state(&screen->isl_dev, null_surf_map,
8192                       .size = isl_extent3d(1, 1, 1));
8193   ice->state.unbound_tex.offset +=
8194      iris_bo_offset_from_base_address(iris_resource_bo(ice->state.unbound_tex.res));
8195
8196   /* Default all scissor rectangles to be empty regions. */
8197   for (int i = 0; i < IRIS_MAX_VIEWPORTS; i++) {
8198      ice->state.scissors[i] = (struct pipe_scissor_state) {
8199         .minx = 1, .maxx = 0, .miny = 1, .maxy = 0,
8200      };
8201   }
8202}
8203