1/*
2 * Copyright © 2015 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "anv_private.h"
25
26#include "genxml/gen_macros.h"
27#include "genxml/genX_pack.h"
28#include "genxml/gen_rt_pack.h"
29
30#include "common/intel_l3_config.h"
31#include "common/intel_sample_positions.h"
32#include "nir/nir_xfb_info.h"
33#include "vk_util.h"
34#include "vk_format.h"
35#include "vk_log.h"
36
37static uint32_t
38vertex_element_comp_control(enum isl_format format, unsigned comp)
39{
40   uint8_t bits;
41   switch (comp) {
42   case 0: bits = isl_format_layouts[format].channels.r.bits; break;
43   case 1: bits = isl_format_layouts[format].channels.g.bits; break;
44   case 2: bits = isl_format_layouts[format].channels.b.bits; break;
45   case 3: bits = isl_format_layouts[format].channels.a.bits; break;
46   default: unreachable("Invalid component");
47   }
48
49   /*
50    * Take in account hardware restrictions when dealing with 64-bit floats.
51    *
52    * From Broadwell spec, command reference structures, page 586:
53    *  "When SourceElementFormat is set to one of the *64*_PASSTHRU formats,
54    *   64-bit components are stored * in the URB without any conversion. In
55    *   this case, vertex elements must be written as 128 or 256 bits, with
56    *   VFCOMP_STORE_0 being used to pad the output as required. E.g., if
57    *   R64_PASSTHRU is used to copy a 64-bit Red component into the URB,
58    *   Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3
59    *   set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or
60    *   Components 1-3 must be specified as VFCOMP_STORE_0 in order to output
61    *   a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires
62    *   Component 3 to be specified as VFCOMP_STORE_0 in order to output a
63    *   256-bit vertex element."
64    */
65   if (bits) {
66      return VFCOMP_STORE_SRC;
67   } else if (comp >= 2 &&
68              !isl_format_layouts[format].channels.b.bits &&
69              isl_format_layouts[format].channels.r.type == ISL_RAW) {
70      /* When emitting 64-bit attributes, we need to write either 128 or 256
71       * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and
72       * VFCOMP_STORE_0 to pad the written chunk */
73      return VFCOMP_NOSTORE;
74   } else if (comp < 3 ||
75              isl_format_layouts[format].channels.r.type == ISL_RAW) {
76      /* Note we need to pad with value 0, not 1, due hardware restrictions
77       * (see comment above) */
78      return VFCOMP_STORE_0;
79   } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
80            isl_format_layouts[format].channels.r.type == ISL_SINT) {
81      assert(comp == 3);
82      return VFCOMP_STORE_1_INT;
83   } else {
84      assert(comp == 3);
85      return VFCOMP_STORE_1_FP;
86   }
87}
88
89static void
90emit_vertex_input(struct anv_graphics_pipeline *pipeline,
91                  const VkPipelineVertexInputStateCreateInfo *info)
92{
93   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
94
95   /* Pull inputs_read out of the VS prog data */
96   const uint64_t inputs_read = vs_prog_data->inputs_read;
97   const uint64_t double_inputs_read =
98      vs_prog_data->double_inputs_read & inputs_read;
99   assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
100   const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
101   const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
102   const bool needs_svgs_elem = vs_prog_data->uses_vertexid ||
103                                vs_prog_data->uses_instanceid ||
104                                vs_prog_data->uses_firstvertex ||
105                                vs_prog_data->uses_baseinstance;
106
107   uint32_t elem_count = __builtin_popcount(elements) -
108      __builtin_popcount(elements_double) / 2;
109
110   const uint32_t total_elems =
111      MAX2(1, elem_count + needs_svgs_elem + vs_prog_data->uses_drawid);
112
113   uint32_t *p;
114
115   const uint32_t num_dwords = 1 + total_elems * 2;
116   p = anv_batch_emitn(&pipeline->base.batch, num_dwords,
117                       GENX(3DSTATE_VERTEX_ELEMENTS));
118   if (!p)
119      return;
120
121   for (uint32_t i = 0; i < total_elems; i++) {
122      /* The SKL docs for VERTEX_ELEMENT_STATE say:
123       *
124       *    "All elements must be valid from Element[0] to the last valid
125       *    element. (I.e. if Element[2] is valid then Element[1] and
126       *    Element[0] must also be valid)."
127       *
128       * The SKL docs for 3D_Vertex_Component_Control say:
129       *
130       *    "Don't store this component. (Not valid for Component 0, but can
131       *    be used for Component 1-3)."
132       *
133       * So we can't just leave a vertex element blank and hope for the best.
134       * We have to tell the VF hardware to put something in it; so we just
135       * store a bunch of zero.
136       *
137       * TODO: Compact vertex elements so we never end up with holes.
138       */
139      struct GENX(VERTEX_ELEMENT_STATE) element = {
140         .Valid = true,
141         .Component0Control = VFCOMP_STORE_0,
142         .Component1Control = VFCOMP_STORE_0,
143         .Component2Control = VFCOMP_STORE_0,
144         .Component3Control = VFCOMP_STORE_0,
145      };
146      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + i * 2], &element);
147   }
148
149   for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) {
150      const VkVertexInputAttributeDescription *desc =
151         &info->pVertexAttributeDescriptions[i];
152      enum isl_format format = anv_get_isl_format(&pipeline->base.device->info,
153                                                  desc->format,
154                                                  VK_IMAGE_ASPECT_COLOR_BIT,
155                                                  VK_IMAGE_TILING_LINEAR);
156
157      assert(desc->binding < MAX_VBS);
158
159      if ((elements & (1 << desc->location)) == 0)
160         continue; /* Binding unused */
161
162      uint32_t slot =
163         __builtin_popcount(elements & ((1 << desc->location) - 1)) -
164         DIV_ROUND_UP(__builtin_popcount(elements_double &
165                                        ((1 << desc->location) -1)), 2);
166
167      struct GENX(VERTEX_ELEMENT_STATE) element = {
168         .VertexBufferIndex = desc->binding,
169         .Valid = true,
170         .SourceElementFormat = format,
171         .EdgeFlagEnable = false,
172         .SourceElementOffset = desc->offset,
173         .Component0Control = vertex_element_comp_control(format, 0),
174         .Component1Control = vertex_element_comp_control(format, 1),
175         .Component2Control = vertex_element_comp_control(format, 2),
176         .Component3Control = vertex_element_comp_control(format, 3),
177      };
178      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element);
179
180#if GFX_VER >= 8
181      /* On Broadwell and later, we have a separate VF_INSTANCING packet
182       * that controls instancing.  On Haswell and prior, that's part of
183       * VERTEX_BUFFER_STATE which we emit later.
184       */
185      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
186         vfi.InstancingEnable = pipeline->vb[desc->binding].instanced;
187         vfi.VertexElementIndex = slot;
188         vfi.InstanceDataStepRate =
189            pipeline->vb[desc->binding].instance_divisor;
190      }
191#endif
192   }
193
194   const uint32_t id_slot = elem_count;
195   if (needs_svgs_elem) {
196      /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
197       *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
198       *    Control field is set to something other than VFCOMP_STORE_SRC,
199       *    no higher-numbered Component Control fields may be set to
200       *    VFCOMP_STORE_SRC"
201       *
202       * This means, that if we have BaseInstance, we need BaseVertex as
203       * well.  Just do all or nothing.
204       */
205      uint32_t base_ctrl = (vs_prog_data->uses_firstvertex ||
206                            vs_prog_data->uses_baseinstance) ?
207                           VFCOMP_STORE_SRC : VFCOMP_STORE_0;
208
209      struct GENX(VERTEX_ELEMENT_STATE) element = {
210         .VertexBufferIndex = ANV_SVGS_VB_INDEX,
211         .Valid = true,
212         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
213         .Component0Control = base_ctrl,
214         .Component1Control = base_ctrl,
215#if GFX_VER >= 8
216         .Component2Control = VFCOMP_STORE_0,
217         .Component3Control = VFCOMP_STORE_0,
218#else
219         .Component2Control = VFCOMP_STORE_VID,
220         .Component3Control = VFCOMP_STORE_IID,
221#endif
222      };
223      GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element);
224
225#if GFX_VER >= 8
226      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
227         vfi.VertexElementIndex = id_slot;
228      }
229#endif
230   }
231
232#if GFX_VER >= 8
233   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_SGVS), sgvs) {
234      sgvs.VertexIDEnable              = vs_prog_data->uses_vertexid;
235      sgvs.VertexIDComponentNumber     = 2;
236      sgvs.VertexIDElementOffset       = id_slot;
237      sgvs.InstanceIDEnable            = vs_prog_data->uses_instanceid;
238      sgvs.InstanceIDComponentNumber   = 3;
239      sgvs.InstanceIDElementOffset     = id_slot;
240   }
241#endif
242
243   const uint32_t drawid_slot = elem_count + needs_svgs_elem;
244   if (vs_prog_data->uses_drawid) {
245      struct GENX(VERTEX_ELEMENT_STATE) element = {
246         .VertexBufferIndex = ANV_DRAWID_VB_INDEX,
247         .Valid = true,
248         .SourceElementFormat = ISL_FORMAT_R32_UINT,
249         .Component0Control = VFCOMP_STORE_SRC,
250         .Component1Control = VFCOMP_STORE_0,
251         .Component2Control = VFCOMP_STORE_0,
252         .Component3Control = VFCOMP_STORE_0,
253      };
254      GENX(VERTEX_ELEMENT_STATE_pack)(NULL,
255                                      &p[1 + drawid_slot * 2],
256                                      &element);
257
258#if GFX_VER >= 8
259      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_INSTANCING), vfi) {
260         vfi.VertexElementIndex = drawid_slot;
261      }
262#endif
263   }
264}
265
266void
267genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch,
268                     const struct intel_l3_config *l3_config,
269                     VkShaderStageFlags active_stages,
270                     const unsigned entry_size[4],
271                     enum intel_urb_deref_block_size *deref_block_size)
272{
273   const struct intel_device_info *devinfo = &device->info;
274
275   unsigned entries[4];
276   unsigned start[4];
277   bool constrained;
278   intel_get_urb_config(devinfo, l3_config,
279                        active_stages &
280                           VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT,
281                        active_stages & VK_SHADER_STAGE_GEOMETRY_BIT,
282                        entry_size, entries, start, deref_block_size,
283                        &constrained);
284
285#if GFX_VERx10 == 70
286   /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1:
287    *
288    *    "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall
289    *    needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
290    *    3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
291    *    3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL
292    *    needs to be sent before any combination of VS associated 3DSTATE."
293    */
294   anv_batch_emit(batch, GFX7_PIPE_CONTROL, pc) {
295      pc.DepthStallEnable  = true;
296      pc.PostSyncOperation = WriteImmediateData;
297      pc.Address           = device->workaround_address;
298   }
299#endif
300
301   for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) {
302      anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) {
303         urb._3DCommandSubOpcode      += i;
304         urb.VSURBStartingAddress      = start[i];
305         urb.VSURBEntryAllocationSize  = entry_size[i] - 1;
306         urb.VSNumberofURBEntries      = entries[i];
307      }
308   }
309}
310
311static void
312emit_urb_setup(struct anv_graphics_pipeline *pipeline,
313               enum intel_urb_deref_block_size *deref_block_size)
314{
315   unsigned entry_size[4];
316   for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) {
317      const struct brw_vue_prog_data *prog_data =
318         !anv_pipeline_has_stage(pipeline, i) ? NULL :
319         (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data;
320
321      entry_size[i] = prog_data ? prog_data->urb_entry_size : 1;
322   }
323
324   genX(emit_urb_setup)(pipeline->base.device, &pipeline->base.batch,
325                        pipeline->base.l3_config,
326                        pipeline->active_stages, entry_size,
327                        deref_block_size);
328}
329
330static void
331emit_3dstate_sbe(struct anv_graphics_pipeline *pipeline)
332{
333   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
334
335   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
336      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE), sbe);
337#if GFX_VER >= 8
338      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ), sbe);
339#endif
340      return;
341   }
342
343   struct GENX(3DSTATE_SBE) sbe = {
344      GENX(3DSTATE_SBE_header),
345      .AttributeSwizzleEnable = true,
346      .PointSpriteTextureCoordinateOrigin = UPPERLEFT,
347      .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs,
348      .ConstantInterpolationEnable = wm_prog_data->flat_inputs,
349   };
350
351#if GFX_VER >= 9
352   for (unsigned i = 0; i < 32; i++)
353      sbe.AttributeActiveComponentFormat[i] = ACF_XYZW;
354#endif
355
356#if GFX_VER >= 8
357   /* On Broadwell, they broke 3DSTATE_SBE into two packets */
358   struct GENX(3DSTATE_SBE_SWIZ) swiz = {
359      GENX(3DSTATE_SBE_SWIZ_header),
360   };
361#else
362#  define swiz sbe
363#endif
364
365   if (anv_pipeline_is_primitive(pipeline)) {
366      const struct brw_vue_map *fs_input_map =
367         &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map;
368
369      int first_slot = brw_compute_first_urb_slot_required(wm_prog_data->inputs,
370                                                           fs_input_map);
371      assert(first_slot % 2 == 0);
372      unsigned urb_entry_read_offset = first_slot / 2;
373      int max_source_attr = 0;
374      for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
375         uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
376         int input_index = wm_prog_data->urb_setup[attr];
377
378         assert(0 <= input_index);
379
380         /* gl_Viewport, gl_Layer and FragmentShadingRateKHR are stored in the
381          * VUE header
382          */
383         if (attr == VARYING_SLOT_VIEWPORT ||
384             attr == VARYING_SLOT_LAYER ||
385             attr == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
386            continue;
387         }
388
389         if (attr == VARYING_SLOT_PNTC) {
390            sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
391            continue;
392         }
393
394         const int slot = fs_input_map->varying_to_slot[attr];
395
396         if (slot == -1) {
397            /* This attribute does not exist in the VUE--that means that the
398             * vertex shader did not write to it.  It could be that it's a
399             * regular varying read by the fragment shader but not written by
400             * the vertex shader or it's gl_PrimitiveID. In the first case the
401             * value is undefined, in the second it needs to be
402             * gl_PrimitiveID.
403             */
404            swiz.Attribute[input_index].ConstantSource = PRIM_ID;
405            swiz.Attribute[input_index].ComponentOverrideX = true;
406            swiz.Attribute[input_index].ComponentOverrideY = true;
407            swiz.Attribute[input_index].ComponentOverrideZ = true;
408            swiz.Attribute[input_index].ComponentOverrideW = true;
409            continue;
410         }
411
412         /* We have to subtract two slots to accout for the URB entry output
413          * read offset in the VS and GS stages.
414          */
415         const int source_attr = slot - 2 * urb_entry_read_offset;
416         assert(source_attr >= 0 && source_attr < 32);
417         max_source_attr = MAX2(max_source_attr, source_attr);
418         /* The hardware can only do overrides on 16 overrides at a time, and the
419          * other up to 16 have to be lined up so that the input index = the
420          * output index. We'll need to do some tweaking to make sure that's the
421          * case.
422          */
423         if (input_index < 16)
424            swiz.Attribute[input_index].SourceAttribute = source_attr;
425         else
426            assert(source_attr == input_index);
427      }
428
429      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
430      sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2);
431#if GFX_VER >= 8
432      sbe.ForceVertexURBEntryReadOffset = true;
433      sbe.ForceVertexURBEntryReadLength = true;
434#endif
435   }
436
437   uint32_t *dw = anv_batch_emit_dwords(&pipeline->base.batch,
438                                        GENX(3DSTATE_SBE_length));
439   if (!dw)
440      return;
441   GENX(3DSTATE_SBE_pack)(&pipeline->base.batch, dw, &sbe);
442
443#if GFX_VER >= 8
444   dw = anv_batch_emit_dwords(&pipeline->base.batch, GENX(3DSTATE_SBE_SWIZ_length));
445   if (!dw)
446      return;
447   GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->base.batch, dw, &swiz);
448#endif
449}
450
451/** Returns the final polygon mode for rasterization
452 *
453 * This function takes into account polygon mode, primitive topology and the
454 * different shader stages which might generate their own type of primitives.
455 */
456VkPolygonMode
457genX(raster_polygon_mode)(struct anv_graphics_pipeline *pipeline,
458                          VkPrimitiveTopology primitive_topology)
459{
460   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
461      switch (get_gs_prog_data(pipeline)->output_topology) {
462      case _3DPRIM_POINTLIST:
463         return VK_POLYGON_MODE_POINT;
464
465      case _3DPRIM_LINELIST:
466      case _3DPRIM_LINESTRIP:
467      case _3DPRIM_LINELOOP:
468         return VK_POLYGON_MODE_LINE;
469
470      case _3DPRIM_TRILIST:
471      case _3DPRIM_TRIFAN:
472      case _3DPRIM_TRISTRIP:
473      case _3DPRIM_RECTLIST:
474      case _3DPRIM_QUADLIST:
475      case _3DPRIM_QUADSTRIP:
476      case _3DPRIM_POLYGON:
477         return pipeline->polygon_mode;
478      }
479      unreachable("Unsupported GS output topology");
480   } else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
481      switch (get_tes_prog_data(pipeline)->output_topology) {
482      case BRW_TESS_OUTPUT_TOPOLOGY_POINT:
483         return VK_POLYGON_MODE_POINT;
484
485      case BRW_TESS_OUTPUT_TOPOLOGY_LINE:
486         return VK_POLYGON_MODE_LINE;
487
488      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW:
489      case BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW:
490         return pipeline->polygon_mode;
491      }
492      unreachable("Unsupported TCS output topology");
493   } else {
494      switch (primitive_topology) {
495      case VK_PRIMITIVE_TOPOLOGY_POINT_LIST:
496         return VK_POLYGON_MODE_POINT;
497
498      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST:
499      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP:
500      case VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY:
501      case VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY:
502         return VK_POLYGON_MODE_LINE;
503
504      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST:
505      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP:
506      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN:
507      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY:
508      case VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY:
509         return pipeline->polygon_mode;
510
511      default:
512         unreachable("Unsupported primitive topology");
513      }
514   }
515}
516
517uint32_t
518genX(ms_rasterization_mode)(struct anv_graphics_pipeline *pipeline,
519                            VkPolygonMode raster_mode)
520{
521#if GFX_VER <= 7
522   if (raster_mode == VK_POLYGON_MODE_LINE) {
523      switch (pipeline->line_mode) {
524      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
525         return MSRASTMODE_ON_PATTERN;
526
527      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
528      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
529         return MSRASTMODE_OFF_PIXEL;
530
531      default:
532         unreachable("Unsupported line rasterization mode");
533      }
534   } else {
535      return pipeline->rasterization_samples > 1 ?
536         MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL;
537   }
538#else
539   unreachable("Only on gen7");
540#endif
541}
542
543static VkProvokingVertexModeEXT
544vk_provoking_vertex_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
545{
546   const VkPipelineRasterizationProvokingVertexStateCreateInfoEXT *rs_pv_info =
547      vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_PROVOKING_VERTEX_STATE_CREATE_INFO_EXT);
548
549   return rs_pv_info == NULL ? VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT :
550                               rs_pv_info->provokingVertexMode;
551}
552
553const uint32_t genX(vk_to_intel_cullmode)[] = {
554   [VK_CULL_MODE_NONE]                       = CULLMODE_NONE,
555   [VK_CULL_MODE_FRONT_BIT]                  = CULLMODE_FRONT,
556   [VK_CULL_MODE_BACK_BIT]                   = CULLMODE_BACK,
557   [VK_CULL_MODE_FRONT_AND_BACK]             = CULLMODE_BOTH
558};
559
560const uint32_t genX(vk_to_intel_fillmode)[] = {
561   [VK_POLYGON_MODE_FILL]                    = FILL_MODE_SOLID,
562   [VK_POLYGON_MODE_LINE]                    = FILL_MODE_WIREFRAME,
563   [VK_POLYGON_MODE_POINT]                   = FILL_MODE_POINT,
564};
565
566const uint32_t genX(vk_to_intel_front_face)[] = {
567   [VK_FRONT_FACE_COUNTER_CLOCKWISE]         = 1,
568   [VK_FRONT_FACE_CLOCKWISE]                 = 0
569};
570
571#if GFX_VER >= 9
572static VkConservativeRasterizationModeEXT
573vk_conservative_rasterization_mode(const VkPipelineRasterizationStateCreateInfo *rs_info)
574{
575   const VkPipelineRasterizationConservativeStateCreateInfoEXT *cr =
576      vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_CONSERVATIVE_STATE_CREATE_INFO_EXT);
577
578   return cr ? cr->conservativeRasterizationMode :
579               VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
580}
581#endif
582
583void
584genX(rasterization_mode)(VkPolygonMode raster_mode,
585                         VkLineRasterizationModeEXT line_mode,
586                         float line_width,
587                         uint32_t *api_mode,
588                         bool *msaa_rasterization_enable)
589{
590#if GFX_VER >= 8
591   if (raster_mode == VK_POLYGON_MODE_LINE) {
592      /* Unfortunately, configuring our line rasterization hardware on gfx8
593       * and later is rather painful.  Instead of giving us bits to tell the
594       * hardware what line mode to use like we had on gfx7, we now have an
595       * arcane combination of API Mode and MSAA enable bits which do things
596       * in a table which are expected to magically put the hardware into the
597       * right mode for your API.  Sadly, Vulkan isn't any of the APIs the
598       * hardware people thought of so nothing works the way you want it to.
599       *
600       * Look at the table titled "Multisample Rasterization Modes" in Vol 7
601       * of the Skylake PRM for more details.
602       */
603      switch (line_mode) {
604      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_EXT:
605         *api_mode = DX100;
606#if GFX_VER <= 9
607         /* Prior to ICL, the algorithm the HW uses to draw wide lines
608          * doesn't quite match what the CTS expects, at least for rectangular
609          * lines, so we set this to false here, making it draw parallelograms
610          * instead, which work well enough.
611          */
612         *msaa_rasterization_enable = line_width < 1.0078125;
613#else
614         *msaa_rasterization_enable = true;
615#endif
616         break;
617
618      case VK_LINE_RASTERIZATION_MODE_RECTANGULAR_SMOOTH_EXT:
619      case VK_LINE_RASTERIZATION_MODE_BRESENHAM_EXT:
620         *api_mode = DX9OGL;
621         *msaa_rasterization_enable = false;
622         break;
623
624      default:
625         unreachable("Unsupported line rasterization mode");
626      }
627   } else {
628      *api_mode = DX100;
629      *msaa_rasterization_enable = true;
630   }
631#else
632   unreachable("Invalid call");
633#endif
634}
635
636static void
637emit_rs_state(struct anv_graphics_pipeline *pipeline,
638              const VkPipelineInputAssemblyStateCreateInfo *ia_info,
639              const VkPipelineRasterizationStateCreateInfo *rs_info,
640              const VkPipelineMultisampleStateCreateInfo *ms_info,
641              const VkPipelineRasterizationLineStateCreateInfoEXT *line_info,
642              const uint32_t dynamic_states,
643              const struct anv_render_pass *pass,
644              const struct anv_subpass *subpass,
645              enum intel_urb_deref_block_size urb_deref_block_size)
646{
647   struct GENX(3DSTATE_SF) sf = {
648      GENX(3DSTATE_SF_header),
649   };
650
651   sf.ViewportTransformEnable = true;
652   sf.StatisticsEnable = true;
653   sf.VertexSubPixelPrecisionSelect = _8Bit;
654   sf.AALineDistanceMode = true;
655
656   switch (vk_provoking_vertex_mode(rs_info)) {
657   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
658      sf.TriangleStripListProvokingVertexSelect = 0;
659      sf.LineStripListProvokingVertexSelect = 0;
660      sf.TriangleFanProvokingVertexSelect = 1;
661      break;
662
663   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
664      sf.TriangleStripListProvokingVertexSelect = 2;
665      sf.LineStripListProvokingVertexSelect = 1;
666      sf.TriangleFanProvokingVertexSelect = 2;
667      break;
668
669   default:
670      unreachable("Invalid provoking vertex mode");
671   }
672
673#if GFX_VERx10 == 75
674   sf.LineStippleEnable = line_info && line_info->stippledLineEnable;
675#endif
676
677#if GFX_VER >= 12
678   sf.DerefBlockSize = urb_deref_block_size;
679#endif
680
681   if (anv_pipeline_is_primitive(pipeline)) {
682      const struct brw_vue_prog_data *last_vue_prog_data =
683         anv_pipeline_get_last_vue_prog_data(pipeline);
684
685      if (last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
686         sf.PointWidthSource = Vertex;
687      } else {
688         sf.PointWidthSource = State;
689         sf.PointWidth = 1.0;
690      }
691   }
692
693#if GFX_VER >= 8
694   struct GENX(3DSTATE_RASTER) raster = {
695      GENX(3DSTATE_RASTER_header),
696   };
697#else
698#  define raster sf
699#endif
700
701   VkPolygonMode raster_mode =
702      genX(raster_polygon_mode)(pipeline, ia_info->topology);
703   bool dynamic_primitive_topology =
704      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
705
706   /* For details on 3DSTATE_RASTER multisample state, see the BSpec table
707    * "Multisample Modes State".
708    */
709#if GFX_VER >= 8
710   if (!dynamic_primitive_topology)
711      genX(rasterization_mode)(raster_mode, pipeline->line_mode,
712                               rs_info->lineWidth,
713                               &raster.APIMode,
714                               &raster.DXMultisampleRasterizationEnable);
715
716   /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix
717    * computations.  If we ever set this bit to a different value, they will
718    * need to be updated accordingly.
719    */
720   raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0;
721   raster.ForceMultisampling = false;
722#else
723   uint32_t ms_rast_mode = 0;
724
725   if (!dynamic_primitive_topology)
726      ms_rast_mode = genX(ms_rasterization_mode)(pipeline, raster_mode);
727
728   raster.MultisampleRasterizationMode = ms_rast_mode;
729#endif
730
731   raster.AntialiasingEnable =
732      dynamic_primitive_topology ? 0 :
733      anv_rasterization_aa_mode(raster_mode, pipeline->line_mode);
734
735   raster.FrontWinding =
736      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_FRONT_FACE ?
737         0 : genX(vk_to_intel_front_face)[rs_info->frontFace];
738   raster.CullMode =
739      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_CULL_MODE ?
740         0 : genX(vk_to_intel_cullmode)[rs_info->cullMode];
741
742   raster.FrontFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
743   raster.BackFaceFillMode = genX(vk_to_intel_fillmode)[rs_info->polygonMode];
744   raster.ScissorRectangleEnable = true;
745
746#if GFX_VER >= 9
747   /* GFX9+ splits ViewportZClipTestEnable into near and far enable bits */
748   raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable;
749   raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable;
750#elif GFX_VER >= 8
751   raster.ViewportZClipTestEnable = pipeline->depth_clip_enable;
752#endif
753
754#if GFX_VER >= 9
755   raster.ConservativeRasterizationEnable =
756      vk_conservative_rasterization_mode(rs_info) !=
757         VK_CONSERVATIVE_RASTERIZATION_MODE_DISABLED_EXT;
758#endif
759
760   bool depth_bias_enable =
761      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_BIAS_ENABLE ?
762         0 : rs_info->depthBiasEnable;
763
764   raster.GlobalDepthOffsetEnableSolid = depth_bias_enable;
765   raster.GlobalDepthOffsetEnableWireframe = depth_bias_enable;
766   raster.GlobalDepthOffsetEnablePoint = depth_bias_enable;
767
768#if GFX_VER == 7
769   /* Gfx7 requires that we provide the depth format in 3DSTATE_SF so that it
770    * can get the depth offsets correct.
771    */
772   if (subpass->depth_stencil_attachment) {
773      VkFormat vk_format =
774         pass->attachments[subpass->depth_stencil_attachment->attachment].format;
775      assert(vk_format_is_depth_or_stencil(vk_format));
776      if (vk_format_aspects(vk_format) & VK_IMAGE_ASPECT_DEPTH_BIT) {
777         enum isl_format isl_format =
778            anv_get_isl_format(&pipeline->base.device->info, vk_format,
779                               VK_IMAGE_ASPECT_DEPTH_BIT,
780                               VK_IMAGE_TILING_OPTIMAL);
781         sf.DepthBufferSurfaceFormat =
782            isl_format_get_depth_format(isl_format, false);
783      }
784   }
785#endif
786
787#if GFX_VER >= 8
788   GENX(3DSTATE_SF_pack)(NULL, pipeline->gfx8.sf, &sf);
789   GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gfx8.raster, &raster);
790#else
791#  undef raster
792   GENX(3DSTATE_SF_pack)(NULL, &pipeline->gfx7.sf, &sf);
793#endif
794}
795
796static void
797emit_ms_state(struct anv_graphics_pipeline *pipeline,
798              const VkPipelineMultisampleStateCreateInfo *info,
799              uint32_t dynamic_states)
800{
801   /* Only lookup locations if the extensions is active, otherwise the default
802    * ones will be used either at device initialization time or through
803    * 3DSTATE_MULTISAMPLE on Gfx7/7.5 by passing NULL locations.
804    */
805   if (pipeline->base.device->vk.enabled_extensions.EXT_sample_locations) {
806      /* If the sample locations are dynamic, 3DSTATE_MULTISAMPLE on Gfx7/7.5
807       * will be emitted dynamically, so skip it here. On Gfx8+
808       * 3DSTATE_SAMPLE_PATTERN will be emitted dynamically, so skip it here.
809       */
810      if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SAMPLE_LOCATIONS)) {
811#if GFX_VER >= 8
812         genX(emit_sample_pattern)(&pipeline->base.batch,
813                                   pipeline->dynamic_state.sample_locations.samples,
814                                   pipeline->dynamic_state.sample_locations.locations);
815#endif
816      }
817
818      genX(emit_multisample)(&pipeline->base.batch,
819                             pipeline->dynamic_state.sample_locations.samples,
820                             pipeline->dynamic_state.sample_locations.locations);
821   } else {
822      /* On Gfx8+ 3DSTATE_MULTISAMPLE does not hold anything we need to modify
823       * for sample locations, so we don't have to emit it dynamically.
824       */
825#if GFX_VER >= 8
826      genX(emit_multisample)(&pipeline->base.batch,
827                             info ? info->rasterizationSamples : 1,
828                             NULL);
829#endif
830   }
831
832   /* From the Vulkan 1.0 spec:
833    *    If pSampleMask is NULL, it is treated as if the mask has all bits
834    *    enabled, i.e. no coverage is removed from fragments.
835    *
836    * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits.
837    */
838#if GFX_VER >= 8
839   uint32_t sample_mask = 0xffff;
840#else
841   uint32_t sample_mask = 0xff;
842#endif
843
844   if (info && info->pSampleMask)
845      sample_mask &= info->pSampleMask[0];
846
847   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_SAMPLE_MASK), sm) {
848      sm.SampleMask = sample_mask;
849   }
850
851   pipeline->cps_state = ANV_STATE_NULL;
852#if GFX_VER >= 11
853   if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_SHADING_RATE) &&
854       pipeline->base.device->vk.enabled_extensions.KHR_fragment_shading_rate) {
855#if GFX_VER >= 12
856      struct anv_device *device = pipeline->base.device;
857      const uint32_t num_dwords =
858         GENX(CPS_STATE_length) * 4 * pipeline->dynamic_state.viewport.count;
859      pipeline->cps_state =
860         anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords, 32);
861#endif
862
863      genX(emit_shading_rate)(&pipeline->base.batch,
864                              pipeline,
865                              pipeline->cps_state,
866                              &pipeline->dynamic_state);
867   }
868#endif
869}
870
871const uint32_t genX(vk_to_intel_logic_op)[] = {
872   [VK_LOGIC_OP_COPY]                        = LOGICOP_COPY,
873   [VK_LOGIC_OP_CLEAR]                       = LOGICOP_CLEAR,
874   [VK_LOGIC_OP_AND]                         = LOGICOP_AND,
875   [VK_LOGIC_OP_AND_REVERSE]                 = LOGICOP_AND_REVERSE,
876   [VK_LOGIC_OP_AND_INVERTED]                = LOGICOP_AND_INVERTED,
877   [VK_LOGIC_OP_NO_OP]                       = LOGICOP_NOOP,
878   [VK_LOGIC_OP_XOR]                         = LOGICOP_XOR,
879   [VK_LOGIC_OP_OR]                          = LOGICOP_OR,
880   [VK_LOGIC_OP_NOR]                         = LOGICOP_NOR,
881   [VK_LOGIC_OP_EQUIVALENT]                  = LOGICOP_EQUIV,
882   [VK_LOGIC_OP_INVERT]                      = LOGICOP_INVERT,
883   [VK_LOGIC_OP_OR_REVERSE]                  = LOGICOP_OR_REVERSE,
884   [VK_LOGIC_OP_COPY_INVERTED]               = LOGICOP_COPY_INVERTED,
885   [VK_LOGIC_OP_OR_INVERTED]                 = LOGICOP_OR_INVERTED,
886   [VK_LOGIC_OP_NAND]                        = LOGICOP_NAND,
887   [VK_LOGIC_OP_SET]                         = LOGICOP_SET,
888};
889
890static const uint32_t vk_to_intel_blend[] = {
891   [VK_BLEND_FACTOR_ZERO]                    = BLENDFACTOR_ZERO,
892   [VK_BLEND_FACTOR_ONE]                     = BLENDFACTOR_ONE,
893   [VK_BLEND_FACTOR_SRC_COLOR]               = BLENDFACTOR_SRC_COLOR,
894   [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR]     = BLENDFACTOR_INV_SRC_COLOR,
895   [VK_BLEND_FACTOR_DST_COLOR]               = BLENDFACTOR_DST_COLOR,
896   [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR]     = BLENDFACTOR_INV_DST_COLOR,
897   [VK_BLEND_FACTOR_SRC_ALPHA]               = BLENDFACTOR_SRC_ALPHA,
898   [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA]     = BLENDFACTOR_INV_SRC_ALPHA,
899   [VK_BLEND_FACTOR_DST_ALPHA]               = BLENDFACTOR_DST_ALPHA,
900   [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA]     = BLENDFACTOR_INV_DST_ALPHA,
901   [VK_BLEND_FACTOR_CONSTANT_COLOR]          = BLENDFACTOR_CONST_COLOR,
902   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR,
903   [VK_BLEND_FACTOR_CONSTANT_ALPHA]          = BLENDFACTOR_CONST_ALPHA,
904   [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA,
905   [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE]      = BLENDFACTOR_SRC_ALPHA_SATURATE,
906   [VK_BLEND_FACTOR_SRC1_COLOR]              = BLENDFACTOR_SRC1_COLOR,
907   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR]    = BLENDFACTOR_INV_SRC1_COLOR,
908   [VK_BLEND_FACTOR_SRC1_ALPHA]              = BLENDFACTOR_SRC1_ALPHA,
909   [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA]    = BLENDFACTOR_INV_SRC1_ALPHA,
910};
911
912static const uint32_t vk_to_intel_blend_op[] = {
913   [VK_BLEND_OP_ADD]                         = BLENDFUNCTION_ADD,
914   [VK_BLEND_OP_SUBTRACT]                    = BLENDFUNCTION_SUBTRACT,
915   [VK_BLEND_OP_REVERSE_SUBTRACT]            = BLENDFUNCTION_REVERSE_SUBTRACT,
916   [VK_BLEND_OP_MIN]                         = BLENDFUNCTION_MIN,
917   [VK_BLEND_OP_MAX]                         = BLENDFUNCTION_MAX,
918};
919
920const uint32_t genX(vk_to_intel_compare_op)[] = {
921   [VK_COMPARE_OP_NEVER]                        = PREFILTEROP_NEVER,
922   [VK_COMPARE_OP_LESS]                         = PREFILTEROP_LESS,
923   [VK_COMPARE_OP_EQUAL]                        = PREFILTEROP_EQUAL,
924   [VK_COMPARE_OP_LESS_OR_EQUAL]                = PREFILTEROP_LEQUAL,
925   [VK_COMPARE_OP_GREATER]                      = PREFILTEROP_GREATER,
926   [VK_COMPARE_OP_NOT_EQUAL]                    = PREFILTEROP_NOTEQUAL,
927   [VK_COMPARE_OP_GREATER_OR_EQUAL]             = PREFILTEROP_GEQUAL,
928   [VK_COMPARE_OP_ALWAYS]                       = PREFILTEROP_ALWAYS,
929};
930
931const uint32_t genX(vk_to_intel_stencil_op)[] = {
932   [VK_STENCIL_OP_KEEP]                         = STENCILOP_KEEP,
933   [VK_STENCIL_OP_ZERO]                         = STENCILOP_ZERO,
934   [VK_STENCIL_OP_REPLACE]                      = STENCILOP_REPLACE,
935   [VK_STENCIL_OP_INCREMENT_AND_CLAMP]          = STENCILOP_INCRSAT,
936   [VK_STENCIL_OP_DECREMENT_AND_CLAMP]          = STENCILOP_DECRSAT,
937   [VK_STENCIL_OP_INVERT]                       = STENCILOP_INVERT,
938   [VK_STENCIL_OP_INCREMENT_AND_WRAP]           = STENCILOP_INCR,
939   [VK_STENCIL_OP_DECREMENT_AND_WRAP]           = STENCILOP_DECR,
940};
941
942const uint32_t genX(vk_to_intel_primitive_type)[] = {
943   [VK_PRIMITIVE_TOPOLOGY_POINT_LIST]                    = _3DPRIM_POINTLIST,
944   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST]                     = _3DPRIM_LINELIST,
945   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP]                    = _3DPRIM_LINESTRIP,
946   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST]                 = _3DPRIM_TRILIST,
947   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP]                = _3DPRIM_TRISTRIP,
948   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_FAN]                  = _3DPRIM_TRIFAN,
949   [VK_PRIMITIVE_TOPOLOGY_LINE_LIST_WITH_ADJACENCY]      = _3DPRIM_LINELIST_ADJ,
950   [VK_PRIMITIVE_TOPOLOGY_LINE_STRIP_WITH_ADJACENCY]     = _3DPRIM_LINESTRIP_ADJ,
951   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_LIST_WITH_ADJACENCY]  = _3DPRIM_TRILIST_ADJ,
952   [VK_PRIMITIVE_TOPOLOGY_TRIANGLE_STRIP_WITH_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
953};
954
955/* This function sanitizes the VkStencilOpState by looking at the compare ops
956 * and trying to determine whether or not a given stencil op can ever actually
957 * occur.  Stencil ops which can never occur are set to VK_STENCIL_OP_KEEP.
958 * This function returns true if, after sanitation, any of the stencil ops are
959 * set to something other than VK_STENCIL_OP_KEEP.
960 */
961static bool
962sanitize_stencil_face(VkStencilOpState *face,
963                      VkCompareOp depthCompareOp)
964{
965   /* If compareOp is ALWAYS then the stencil test will never fail and failOp
966    * will never happen.  Set failOp to KEEP in this case.
967    */
968   if (face->compareOp == VK_COMPARE_OP_ALWAYS)
969      face->failOp = VK_STENCIL_OP_KEEP;
970
971   /* If compareOp is NEVER or depthCompareOp is NEVER then one of the depth
972    * or stencil tests will fail and passOp will never happen.
973    */
974   if (face->compareOp == VK_COMPARE_OP_NEVER ||
975       depthCompareOp == VK_COMPARE_OP_NEVER)
976      face->passOp = VK_STENCIL_OP_KEEP;
977
978   /* If compareOp is NEVER or depthCompareOp is ALWAYS then either the
979    * stencil test will fail or the depth test will pass.  In either case,
980    * depthFailOp will never happen.
981    */
982   if (face->compareOp == VK_COMPARE_OP_NEVER ||
983       depthCompareOp == VK_COMPARE_OP_ALWAYS)
984      face->depthFailOp = VK_STENCIL_OP_KEEP;
985
986   return face->failOp != VK_STENCIL_OP_KEEP ||
987          face->depthFailOp != VK_STENCIL_OP_KEEP ||
988          face->passOp != VK_STENCIL_OP_KEEP;
989}
990
991/* Intel hardware is fairly sensitive to whether or not depth/stencil writes
992 * are enabled.  In the presence of discards, it's fairly easy to get into the
993 * non-promoted case which means a fairly big performance hit.  From the Iron
994 * Lake PRM, Vol 2, pt. 1, section 8.4.3.2, "Early Depth Test Cases":
995 *
996 *    "Non-promoted depth (N) is active whenever the depth test can be done
997 *    early but it cannot determine whether or not to write source depth to
998 *    the depth buffer, therefore the depth write must be performed post pixel
999 *    shader. This includes cases where the pixel shader can kill pixels,
1000 *    including via sampler chroma key, as well as cases where the alpha test
1001 *    function is enabled, which kills pixels based on a programmable alpha
1002 *    test. In this case, even if the depth test fails, the pixel cannot be
1003 *    killed if a stencil write is indicated. Whether or not the stencil write
1004 *    happens depends on whether or not the pixel is killed later. In these
1005 *    cases if stencil test fails and stencil writes are off, the pixels can
1006 *    also be killed early. If stencil writes are enabled, the pixels must be
1007 *    treated as Computed depth (described above)."
1008 *
1009 * The same thing as mentioned in the stencil case can happen in the depth
1010 * case as well if it thinks it writes depth but, thanks to the depth test
1011 * being GL_EQUAL, the write doesn't actually matter.  A little extra work
1012 * up-front to try and disable depth and stencil writes can make a big
1013 * difference.
1014 *
1015 * Unfortunately, the way depth and stencil testing is specified, there are
1016 * many case where, regardless of depth/stencil writes being enabled, nothing
1017 * actually gets written due to some other bit of state being set.  This
1018 * function attempts to "sanitize" the depth stencil state and disable writes
1019 * and sometimes even testing whenever possible.
1020 */
1021static void
1022sanitize_ds_state(VkPipelineDepthStencilStateCreateInfo *state,
1023                  bool *stencilWriteEnable,
1024                  VkImageAspectFlags ds_aspects)
1025{
1026   *stencilWriteEnable = state->stencilTestEnable;
1027
1028   /* If the depth test is disabled, we won't be writing anything. Make sure we
1029    * treat the test as always passing later on as well.
1030    *
1031    * Also, the Vulkan spec requires that if either depth or stencil is not
1032    * present, the pipeline is to act as if the test silently passes. In that
1033    * case we won't write either.
1034    */
1035   if (!state->depthTestEnable || !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
1036      state->depthWriteEnable = false;
1037      state->depthCompareOp = VK_COMPARE_OP_ALWAYS;
1038   }
1039
1040   if (!(ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
1041      *stencilWriteEnable = false;
1042      state->front.compareOp = VK_COMPARE_OP_ALWAYS;
1043      state->back.compareOp = VK_COMPARE_OP_ALWAYS;
1044   }
1045
1046   /* If the stencil test is enabled and always fails, then we will never get
1047    * to the depth test so we can just disable the depth test entirely.
1048    */
1049   if (state->stencilTestEnable &&
1050       state->front.compareOp == VK_COMPARE_OP_NEVER &&
1051       state->back.compareOp == VK_COMPARE_OP_NEVER) {
1052      state->depthTestEnable = false;
1053      state->depthWriteEnable = false;
1054   }
1055
1056   /* If depthCompareOp is EQUAL then the value we would be writing to the
1057    * depth buffer is the same as the value that's already there so there's no
1058    * point in writing it.
1059    */
1060   if (state->depthCompareOp == VK_COMPARE_OP_EQUAL)
1061      state->depthWriteEnable = false;
1062
1063   /* If the stencil ops are such that we don't actually ever modify the
1064    * stencil buffer, we should disable writes.
1065    */
1066   if (!sanitize_stencil_face(&state->front, state->depthCompareOp) &&
1067       !sanitize_stencil_face(&state->back, state->depthCompareOp))
1068      *stencilWriteEnable = false;
1069
1070   /* If the depth test always passes and we never write out depth, that's the
1071    * same as if the depth test is disabled entirely.
1072    */
1073   if (state->depthCompareOp == VK_COMPARE_OP_ALWAYS &&
1074       !state->depthWriteEnable)
1075      state->depthTestEnable = false;
1076
1077   /* If the stencil test always passes and we never write out stencil, that's
1078    * the same as if the stencil test is disabled entirely.
1079    */
1080   if (state->front.compareOp == VK_COMPARE_OP_ALWAYS &&
1081       state->back.compareOp == VK_COMPARE_OP_ALWAYS &&
1082       !*stencilWriteEnable)
1083      state->stencilTestEnable = false;
1084}
1085
1086static void
1087emit_ds_state(struct anv_graphics_pipeline *pipeline,
1088              const VkPipelineDepthStencilStateCreateInfo *pCreateInfo,
1089              const uint32_t dynamic_states,
1090              const struct anv_render_pass *pass,
1091              const struct anv_subpass *subpass)
1092{
1093#if GFX_VER == 7
1094#  define depth_stencil_dw pipeline->gfx7.depth_stencil_state
1095#elif GFX_VER == 8
1096#  define depth_stencil_dw pipeline->gfx8.wm_depth_stencil
1097#else
1098#  define depth_stencil_dw pipeline->gfx9.wm_depth_stencil
1099#endif
1100
1101   if (pCreateInfo == NULL) {
1102      /* We're going to OR this together with the dynamic state.  We need
1103       * to make sure it's initialized to something useful.
1104       */
1105      pipeline->writes_stencil = false;
1106      pipeline->stencil_test_enable = false;
1107      pipeline->writes_depth = false;
1108      pipeline->depth_test_enable = false;
1109      pipeline->depth_bounds_test_enable = false;
1110      memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
1111      return;
1112   }
1113
1114   VkImageAspectFlags ds_aspects = 0;
1115   if (subpass->depth_stencil_attachment) {
1116      VkFormat depth_stencil_format =
1117         pass->attachments[subpass->depth_stencil_attachment->attachment].format;
1118      ds_aspects = vk_format_aspects(depth_stencil_format);
1119   }
1120
1121   VkPipelineDepthStencilStateCreateInfo info = *pCreateInfo;
1122   sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);
1123   pipeline->stencil_test_enable = info.stencilTestEnable;
1124   pipeline->writes_depth = info.depthWriteEnable;
1125   pipeline->depth_test_enable = info.depthTestEnable;
1126   pipeline->depth_bounds_test_enable = info.depthBoundsTestEnable;
1127
1128   bool dynamic_stencil_op =
1129      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_OP;
1130
1131#if GFX_VER <= 7
1132   struct GENX(DEPTH_STENCIL_STATE) depth_stencil = {
1133#else
1134   struct GENX(3DSTATE_WM_DEPTH_STENCIL) depth_stencil = {
1135#endif
1136      .DepthTestEnable =
1137         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_TEST_ENABLE ?
1138            0 : info.depthTestEnable,
1139
1140      .DepthBufferWriteEnable =
1141         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_WRITE_ENABLE ?
1142            0 : info.depthWriteEnable,
1143
1144      .DepthTestFunction =
1145         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_DEPTH_COMPARE_OP ?
1146            0 : genX(vk_to_intel_compare_op)[info.depthCompareOp],
1147
1148      .DoubleSidedStencilEnable = true,
1149
1150      .StencilTestEnable =
1151         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_STENCIL_TEST_ENABLE ?
1152            0 : info.stencilTestEnable,
1153
1154      .StencilFailOp = genX(vk_to_intel_stencil_op)[info.front.failOp],
1155      .StencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.front.passOp],
1156      .StencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.front.depthFailOp],
1157      .StencilTestFunction = genX(vk_to_intel_compare_op)[info.front.compareOp],
1158      .BackfaceStencilFailOp = genX(vk_to_intel_stencil_op)[info.back.failOp],
1159      .BackfaceStencilPassDepthPassOp = genX(vk_to_intel_stencil_op)[info.back.passOp],
1160      .BackfaceStencilPassDepthFailOp = genX(vk_to_intel_stencil_op)[info.back.depthFailOp],
1161      .BackfaceStencilTestFunction = genX(vk_to_intel_compare_op)[info.back.compareOp],
1162   };
1163
1164   if (dynamic_stencil_op) {
1165      depth_stencil.StencilFailOp = 0;
1166      depth_stencil.StencilPassDepthPassOp = 0;
1167      depth_stencil.StencilPassDepthFailOp = 0;
1168      depth_stencil.StencilTestFunction = 0;
1169      depth_stencil.BackfaceStencilFailOp = 0;
1170      depth_stencil.BackfaceStencilPassDepthPassOp = 0;
1171      depth_stencil.BackfaceStencilPassDepthFailOp = 0;
1172      depth_stencil.BackfaceStencilTestFunction = 0;
1173   }
1174
1175#if GFX_VER <= 7
1176   GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
1177#else
1178   GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, depth_stencil_dw, &depth_stencil);
1179#endif
1180}
1181
1182static bool
1183is_dual_src_blend_factor(VkBlendFactor factor)
1184{
1185   return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
1186          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
1187          factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
1188          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
1189}
1190
1191static inline uint32_t *
1192write_disabled_blend(uint32_t *state)
1193{
1194   struct GENX(BLEND_STATE_ENTRY) entry = {
1195      .WriteDisableAlpha = true,
1196      .WriteDisableRed = true,
1197      .WriteDisableGreen = true,
1198      .WriteDisableBlue = true,
1199   };
1200   GENX(BLEND_STATE_ENTRY_pack)(NULL, state, &entry);
1201   return state + GENX(BLEND_STATE_ENTRY_length);
1202}
1203
1204static void
1205emit_cb_state(struct anv_graphics_pipeline *pipeline,
1206              const VkPipelineColorBlendStateCreateInfo *info,
1207              const VkPipelineMultisampleStateCreateInfo *ms_info,
1208              uint32_t dynamic_states)
1209{
1210   struct anv_device *device = pipeline->base.device;
1211   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1212
1213   struct GENX(BLEND_STATE) blend_state = {
1214#if GFX_VER >= 8
1215      .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
1216      .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
1217#endif
1218   };
1219
1220   uint32_t surface_count = 0;
1221   struct anv_pipeline_bind_map *map;
1222   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
1223      map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
1224      surface_count = map->surface_count;
1225   }
1226
1227   const uint32_t num_dwords = GENX(BLEND_STATE_length) +
1228      GENX(BLEND_STATE_ENTRY_length) * surface_count;
1229   uint32_t *blend_state_start, *state_pos;
1230
1231   if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
1232                         ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {
1233      const struct intel_device_info *devinfo = &pipeline->base.device->info;
1234      blend_state_start = devinfo->ver >= 8 ?
1235         pipeline->gfx8.blend_state : pipeline->gfx7.blend_state;
1236      pipeline->blend_state = ANV_STATE_NULL;
1237   } else {
1238      pipeline->blend_state =
1239         anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
1240      blend_state_start = pipeline->blend_state.map;
1241   }
1242   state_pos = blend_state_start;
1243
1244   bool has_writeable_rt = false;
1245   state_pos += GENX(BLEND_STATE_length);
1246#if GFX_VER >= 8
1247   struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 };
1248#endif
1249   for (unsigned i = 0; i < surface_count; i++) {
1250      struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
1251
1252      /* All color attachments are at the beginning of the binding table */
1253      if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
1254         break;
1255
1256      /* We can have at most 8 attachments */
1257      assert(i < MAX_RTS);
1258
1259      if (info == NULL || binding->index >= info->attachmentCount) {
1260         state_pos = write_disabled_blend(state_pos);
1261         continue;
1262      }
1263
1264      if ((pipeline->dynamic_state.color_writes & (1u << binding->index)) == 0) {
1265         state_pos = write_disabled_blend(state_pos);
1266         continue;
1267      }
1268
1269      const VkPipelineColorBlendAttachmentState *a =
1270         &info->pAttachments[binding->index];
1271
1272      struct GENX(BLEND_STATE_ENTRY) entry = {
1273#if GFX_VER < 8
1274         .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
1275         .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
1276#endif
1277         .LogicOpEnable = info->logicOpEnable,
1278         .LogicOpFunction = dynamic_states & ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP ?
1279                            0: genX(vk_to_intel_logic_op)[info->logicOp],
1280
1281         /* Vulkan specification 1.2.168, VkLogicOp:
1282          *
1283          *   "Logical operations are controlled by the logicOpEnable and
1284          *    logicOp members of VkPipelineColorBlendStateCreateInfo. If
1285          *    logicOpEnable is VK_TRUE, then a logical operation selected by
1286          *    logicOp is applied between each color attachment and the
1287          *    fragment’s corresponding output value, and blending of all
1288          *    attachments is treated as if it were disabled."
1289          *
1290          * From the Broadwell PRM Volume 2d: Command Reference: Structures:
1291          * BLEND_STATE_ENTRY:
1292          *
1293          *   "Enabling LogicOp and Color Buffer Blending at the same time is
1294          *    UNDEFINED"
1295          */
1296         .ColorBufferBlendEnable = !info->logicOpEnable && a->blendEnable,
1297         .ColorClampRange = COLORCLAMP_RTFORMAT,
1298         .PreBlendColorClampEnable = true,
1299         .PostBlendColorClampEnable = true,
1300         .SourceBlendFactor = vk_to_intel_blend[a->srcColorBlendFactor],
1301         .DestinationBlendFactor = vk_to_intel_blend[a->dstColorBlendFactor],
1302         .ColorBlendFunction = vk_to_intel_blend_op[a->colorBlendOp],
1303         .SourceAlphaBlendFactor = vk_to_intel_blend[a->srcAlphaBlendFactor],
1304         .DestinationAlphaBlendFactor = vk_to_intel_blend[a->dstAlphaBlendFactor],
1305         .AlphaBlendFunction = vk_to_intel_blend_op[a->alphaBlendOp],
1306         .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
1307         .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
1308         .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
1309         .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
1310      };
1311
1312      if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||
1313          a->dstColorBlendFactor != a->dstAlphaBlendFactor ||
1314          a->colorBlendOp != a->alphaBlendOp) {
1315#if GFX_VER >= 8
1316         blend_state.IndependentAlphaBlendEnable = true;
1317#else
1318         entry.IndependentAlphaBlendEnable = true;
1319#endif
1320      }
1321
1322      /* The Dual Source Blending documentation says:
1323       *
1324       * "If SRC1 is included in a src/dst blend factor and
1325       * a DualSource RT Write message is not used, results
1326       * are UNDEFINED. (This reflects the same restriction in DX APIs,
1327       * where undefined results are produced if “o1” is not written
1328       * by a PS – there are no default values defined)."
1329       *
1330       * There is no way to gracefully fix this undefined situation
1331       * so we just disable the blending to prevent possible issues.
1332       */
1333      if (!wm_prog_data->dual_src_blend &&
1334          (is_dual_src_blend_factor(a->srcColorBlendFactor) ||
1335           is_dual_src_blend_factor(a->dstColorBlendFactor) ||
1336           is_dual_src_blend_factor(a->srcAlphaBlendFactor) ||
1337           is_dual_src_blend_factor(a->dstAlphaBlendFactor))) {
1338         vk_logw(VK_LOG_OBJS(&device->vk.base),
1339                 "Enabled dual-src blend factors without writing both targets "
1340                 "in the shader.  Disabling blending to avoid GPU hangs.");
1341         entry.ColorBufferBlendEnable = false;
1342      }
1343
1344      if (a->colorWriteMask != 0)
1345         has_writeable_rt = true;
1346
1347      /* Our hardware applies the blend factor prior to the blend function
1348       * regardless of what function is used.  Technically, this means the
1349       * hardware can do MORE than GL or Vulkan specify.  However, it also
1350       * means that, for MIN and MAX, we have to stomp the blend factor to
1351       * ONE to make it a no-op.
1352       */
1353      if (a->colorBlendOp == VK_BLEND_OP_MIN ||
1354          a->colorBlendOp == VK_BLEND_OP_MAX) {
1355         entry.SourceBlendFactor = BLENDFACTOR_ONE;
1356         entry.DestinationBlendFactor = BLENDFACTOR_ONE;
1357      }
1358      if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
1359          a->alphaBlendOp == VK_BLEND_OP_MAX) {
1360         entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE;
1361         entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
1362      }
1363      GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);
1364      state_pos += GENX(BLEND_STATE_ENTRY_length);
1365#if GFX_VER >= 8
1366      if (i == 0)
1367         bs0 = entry;
1368#endif
1369   }
1370
1371#if GFX_VER >= 8
1372   struct GENX(3DSTATE_PS_BLEND) blend = {
1373      GENX(3DSTATE_PS_BLEND_header),
1374   };
1375   blend.AlphaToCoverageEnable         = blend_state.AlphaToCoverageEnable;
1376   blend.HasWriteableRT                = has_writeable_rt;
1377   blend.ColorBufferBlendEnable        = bs0.ColorBufferBlendEnable;
1378   blend.SourceAlphaBlendFactor        = bs0.SourceAlphaBlendFactor;
1379   blend.DestinationAlphaBlendFactor   = bs0.DestinationAlphaBlendFactor;
1380   blend.SourceBlendFactor             = bs0.SourceBlendFactor;
1381   blend.DestinationBlendFactor        = bs0.DestinationBlendFactor;
1382   blend.AlphaTestEnable               = false;
1383   blend.IndependentAlphaBlendEnable   = blend_state.IndependentAlphaBlendEnable;
1384
1385   if (dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
1386                        ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP)) {
1387      GENX(3DSTATE_PS_BLEND_pack)(NULL, pipeline->gfx8.ps_blend, &blend);
1388   } else {
1389      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_BLEND), _blend)
1390         _blend = blend;
1391   }
1392#else
1393   (void)has_writeable_rt;
1394#endif
1395
1396   GENX(BLEND_STATE_pack)(NULL, blend_state_start, &blend_state);
1397
1398   if (!(dynamic_states & (ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE |
1399                           ANV_CMD_DIRTY_DYNAMIC_LOGIC_OP))) {
1400      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
1401         bsp.BlendStatePointer      = pipeline->blend_state.offset;
1402#if GFX_VER >= 8
1403         bsp.BlendStatePointerValid = true;
1404#endif
1405      }
1406   }
1407}
1408
1409static void
1410emit_3dstate_clip(struct anv_graphics_pipeline *pipeline,
1411                  const VkPipelineInputAssemblyStateCreateInfo *ia_info,
1412                  const VkPipelineViewportStateCreateInfo *vp_info,
1413                  const VkPipelineRasterizationStateCreateInfo *rs_info,
1414                  const uint32_t dynamic_states)
1415{
1416   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
1417   (void) wm_prog_data;
1418
1419   struct GENX(3DSTATE_CLIP) clip = {
1420      GENX(3DSTATE_CLIP_header),
1421   };
1422
1423   clip.ClipEnable               = true;
1424   clip.StatisticsEnable         = true;
1425   clip.EarlyCullEnable          = true;
1426   clip.APIMode                  = APIMODE_D3D;
1427   clip.GuardbandClipTestEnable  = true;
1428
1429   /* Only enable the XY clip test when the final polygon rasterization
1430    * mode is VK_POLYGON_MODE_FILL.  We want to leave it disabled for
1431    * points and lines so we get "pop-free" clipping.
1432    */
1433   VkPolygonMode raster_mode =
1434      genX(raster_polygon_mode)(pipeline, ia_info->topology);
1435   clip.ViewportXYClipTestEnable =
1436      dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ?
1437         0 : (raster_mode == VK_POLYGON_MODE_FILL);
1438
1439#if GFX_VER >= 8
1440   clip.VertexSubPixelPrecisionSelect = _8Bit;
1441#endif
1442   clip.ClipMode = CLIPMODE_NORMAL;
1443
1444   switch (vk_provoking_vertex_mode(rs_info)) {
1445   case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
1446      clip.TriangleStripListProvokingVertexSelect = 0;
1447      clip.LineStripListProvokingVertexSelect = 0;
1448      clip.TriangleFanProvokingVertexSelect = 1;
1449      break;
1450
1451   case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
1452      clip.TriangleStripListProvokingVertexSelect = 2;
1453      clip.LineStripListProvokingVertexSelect = 1;
1454      clip.TriangleFanProvokingVertexSelect = 2;
1455      break;
1456
1457   default:
1458      unreachable("Invalid provoking vertex mode");
1459   }
1460
1461   clip.MinimumPointWidth = 0.125;
1462   clip.MaximumPointWidth = 255.875;
1463
1464   if (anv_pipeline_is_primitive(pipeline)) {
1465      const struct brw_vue_prog_data *last =
1466         anv_pipeline_get_last_vue_prog_data(pipeline);
1467
1468      /* From the Vulkan 1.0.45 spec:
1469       *
1470       *    "If the last active vertex processing stage shader entry point's
1471       *    interface does not include a variable decorated with
1472       *    ViewportIndex, then the first viewport is used."
1473       */
1474      if (vp_info && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
1475         clip.MaximumVPIndex = vp_info->viewportCount > 0 ?
1476            vp_info->viewportCount - 1 : 0;
1477      } else {
1478         clip.MaximumVPIndex = 0;
1479      }
1480
1481      /* From the Vulkan 1.0.45 spec:
1482       *
1483       *    "If the last active vertex processing stage shader entry point's
1484       *    interface does not include a variable decorated with Layer, then
1485       *    the first layer is used."
1486       */
1487      clip.ForceZeroRTAIndexEnable =
1488         !(last->vue_map.slots_valid & VARYING_BIT_LAYER);
1489
1490#if GFX_VER == 7
1491      clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask;
1492      clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask;
1493#endif
1494   }
1495
1496#if GFX_VER == 7
1497   clip.FrontWinding            = genX(vk_to_intel_front_face)[rs_info->frontFace];
1498   clip.CullMode                = genX(vk_to_intel_cullmode)[rs_info->cullMode];
1499   clip.ViewportZClipTestEnable = pipeline->depth_clip_enable;
1500#else
1501   clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
1502      (wm_prog_data->barycentric_interp_modes &
1503       BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0 : 0;
1504#endif
1505
1506   GENX(3DSTATE_CLIP_pack)(NULL, pipeline->gfx7.clip, &clip);
1507}
1508
1509static void
1510emit_3dstate_streamout(struct anv_graphics_pipeline *pipeline,
1511                       const VkPipelineRasterizationStateCreateInfo *rs_info,
1512                       const uint32_t dynamic_states)
1513{
1514   const struct brw_vue_prog_data *prog_data =
1515      anv_pipeline_get_last_vue_prog_data(pipeline);
1516   const struct brw_vue_map *vue_map = &prog_data->vue_map;
1517
1518   nir_xfb_info *xfb_info;
1519   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
1520      xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;
1521   else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
1522      xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
1523   else
1524      xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;
1525
1526#if GFX_VER == 7
1527#  define streamout_state_dw pipeline->gfx7.streamout_state
1528#else
1529#  define streamout_state_dw pipeline->gfx8.streamout_state
1530#endif
1531
1532   struct GENX(3DSTATE_STREAMOUT) so = {
1533      GENX(3DSTATE_STREAMOUT_header),
1534      .RenderingDisable =
1535         (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) ?
1536            0 : rs_info->rasterizerDiscardEnable,
1537   };
1538
1539   if (xfb_info) {
1540      so.SOFunctionEnable = true;
1541      so.SOStatisticsEnable = true;
1542
1543      switch (vk_provoking_vertex_mode(rs_info)) {
1544      case VK_PROVOKING_VERTEX_MODE_FIRST_VERTEX_EXT:
1545         so.ReorderMode = LEADING;
1546         break;
1547
1548      case VK_PROVOKING_VERTEX_MODE_LAST_VERTEX_EXT:
1549         so.ReorderMode = TRAILING;
1550         break;
1551
1552      default:
1553         unreachable("Invalid provoking vertex mode");
1554      }
1555
1556      const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
1557         vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
1558      so.RenderStreamSelect = stream_info ?
1559                              stream_info->rasterizationStream : 0;
1560
1561#if GFX_VER >= 8
1562      so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
1563      so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
1564      so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
1565      so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;
1566#else
1567      pipeline->gfx7.xfb_bo_pitch[0] = xfb_info->buffers[0].stride;
1568      pipeline->gfx7.xfb_bo_pitch[1] = xfb_info->buffers[1].stride;
1569      pipeline->gfx7.xfb_bo_pitch[2] = xfb_info->buffers[2].stride;
1570      pipeline->gfx7.xfb_bo_pitch[3] = xfb_info->buffers[3].stride;
1571
1572      /* On Gfx7, the SO buffer enables live in 3DSTATE_STREAMOUT which
1573       * is a bit inconvenient because we don't know what buffers will
1574       * actually be enabled until draw time.  We do our best here by
1575       * setting them based on buffers_written and we disable them
1576       * as-needed at draw time by setting EndAddress = BaseAddress.
1577       */
1578      so.SOBufferEnable0 = xfb_info->buffers_written & (1 << 0);
1579      so.SOBufferEnable1 = xfb_info->buffers_written & (1 << 1);
1580      so.SOBufferEnable2 = xfb_info->buffers_written & (1 << 2);
1581      so.SOBufferEnable3 = xfb_info->buffers_written & (1 << 3);
1582#endif
1583
1584      int urb_entry_read_offset = 0;
1585      int urb_entry_read_length =
1586         (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;
1587
1588      /* We always read the whole vertex.  This could be reduced at some
1589       * point by reading less and offsetting the register index in the
1590       * SO_DECLs.
1591       */
1592      so.Stream0VertexReadOffset = urb_entry_read_offset;
1593      so.Stream0VertexReadLength = urb_entry_read_length - 1;
1594      so.Stream1VertexReadOffset = urb_entry_read_offset;
1595      so.Stream1VertexReadLength = urb_entry_read_length - 1;
1596      so.Stream2VertexReadOffset = urb_entry_read_offset;
1597      so.Stream2VertexReadLength = urb_entry_read_length - 1;
1598      so.Stream3VertexReadOffset = urb_entry_read_offset;
1599      so.Stream3VertexReadLength = urb_entry_read_length - 1;
1600   }
1601
1602   if (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE) {
1603      GENX(3DSTATE_STREAMOUT_pack)(NULL, streamout_state_dw, &so);
1604   } else {
1605      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_STREAMOUT), _so)
1606         _so = so;
1607   }
1608
1609   if (xfb_info) {
1610      struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
1611      int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};
1612      int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};
1613
1614      memset(so_decl, 0, sizeof(so_decl));
1615
1616      for (unsigned i = 0; i < xfb_info->output_count; i++) {
1617         const nir_xfb_output_info *output = &xfb_info->outputs[i];
1618         unsigned buffer = output->buffer;
1619         unsigned stream = xfb_info->buffer_to_stream[buffer];
1620
1621         /* Our hardware is unusual in that it requires us to program SO_DECLs
1622          * for fake "hole" components, rather than simply taking the offset
1623          * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
1624          * program as many size = 4 holes as we can, then a final hole to
1625          * accommodate the final 1, 2, or 3 remaining.
1626          */
1627         int hole_dwords = (output->offset - next_offset[buffer]) / 4;
1628         while (hole_dwords > 0) {
1629            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1630               .HoleFlag = 1,
1631               .OutputBufferSlot = buffer,
1632               .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,
1633            };
1634            hole_dwords -= 4;
1635         }
1636
1637         int varying = output->location;
1638         uint8_t component_mask = output->component_mask;
1639         /* VARYING_SLOT_PSIZ contains four scalar fields packed together:
1640          * - VARYING_SLOT_PRIMITIVE_SHADING_RATE in VARYING_SLOT_PSIZ.x
1641          * - VARYING_SLOT_LAYER                  in VARYING_SLOT_PSIZ.y
1642          * - VARYING_SLOT_VIEWPORT               in VARYING_SLOT_PSIZ.z
1643          * - VARYING_SLOT_PSIZ                   in VARYING_SLOT_PSIZ.w
1644          */
1645         if (varying == VARYING_SLOT_PRIMITIVE_SHADING_RATE) {
1646            varying = VARYING_SLOT_PSIZ;
1647            component_mask = 1 << 0; // SO_DECL_COMPMASK_X
1648         } else if (varying == VARYING_SLOT_LAYER) {
1649            varying = VARYING_SLOT_PSIZ;
1650            component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
1651         } else if (varying == VARYING_SLOT_VIEWPORT) {
1652            varying = VARYING_SLOT_PSIZ;
1653            component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
1654         } else if (varying == VARYING_SLOT_PSIZ) {
1655            component_mask = 1 << 3; // SO_DECL_COMPMASK_W
1656         }
1657
1658         next_offset[buffer] = output->offset +
1659                               __builtin_popcount(component_mask) * 4;
1660
1661         const int slot = vue_map->varying_to_slot[varying];
1662         if (slot < 0) {
1663            /* This can happen if the shader never writes to the varying.
1664             * Insert a hole instead of actual varying data.
1665             */
1666            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1667               .HoleFlag = true,
1668               .OutputBufferSlot = buffer,
1669               .ComponentMask = component_mask,
1670            };
1671         } else {
1672            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
1673               .OutputBufferSlot = buffer,
1674               .RegisterIndex = slot,
1675               .ComponentMask = component_mask,
1676            };
1677         }
1678      }
1679
1680      int max_decls = 0;
1681      for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)
1682         max_decls = MAX2(max_decls, decls[s]);
1683
1684      uint8_t sbs[MAX_XFB_STREAMS] = { };
1685      for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {
1686         if (xfb_info->buffers_written & (1 << b))
1687            sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
1688      }
1689
1690      uint32_t *dw = anv_batch_emitn(&pipeline->base.batch, 3 + 2 * max_decls,
1691                                     GENX(3DSTATE_SO_DECL_LIST),
1692                                     .StreamtoBufferSelects0 = sbs[0],
1693                                     .StreamtoBufferSelects1 = sbs[1],
1694                                     .StreamtoBufferSelects2 = sbs[2],
1695                                     .StreamtoBufferSelects3 = sbs[3],
1696                                     .NumEntries0 = decls[0],
1697                                     .NumEntries1 = decls[1],
1698                                     .NumEntries2 = decls[2],
1699                                     .NumEntries3 = decls[3]);
1700
1701      for (int i = 0; i < max_decls; i++) {
1702         GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
1703            &(struct GENX(SO_DECL_ENTRY)) {
1704               .Stream0Decl = so_decl[0][i],
1705               .Stream1Decl = so_decl[1][i],
1706               .Stream2Decl = so_decl[2][i],
1707               .Stream3Decl = so_decl[3][i],
1708            });
1709      }
1710   }
1711}
1712
1713static uint32_t
1714get_sampler_count(const struct anv_shader_bin *bin)
1715{
1716   uint32_t count_by_4 = DIV_ROUND_UP(bin->bind_map.sampler_count, 4);
1717
1718   /* We can potentially have way more than 32 samplers and that's ok.
1719    * However, the 3DSTATE_XS packets only have 3 bits to specify how
1720    * many to pre-fetch and all values above 4 are marked reserved.
1721    */
1722   return MIN2(count_by_4, 4);
1723}
1724
1725static UNUSED struct anv_address
1726get_scratch_address(struct anv_pipeline *pipeline,
1727                    gl_shader_stage stage,
1728                    const struct anv_shader_bin *bin)
1729{
1730   return (struct anv_address) {
1731      .bo = anv_scratch_pool_alloc(pipeline->device,
1732                                   &pipeline->device->scratch_pool,
1733                                   stage, bin->prog_data->total_scratch),
1734      .offset = 0,
1735   };
1736}
1737
1738static UNUSED uint32_t
1739get_scratch_space(const struct anv_shader_bin *bin)
1740{
1741   return ffs(bin->prog_data->total_scratch / 2048);
1742}
1743
1744static UNUSED uint32_t
1745get_scratch_surf(struct anv_pipeline *pipeline,
1746                 gl_shader_stage stage,
1747                 const struct anv_shader_bin *bin)
1748{
1749   if (bin->prog_data->total_scratch == 0)
1750      return 0;
1751
1752   struct anv_bo *bo =
1753      anv_scratch_pool_alloc(pipeline->device,
1754                             &pipeline->device->scratch_pool,
1755                             stage, bin->prog_data->total_scratch);
1756   anv_reloc_list_add_bo(pipeline->batch.relocs,
1757                         pipeline->batch.alloc, bo);
1758   return anv_scratch_pool_get_surf(pipeline->device,
1759                                    &pipeline->device->scratch_pool,
1760                                    bin->prog_data->total_scratch) >> 4;
1761}
1762
1763static void
1764emit_3dstate_vs(struct anv_graphics_pipeline *pipeline)
1765{
1766   const struct intel_device_info *devinfo = &pipeline->base.device->info;
1767   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
1768   const struct anv_shader_bin *vs_bin =
1769      pipeline->shaders[MESA_SHADER_VERTEX];
1770
1771   assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));
1772
1773   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VS), vs) {
1774      vs.Enable               = true;
1775      vs.StatisticsEnable     = true;
1776      vs.KernelStartPointer   = vs_bin->kernel.offset;
1777#if GFX_VER >= 8
1778      vs.SIMD8DispatchEnable  =
1779         vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
1780#endif
1781
1782      assert(!vs_prog_data->base.base.use_alt_mode);
1783#if GFX_VER < 11
1784      vs.SingleVertexDispatch       = false;
1785#endif
1786      vs.VectorMaskEnable           = false;
1787      /* Wa_1606682166:
1788       * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
1789       * Disable the Sampler state prefetch functionality in the SARB by
1790       * programming 0xB000[30] to '1'.
1791       */
1792      vs.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(vs_bin);
1793      vs.BindingTableEntryCount     = vs_bin->bind_map.surface_count;
1794      vs.FloatingPointMode          = IEEE754;
1795      vs.IllegalOpcodeExceptionEnable = false;
1796      vs.SoftwareExceptionEnable    = false;
1797      vs.MaximumNumberofThreads     = devinfo->max_vs_threads - 1;
1798
1799      if (GFX_VER == 9 && devinfo->gt == 4 &&
1800          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1801         /* On Sky Lake GT4, we have experienced some hangs related to the VS
1802          * cache and tessellation.  It is unknown exactly what is happening
1803          * but the Haswell docs for the "VS Reference Count Full Force Miss
1804          * Enable" field of the "Thread Mode" register refer to a HSW bug in
1805          * which the VUE handle reference count would overflow resulting in
1806          * internal reference counting bugs.  My (Jason's) best guess is that
1807          * this bug cropped back up on SKL GT4 when we suddenly had more
1808          * threads in play than any previous gfx9 hardware.
1809          *
1810          * What we do know for sure is that setting this bit when
1811          * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
1812          * City when playing with DXVK (https://bugs.freedesktop.org/107280).
1813          * Disabling the vertex cache with tessellation shaders should only
1814          * have a minor performance impact as the tessellation shaders are
1815          * likely generating and processing far more geometry than the vertex
1816          * stage.
1817          */
1818         vs.VertexCacheDisable = true;
1819      }
1820
1821      vs.VertexURBEntryReadLength      = vs_prog_data->base.urb_read_length;
1822      vs.VertexURBEntryReadOffset      = 0;
1823      vs.DispatchGRFStartRegisterForURBData =
1824         vs_prog_data->base.base.dispatch_grf_start_reg;
1825
1826#if GFX_VER >= 8
1827      vs.UserClipDistanceClipTestEnableBitmask =
1828         vs_prog_data->base.clip_distance_mask;
1829      vs.UserClipDistanceCullTestEnableBitmask =
1830         vs_prog_data->base.cull_distance_mask;
1831#endif
1832
1833#if GFX_VERx10 >= 125
1834      vs.ScratchSpaceBuffer =
1835         get_scratch_surf(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
1836#else
1837      vs.PerThreadScratchSpace   = get_scratch_space(vs_bin);
1838      vs.ScratchSpaceBasePointer =
1839         get_scratch_address(&pipeline->base, MESA_SHADER_VERTEX, vs_bin);
1840#endif
1841   }
1842}
1843
1844static void
1845emit_3dstate_hs_te_ds(struct anv_graphics_pipeline *pipeline,
1846                      const VkPipelineTessellationStateCreateInfo *tess_info)
1847{
1848   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
1849      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs);
1850      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te);
1851      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds);
1852      return;
1853   }
1854
1855   const struct intel_device_info *devinfo = &pipeline->base.device->info;
1856   const struct anv_shader_bin *tcs_bin =
1857      pipeline->shaders[MESA_SHADER_TESS_CTRL];
1858   const struct anv_shader_bin *tes_bin =
1859      pipeline->shaders[MESA_SHADER_TESS_EVAL];
1860
1861   const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
1862   const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);
1863
1864   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_HS), hs) {
1865      hs.Enable = true;
1866      hs.StatisticsEnable = true;
1867      hs.KernelStartPointer = tcs_bin->kernel.offset;
1868      /* Wa_1606682166 */
1869      hs.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tcs_bin);
1870      hs.BindingTableEntryCount = tcs_bin->bind_map.surface_count;
1871
1872#if GFX_VER >= 12
1873      /* Wa_1604578095:
1874       *
1875       *    Hang occurs when the number of max threads is less than 2 times
1876       *    the number of instance count. The number of max threads must be
1877       *    more than 2 times the number of instance count.
1878       */
1879      assert((devinfo->max_tcs_threads / 2) > tcs_prog_data->instances);
1880#endif
1881
1882      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
1883      hs.IncludeVertexHandles = true;
1884      hs.InstanceCount = tcs_prog_data->instances - 1;
1885
1886      hs.VertexURBEntryReadLength = 0;
1887      hs.VertexURBEntryReadOffset = 0;
1888      hs.DispatchGRFStartRegisterForURBData =
1889         tcs_prog_data->base.base.dispatch_grf_start_reg & 0x1f;
1890#if GFX_VER >= 12
1891      hs.DispatchGRFStartRegisterForURBData5 =
1892         tcs_prog_data->base.base.dispatch_grf_start_reg >> 5;
1893#endif
1894
1895#if GFX_VERx10 >= 125
1896      hs.ScratchSpaceBuffer =
1897         get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
1898#else
1899      hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
1900      hs.ScratchSpaceBasePointer =
1901         get_scratch_address(&pipeline->base, MESA_SHADER_TESS_CTRL, tcs_bin);
1902#endif
1903
1904#if GFX_VER == 12
1905      /*  Patch Count threshold specifies the maximum number of patches that
1906       *  will be accumulated before a thread dispatch is forced.
1907       */
1908      hs.PatchCountThreshold = tcs_prog_data->patch_count_threshold;
1909#endif
1910
1911#if GFX_VER >= 9
1912      hs.DispatchMode = tcs_prog_data->base.dispatch_mode;
1913      hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
1914#endif
1915   }
1916
1917   const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
1918      tess_info ? vk_find_struct_const(tess_info, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO) : NULL;
1919
1920   VkTessellationDomainOrigin uv_origin =
1921      domain_origin_state ? domain_origin_state->domainOrigin :
1922                            VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;
1923
1924   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_TE), te) {
1925      te.Partitioning = tes_prog_data->partitioning;
1926
1927      if (uv_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
1928         te.OutputTopology = tes_prog_data->output_topology;
1929      } else {
1930         /* When the origin is upper-left, we have to flip the winding order */
1931         if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
1932            te.OutputTopology = OUTPUT_TRI_CW;
1933         } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
1934            te.OutputTopology = OUTPUT_TRI_CCW;
1935         } else {
1936            te.OutputTopology = tes_prog_data->output_topology;
1937         }
1938      }
1939
1940      te.TEDomain = tes_prog_data->domain;
1941      te.TEEnable = true;
1942      te.MaximumTessellationFactorOdd = 63.0;
1943      te.MaximumTessellationFactorNotOdd = 64.0;
1944   }
1945
1946   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_DS), ds) {
1947      ds.Enable = true;
1948      ds.StatisticsEnable = true;
1949      ds.KernelStartPointer = tes_bin->kernel.offset;
1950      /* Wa_1606682166 */
1951      ds.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(tes_bin);
1952      ds.BindingTableEntryCount = tes_bin->bind_map.surface_count;
1953      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
1954
1955      ds.ComputeWCoordinateEnable =
1956         tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
1957
1958      ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
1959      ds.PatchURBEntryReadOffset = 0;
1960      ds.DispatchGRFStartRegisterForURBData =
1961         tes_prog_data->base.base.dispatch_grf_start_reg;
1962
1963#if GFX_VER >= 8
1964#if GFX_VER < 11
1965      ds.DispatchMode =
1966         tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
1967            DISPATCH_MODE_SIMD8_SINGLE_PATCH :
1968            DISPATCH_MODE_SIMD4X2;
1969#else
1970      assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
1971      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
1972#endif
1973
1974      ds.UserClipDistanceClipTestEnableBitmask =
1975         tes_prog_data->base.clip_distance_mask;
1976      ds.UserClipDistanceCullTestEnableBitmask =
1977         tes_prog_data->base.cull_distance_mask;
1978#endif
1979
1980#if GFX_VERx10 >= 125
1981      ds.ScratchSpaceBuffer =
1982         get_scratch_surf(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
1983#else
1984      ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
1985      ds.ScratchSpaceBasePointer =
1986         get_scratch_address(&pipeline->base, MESA_SHADER_TESS_EVAL, tes_bin);
1987#endif
1988   }
1989}
1990
1991static void
1992emit_3dstate_gs(struct anv_graphics_pipeline *pipeline)
1993{
1994   const struct intel_device_info *devinfo = &pipeline->base.device->info;
1995   const struct anv_shader_bin *gs_bin =
1996      pipeline->shaders[MESA_SHADER_GEOMETRY];
1997
1998   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
1999      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs);
2000      return;
2001   }
2002
2003   const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);
2004
2005   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_GS), gs) {
2006      gs.Enable                  = true;
2007      gs.StatisticsEnable        = true;
2008      gs.KernelStartPointer      = gs_bin->kernel.offset;
2009      gs.DispatchMode            = gs_prog_data->base.dispatch_mode;
2010
2011      gs.SingleProgramFlow       = false;
2012      gs.VectorMaskEnable        = false;
2013      /* Wa_1606682166 */
2014      gs.SamplerCount            = GFX_VER == 11 ? 0 : get_sampler_count(gs_bin);
2015      gs.BindingTableEntryCount  = gs_bin->bind_map.surface_count;
2016      gs.IncludeVertexHandles    = gs_prog_data->base.include_vue_handles;
2017      gs.IncludePrimitiveID      = gs_prog_data->include_primitive_id;
2018
2019      if (GFX_VER == 8) {
2020         /* Broadwell is weird.  It needs us to divide by 2. */
2021         gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1;
2022      } else {
2023         gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
2024      }
2025
2026      gs.OutputVertexSize        = gs_prog_data->output_vertex_size_hwords * 2 - 1;
2027      gs.OutputTopology          = gs_prog_data->output_topology;
2028      gs.ControlDataFormat       = gs_prog_data->control_data_format;
2029      gs.ControlDataHeaderSize   = gs_prog_data->control_data_header_size_hwords;
2030      gs.InstanceControl         = MAX2(gs_prog_data->invocations, 1) - 1;
2031      gs.ReorderMode             = TRAILING;
2032
2033#if GFX_VER >= 8
2034      gs.ExpectedVertexCount     = gs_prog_data->vertices_in;
2035      gs.StaticOutput            = gs_prog_data->static_vertex_count >= 0;
2036      gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?
2037                                   gs_prog_data->static_vertex_count : 0;
2038#endif
2039
2040      gs.VertexURBEntryReadOffset = 0;
2041      gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
2042      gs.DispatchGRFStartRegisterForURBData =
2043         gs_prog_data->base.base.dispatch_grf_start_reg;
2044
2045#if GFX_VER >= 8
2046      gs.UserClipDistanceClipTestEnableBitmask =
2047         gs_prog_data->base.clip_distance_mask;
2048      gs.UserClipDistanceCullTestEnableBitmask =
2049         gs_prog_data->base.cull_distance_mask;
2050#endif
2051
2052#if GFX_VERx10 >= 125
2053      gs.ScratchSpaceBuffer =
2054         get_scratch_surf(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);
2055#else
2056      gs.PerThreadScratchSpace   = get_scratch_space(gs_bin);
2057      gs.ScratchSpaceBasePointer =
2058         get_scratch_address(&pipeline->base, MESA_SHADER_GEOMETRY, gs_bin);
2059#endif
2060   }
2061}
2062
2063static bool
2064has_color_buffer_write_enabled(const struct anv_graphics_pipeline *pipeline,
2065                               const VkPipelineColorBlendStateCreateInfo *blend)
2066{
2067   const struct anv_shader_bin *shader_bin =
2068      pipeline->shaders[MESA_SHADER_FRAGMENT];
2069   if (!shader_bin)
2070      return false;
2071
2072   if (!pipeline->dynamic_state.color_writes)
2073      return false;
2074
2075   const struct anv_pipeline_bind_map *bind_map = &shader_bin->bind_map;
2076   for (int i = 0; i < bind_map->surface_count; i++) {
2077      struct anv_pipeline_binding *binding = &bind_map->surface_to_descriptor[i];
2078
2079      if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
2080         continue;
2081
2082      if (binding->index == UINT32_MAX)
2083         continue;
2084
2085      if (blend && blend->pAttachments[binding->index].colorWriteMask != 0)
2086         return true;
2087   }
2088
2089   return false;
2090}
2091
2092static void
2093emit_3dstate_wm(struct anv_graphics_pipeline *pipeline, struct anv_subpass *subpass,
2094                const VkPipelineInputAssemblyStateCreateInfo *ia,
2095                const VkPipelineRasterizationStateCreateInfo *raster,
2096                const VkPipelineColorBlendStateCreateInfo *blend,
2097                const VkPipelineMultisampleStateCreateInfo *multisample,
2098                const VkPipelineRasterizationLineStateCreateInfoEXT *line,
2099                const uint32_t dynamic_states)
2100{
2101   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2102
2103   struct GENX(3DSTATE_WM) wm = {
2104      GENX(3DSTATE_WM_header),
2105   };
2106   wm.StatisticsEnable                    = true;
2107   wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
2108   wm.LineAntialiasingRegionWidth         = _10pixels;
2109   wm.PointRasterizationRule              = RASTRULE_UPPER_RIGHT;
2110
2111   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
2112      if (wm_prog_data->early_fragment_tests) {
2113            wm.EarlyDepthStencilControl         = EDSC_PREPS;
2114      } else if (wm_prog_data->has_side_effects) {
2115         wm.EarlyDepthStencilControl         = EDSC_PSEXEC;
2116      } else {
2117         wm.EarlyDepthStencilControl         = EDSC_NORMAL;
2118      }
2119
2120#if GFX_VER >= 8
2121      /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
2122       * doesn't take into account KillPixels when no depth or stencil
2123       * writes are enabled.  In order for occlusion queries to work
2124       * correctly with no attachments, we need to force-enable PS thread
2125       * dispatch.
2126       *
2127       * The BDW docs are pretty clear that that this bit isn't validated
2128       * and probably shouldn't be used in production:
2129       *
2130       *    "This must always be set to Normal. This field should not be
2131       *    tested for functional validation."
2132       *
2133       * Unfortunately, however, the other mechanism we have for doing this
2134       * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
2135       * Given two bad options, we choose the one which works.
2136       */
2137      pipeline->force_fragment_thread_dispatch =
2138         wm_prog_data->has_side_effects ||
2139         wm_prog_data->uses_kill;
2140
2141      if (pipeline->force_fragment_thread_dispatch ||
2142          !has_color_buffer_write_enabled(pipeline, blend)) {
2143         /* Only set this value in non dynamic mode. */
2144         wm.ForceThreadDispatchEnable =
2145            !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE) ? ForceON : 0;
2146      }
2147#endif
2148
2149      wm.BarycentricInterpolationMode =
2150         wm_prog_data->barycentric_interp_modes;
2151
2152#if GFX_VER < 8
2153      wm.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
2154      wm.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
2155      wm.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;
2156      wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
2157
2158      /* If the subpass has a depth or stencil self-dependency, then we
2159       * need to force the hardware to do the depth/stencil write *after*
2160       * fragment shader execution.  Otherwise, the writes may hit memory
2161       * before we get around to fetching from the input attachment and we
2162       * may get the depth or stencil value from the current draw rather
2163       * than the previous one.
2164       */
2165      wm.PixelShaderKillsPixel         = subpass->has_ds_self_dep ||
2166                                         wm_prog_data->uses_kill;
2167
2168      pipeline->force_fragment_thread_dispatch =
2169         wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF ||
2170         wm_prog_data->has_side_effects ||
2171         wm.PixelShaderKillsPixel;
2172
2173      if (pipeline->force_fragment_thread_dispatch ||
2174          has_color_buffer_write_enabled(pipeline, blend)) {
2175         /* Only set this value in non dynamic mode. */
2176         wm.ThreadDispatchEnable = !(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE);
2177      }
2178
2179      if (multisample && multisample->rasterizationSamples > 1) {
2180         if (wm_prog_data->persample_dispatch) {
2181            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
2182         } else {
2183            wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
2184         }
2185      } else {
2186         wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
2187      }
2188
2189      VkPolygonMode raster_mode =
2190         genX(raster_polygon_mode)(pipeline, ia->topology);
2191
2192      wm.MultisampleRasterizationMode =
2193         dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY ? 0 :
2194         genX(ms_rasterization_mode)(pipeline, raster_mode);
2195#endif
2196
2197      wm.LineStippleEnable = line && line->stippledLineEnable;
2198   }
2199
2200   uint32_t dynamic_wm_states = ANV_CMD_DIRTY_DYNAMIC_COLOR_BLEND_STATE;
2201
2202#if GFX_VER < 8
2203   dynamic_wm_states |= ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY;
2204#endif
2205
2206   if (dynamic_states & dynamic_wm_states) {
2207      const struct intel_device_info *devinfo = &pipeline->base.device->info;
2208      uint32_t *dws = devinfo->ver >= 8 ? pipeline->gfx8.wm : pipeline->gfx7.wm;
2209      GENX(3DSTATE_WM_pack)(NULL, dws, &wm);
2210   } else {
2211      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_WM), _wm)
2212         _wm = wm;
2213   }
2214}
2215
2216static void
2217emit_3dstate_ps(struct anv_graphics_pipeline *pipeline,
2218                const VkPipelineColorBlendStateCreateInfo *blend,
2219                const VkPipelineMultisampleStateCreateInfo *multisample)
2220{
2221   UNUSED const struct intel_device_info *devinfo =
2222      &pipeline->base.device->info;
2223   const struct anv_shader_bin *fs_bin =
2224      pipeline->shaders[MESA_SHADER_FRAGMENT];
2225
2226   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
2227      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
2228#if GFX_VER == 7
2229         /* Even if no fragments are ever dispatched, gfx7 hardware hangs if
2230          * we don't at least set the maximum number of threads.
2231          */
2232         ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
2233#endif
2234      }
2235      return;
2236   }
2237
2238   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2239
2240#if GFX_VER < 8
2241   /* The hardware wedges if you have this bit set but don't turn on any dual
2242    * source blend factors.
2243    */
2244   bool dual_src_blend = false;
2245   if (wm_prog_data->dual_src_blend && blend) {
2246      for (uint32_t i = 0; i < blend->attachmentCount; i++) {
2247         const VkPipelineColorBlendAttachmentState *bstate =
2248            &blend->pAttachments[i];
2249
2250         if (bstate->blendEnable &&
2251             (is_dual_src_blend_factor(bstate->srcColorBlendFactor) ||
2252              is_dual_src_blend_factor(bstate->dstColorBlendFactor) ||
2253              is_dual_src_blend_factor(bstate->srcAlphaBlendFactor) ||
2254              is_dual_src_blend_factor(bstate->dstAlphaBlendFactor))) {
2255            dual_src_blend = true;
2256            break;
2257         }
2258      }
2259   }
2260#endif
2261
2262   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS), ps) {
2263      ps._8PixelDispatchEnable      = wm_prog_data->dispatch_8;
2264      ps._16PixelDispatchEnable     = wm_prog_data->dispatch_16;
2265      ps._32PixelDispatchEnable     = wm_prog_data->dispatch_32;
2266
2267      /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
2268       *
2269       *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
2270       *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
2271       *
2272       * Since 16x MSAA is first introduced on SKL, we don't need to apply
2273       * the workaround on any older hardware.
2274       */
2275      if (GFX_VER >= 9 && !wm_prog_data->persample_dispatch &&
2276          multisample && multisample->rasterizationSamples == 16) {
2277         assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
2278         ps._32PixelDispatchEnable = false;
2279      }
2280
2281      ps.KernelStartPointer0 = fs_bin->kernel.offset +
2282                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0);
2283      ps.KernelStartPointer1 = fs_bin->kernel.offset +
2284                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1);
2285      ps.KernelStartPointer2 = fs_bin->kernel.offset +
2286                               brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2);
2287
2288      ps.SingleProgramFlow          = false;
2289      ps.VectorMaskEnable           = GFX_VER >= 8;
2290      /* Wa_1606682166 */
2291      ps.SamplerCount               = GFX_VER == 11 ? 0 : get_sampler_count(fs_bin);
2292      ps.BindingTableEntryCount     = fs_bin->bind_map.surface_count;
2293      ps.PushConstantEnable         = wm_prog_data->base.nr_params > 0 ||
2294                                      wm_prog_data->base.ubo_ranges[0].length;
2295      ps.PositionXYOffsetSelect     = wm_prog_data->uses_pos_offset ?
2296                                      POSOFFSET_SAMPLE: POSOFFSET_NONE;
2297#if GFX_VER < 8
2298      ps.AttributeEnable            = wm_prog_data->num_varying_inputs > 0;
2299      ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
2300      ps.DualSourceBlendEnable      = dual_src_blend;
2301#endif
2302
2303#if GFX_VERx10 == 75
2304      /* Haswell requires the sample mask to be set in this packet as well
2305       * as in 3DSTATE_SAMPLE_MASK; the values should match.
2306       */
2307      ps.SampleMask                 = 0xff;
2308#endif
2309
2310#if GFX_VER >= 9
2311      ps.MaximumNumberofThreadsPerPSD  = 64 - 1;
2312#elif GFX_VER >= 8
2313      ps.MaximumNumberofThreadsPerPSD  = 64 - 2;
2314#else
2315      ps.MaximumNumberofThreads        = devinfo->max_wm_threads - 1;
2316#endif
2317
2318      ps.DispatchGRFStartRegisterForConstantSetupData0 =
2319         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
2320      ps.DispatchGRFStartRegisterForConstantSetupData1 =
2321         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
2322      ps.DispatchGRFStartRegisterForConstantSetupData2 =
2323         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);
2324
2325#if GFX_VERx10 >= 125
2326      ps.ScratchSpaceBuffer =
2327         get_scratch_surf(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
2328#else
2329      ps.PerThreadScratchSpace   = get_scratch_space(fs_bin);
2330      ps.ScratchSpaceBasePointer =
2331         get_scratch_address(&pipeline->base, MESA_SHADER_FRAGMENT, fs_bin);
2332#endif
2333   }
2334}
2335
2336#if GFX_VER >= 8
2337static void
2338emit_3dstate_ps_extra(struct anv_graphics_pipeline *pipeline,
2339                      struct anv_subpass *subpass,
2340                      const VkPipelineRasterizationStateCreateInfo *rs_info)
2341{
2342   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2343
2344   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
2345      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps);
2346      return;
2347   }
2348
2349   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PS_EXTRA), ps) {
2350      ps.PixelShaderValid              = true;
2351      ps.AttributeEnable               = wm_prog_data->num_varying_inputs > 0;
2352      ps.oMaskPresenttoRenderTarget    = wm_prog_data->uses_omask;
2353      ps.PixelShaderIsPerSample        = wm_prog_data->persample_dispatch;
2354      ps.PixelShaderComputedDepthMode  = wm_prog_data->computed_depth_mode;
2355      ps.PixelShaderUsesSourceDepth    = wm_prog_data->uses_src_depth;
2356      ps.PixelShaderUsesSourceW        = wm_prog_data->uses_src_w;
2357
2358      /* If the subpass has a depth or stencil self-dependency, then we need
2359       * to force the hardware to do the depth/stencil write *after* fragment
2360       * shader execution.  Otherwise, the writes may hit memory before we get
2361       * around to fetching from the input attachment and we may get the depth
2362       * or stencil value from the current draw rather than the previous one.
2363       */
2364      ps.PixelShaderKillsPixel         = subpass->has_ds_self_dep ||
2365                                         wm_prog_data->uses_kill;
2366
2367#if GFX_VER >= 9
2368      ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
2369      ps.PixelShaderPullsBary    = wm_prog_data->pulls_bary;
2370
2371      ps.InputCoverageMaskState = ICMS_NONE;
2372      assert(!wm_prog_data->inner_coverage); /* Not available in SPIR-V */
2373      if (!wm_prog_data->uses_sample_mask)
2374         ps.InputCoverageMaskState = ICMS_NONE;
2375      else if (wm_prog_data->per_coarse_pixel_dispatch)
2376         ps.InputCoverageMaskState  = ICMS_NORMAL;
2377      else if (wm_prog_data->post_depth_coverage)
2378         ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
2379      else
2380         ps.InputCoverageMaskState = ICMS_NORMAL;
2381#else
2382      ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
2383#endif
2384
2385#if GFX_VER >= 11
2386      ps.PixelShaderRequiresSourceDepthandorWPlaneCoefficients =
2387         wm_prog_data->uses_depth_w_coefficients;
2388      ps.PixelShaderIsPerCoarsePixel = wm_prog_data->per_coarse_pixel_dispatch;
2389#endif
2390   }
2391}
2392
2393static void
2394emit_3dstate_vf_topology(struct anv_graphics_pipeline *pipeline)
2395{
2396   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
2397      vft.PrimitiveTopologyType = pipeline->topology;
2398   }
2399}
2400#endif
2401
2402static void
2403emit_3dstate_vf_statistics(struct anv_graphics_pipeline *pipeline)
2404{
2405   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
2406      vfs.StatisticsEnable = true;
2407   }
2408}
2409
2410static void
2411compute_kill_pixel(struct anv_graphics_pipeline *pipeline,
2412                   const VkPipelineMultisampleStateCreateInfo *ms_info,
2413                   const struct anv_subpass *subpass)
2414{
2415   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
2416      pipeline->kill_pixel = false;
2417      return;
2418   }
2419
2420   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
2421
2422   /* This computes the KillPixel portion of the computation for whether or
2423    * not we want to enable the PMA fix on gfx8 or gfx9.  It's given by this
2424    * chunk of the giant formula:
2425    *
2426    *    (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
2427    *     3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
2428    *     3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
2429    *     3DSTATE_PS_BLEND::AlphaTestEnable ||
2430    *     3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
2431    *
2432    * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
2433    * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
2434    * of an alpha test.
2435    */
2436   pipeline->kill_pixel =
2437      subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
2438      wm_prog_data->uses_omask ||
2439      (ms_info && ms_info->alphaToCoverageEnable);
2440}
2441
2442#if GFX_VER == 12
2443static void
2444emit_3dstate_primitive_replication(struct anv_graphics_pipeline *pipeline)
2445{
2446   if (!pipeline->use_primitive_replication) {
2447      anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr);
2448      return;
2449   }
2450
2451   uint32_t view_mask = pipeline->subpass->view_mask;
2452   int view_count = util_bitcount(view_mask);
2453   assert(view_count > 1 && view_count <= MAX_VIEWS_FOR_PRIMITIVE_REPLICATION);
2454
2455   anv_batch_emit(&pipeline->base.batch, GENX(3DSTATE_PRIMITIVE_REPLICATION), pr) {
2456      pr.ReplicaMask = (1 << view_count) - 1;
2457      pr.ReplicationCount = view_count - 1;
2458
2459      int i = 0;
2460      u_foreach_bit(view_index, view_mask) {
2461         pr.RTAIOffset[i] = view_index;
2462         i++;
2463      }
2464   }
2465}
2466#endif
2467
2468static VkResult
2469genX(graphics_pipeline_create)(
2470    VkDevice                                    _device,
2471    struct anv_pipeline_cache *                 cache,
2472    const VkGraphicsPipelineCreateInfo*         pCreateInfo,
2473    const VkAllocationCallbacks*                pAllocator,
2474    VkPipeline*                                 pPipeline)
2475{
2476   ANV_FROM_HANDLE(anv_device, device, _device);
2477   ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
2478   struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
2479   struct anv_graphics_pipeline *pipeline;
2480   VkResult result;
2481
2482   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);
2483
2484   /* Use the default pipeline cache if none is specified */
2485   if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
2486      cache = &device->default_pipeline_cache;
2487
2488   pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
2489                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2490   if (pipeline == NULL)
2491      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2492
2493   result = anv_graphics_pipeline_init(pipeline, device, cache,
2494                                       pCreateInfo, pAllocator);
2495   if (result != VK_SUCCESS) {
2496      vk_free2(&device->vk.alloc, pAllocator, pipeline);
2497      if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
2498         *pPipeline = VK_NULL_HANDLE;
2499      return result;
2500   }
2501
2502   /* Information on which states are considered dynamic. */
2503   const VkPipelineDynamicStateCreateInfo *dyn_info =
2504      pCreateInfo->pDynamicState;
2505   uint32_t dynamic_states = 0;
2506   if (dyn_info) {
2507      for (unsigned i = 0; i < dyn_info->dynamicStateCount; i++)
2508         dynamic_states |=
2509            anv_cmd_dirty_bit_for_vk_dynamic_state(dyn_info->pDynamicStates[i]);
2510   }
2511
2512
2513   /* If rasterization is not enabled, various CreateInfo structs must be
2514    * ignored.
2515    */
2516   const bool raster_enabled =
2517      !pCreateInfo->pRasterizationState->rasterizerDiscardEnable ||
2518      (dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE);
2519
2520   const VkPipelineViewportStateCreateInfo *vp_info =
2521      raster_enabled ? pCreateInfo->pViewportState : NULL;
2522
2523   const VkPipelineMultisampleStateCreateInfo *ms_info =
2524      raster_enabled ? pCreateInfo->pMultisampleState : NULL;
2525
2526   const VkPipelineDepthStencilStateCreateInfo *ds_info =
2527      raster_enabled ? pCreateInfo->pDepthStencilState : NULL;
2528
2529   const VkPipelineColorBlendStateCreateInfo *cb_info =
2530      raster_enabled ? pCreateInfo->pColorBlendState : NULL;
2531
2532   const VkPipelineRasterizationLineStateCreateInfoEXT *line_info =
2533      vk_find_struct_const(pCreateInfo->pRasterizationState->pNext,
2534                           PIPELINE_RASTERIZATION_LINE_STATE_CREATE_INFO_EXT);
2535
2536   enum intel_urb_deref_block_size urb_deref_block_size;
2537   emit_urb_setup(pipeline, &urb_deref_block_size);
2538
2539   assert(pCreateInfo->pRasterizationState);
2540   emit_rs_state(pipeline, pCreateInfo->pInputAssemblyState,
2541                           pCreateInfo->pRasterizationState,
2542                           ms_info, line_info, dynamic_states, pass, subpass,
2543                           urb_deref_block_size);
2544   emit_ms_state(pipeline, ms_info, dynamic_states);
2545   emit_ds_state(pipeline, ds_info, dynamic_states, pass, subpass);
2546   emit_cb_state(pipeline, cb_info, ms_info, dynamic_states);
2547   compute_kill_pixel(pipeline, ms_info, subpass);
2548
2549   emit_3dstate_clip(pipeline,
2550                     pCreateInfo->pInputAssemblyState,
2551                     vp_info,
2552                     pCreateInfo->pRasterizationState,
2553                     dynamic_states);
2554
2555#if GFX_VER == 12
2556   emit_3dstate_primitive_replication(pipeline);
2557#endif
2558
2559#if 0
2560   /* From gfx7_vs_state.c */
2561
2562   /**
2563    * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
2564    * Geometry > Geometry Shader > State:
2565    *
2566    *     "Note: Because of corruption in IVB:GT2, software needs to flush the
2567    *     whole fixed function pipeline when the GS enable changes value in
2568    *     the 3DSTATE_GS."
2569    *
2570    * The hardware architects have clarified that in this context "flush the
2571    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
2572    * Stall" bit set.
2573    */
2574   if (!device->info.is_haswell && !device->info.is_baytrail)
2575      gfx7_emit_vs_workaround_flush(brw);
2576#endif
2577
2578   if (anv_pipeline_is_primitive(pipeline)) {
2579      assert(pCreateInfo->pVertexInputState);
2580      emit_vertex_input(pipeline, pCreateInfo->pVertexInputState);
2581
2582      emit_3dstate_vs(pipeline);
2583      emit_3dstate_hs_te_ds(pipeline, pCreateInfo->pTessellationState);
2584      emit_3dstate_gs(pipeline);
2585
2586#if GFX_VER >= 8
2587      if (!(dynamic_states & ANV_CMD_DIRTY_DYNAMIC_PRIMITIVE_TOPOLOGY))
2588         emit_3dstate_vf_topology(pipeline);
2589#endif
2590
2591      emit_3dstate_vf_statistics(pipeline);
2592
2593      emit_3dstate_streamout(pipeline, pCreateInfo->pRasterizationState,
2594                             dynamic_states);
2595   }
2596
2597   emit_3dstate_sbe(pipeline);
2598   emit_3dstate_wm(pipeline, subpass,
2599                   pCreateInfo->pInputAssemblyState,
2600                   pCreateInfo->pRasterizationState,
2601                   cb_info, ms_info, line_info, dynamic_states);
2602   emit_3dstate_ps(pipeline, cb_info, ms_info);
2603#if GFX_VER >= 8
2604   emit_3dstate_ps_extra(pipeline, subpass,
2605                         pCreateInfo->pRasterizationState);
2606#endif
2607
2608   *pPipeline = anv_pipeline_to_handle(&pipeline->base);
2609
2610   return pipeline->base.batch.status;
2611}
2612
2613#if GFX_VERx10 >= 125
2614
2615static void
2616emit_compute_state(struct anv_compute_pipeline *pipeline,
2617                   const struct anv_device *device)
2618{
2619   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
2620   anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
2621
2622   const UNUSED struct anv_shader_bin *cs_bin = pipeline->cs;
2623   const struct intel_device_info *devinfo = &device->info;
2624
2625   anv_batch_emit(&pipeline->base.batch, GENX(CFE_STATE), cfe) {
2626      cfe.MaximumNumberofThreads =
2627         devinfo->max_cs_threads * devinfo->subslice_total - 1;
2628      cfe.ScratchSpaceBuffer =
2629         get_scratch_surf(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
2630   }
2631}
2632
2633#else /* #if GFX_VERx10 >= 125 */
2634
2635static void
2636emit_compute_state(struct anv_compute_pipeline *pipeline,
2637                   const struct anv_device *device)
2638{
2639   const struct intel_device_info *devinfo = &device->info;
2640   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);
2641
2642   anv_pipeline_setup_l3_config(&pipeline->base, cs_prog_data->base.total_shared > 0);
2643
2644   const struct brw_cs_dispatch_info dispatch =
2645      brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL);
2646   const uint32_t vfe_curbe_allocation =
2647      ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
2648            cs_prog_data->push.cross_thread.regs, 2);
2649
2650   const struct anv_shader_bin *cs_bin = pipeline->cs;
2651
2652   anv_batch_emit(&pipeline->base.batch, GENX(MEDIA_VFE_STATE), vfe) {
2653#if GFX_VER > 7
2654      vfe.StackSize              = 0;
2655#else
2656      vfe.GPGPUMode              = true;
2657#endif
2658      vfe.MaximumNumberofThreads =
2659         devinfo->max_cs_threads * devinfo->subslice_total - 1;
2660      vfe.NumberofURBEntries     = GFX_VER <= 7 ? 0 : 2;
2661#if GFX_VER < 11
2662      vfe.ResetGatewayTimer      = true;
2663#endif
2664#if GFX_VER <= 8
2665      vfe.BypassGatewayControl   = true;
2666#endif
2667      vfe.URBEntryAllocationSize = GFX_VER <= 7 ? 0 : 2;
2668      vfe.CURBEAllocationSize    = vfe_curbe_allocation;
2669
2670      if (cs_bin->prog_data->total_scratch) {
2671         if (GFX_VER >= 8) {
2672            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
2673             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
2674             */
2675            vfe.PerThreadScratchSpace =
2676               ffs(cs_bin->prog_data->total_scratch) - 11;
2677         } else if (GFX_VERx10 == 75) {
2678            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
2679             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
2680             */
2681            vfe.PerThreadScratchSpace =
2682               ffs(cs_bin->prog_data->total_scratch) - 12;
2683         } else {
2684            /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB]
2685             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
2686             */
2687            vfe.PerThreadScratchSpace =
2688               cs_bin->prog_data->total_scratch / 1024 - 1;
2689         }
2690         vfe.ScratchSpaceBasePointer =
2691            get_scratch_address(&pipeline->base, MESA_SHADER_COMPUTE, cs_bin);
2692      }
2693   }
2694
2695   struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
2696      .KernelStartPointer     =
2697         cs_bin->kernel.offset +
2698         brw_cs_prog_data_prog_offset(cs_prog_data, dispatch.simd_size),
2699
2700      /* Wa_1606682166 */
2701      .SamplerCount           = GFX_VER == 11 ? 0 : get_sampler_count(cs_bin),
2702      /* We add 1 because the CS indirect parameters buffer isn't accounted
2703       * for in bind_map.surface_count.
2704       */
2705      .BindingTableEntryCount = 1 + MIN2(cs_bin->bind_map.surface_count, 30),
2706      .BarrierEnable          = cs_prog_data->uses_barrier,
2707      .SharedLocalMemorySize  =
2708         encode_slm_size(GFX_VER, cs_prog_data->base.total_shared),
2709
2710#if GFX_VERx10 != 75
2711      .ConstantURBEntryReadOffset = 0,
2712#endif
2713      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
2714#if GFX_VERx10 >= 75
2715      .CrossThreadConstantDataReadLength =
2716         cs_prog_data->push.cross_thread.regs,
2717#endif
2718#if GFX_VER >= 12
2719      /* TODO: Check if we are missing workarounds and enable mid-thread
2720       * preemption.
2721       *
2722       * We still have issues with mid-thread preemption (it was already
2723       * disabled by the kernel on gfx11, due to missing workarounds). It's
2724       * possible that we are just missing some workarounds, and could enable
2725       * it later, but for now let's disable it to fix a GPU in compute in Car
2726       * Chase (and possibly more).
2727       */
2728      .ThreadPreemptionDisable = true,
2729#endif
2730
2731      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
2732   };
2733   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
2734                                        pipeline->interface_descriptor_data,
2735                                        &desc);
2736}
2737
2738#endif /* #if GFX_VERx10 >= 125 */
2739
2740static VkResult
2741compute_pipeline_create(
2742    VkDevice                                    _device,
2743    struct anv_pipeline_cache *                 cache,
2744    const VkComputePipelineCreateInfo*          pCreateInfo,
2745    const VkAllocationCallbacks*                pAllocator,
2746    VkPipeline*                                 pPipeline)
2747{
2748   ANV_FROM_HANDLE(anv_device, device, _device);
2749   struct anv_compute_pipeline *pipeline;
2750   VkResult result;
2751
2752   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);
2753
2754   /* Use the default pipeline cache if none is specified */
2755   if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
2756      cache = &device->default_pipeline_cache;
2757
2758   pipeline = vk_zalloc2(&device->vk.alloc, pAllocator, sizeof(*pipeline), 8,
2759                         VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
2760   if (pipeline == NULL)
2761      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2762
2763   result = anv_pipeline_init(&pipeline->base, device,
2764                              ANV_PIPELINE_COMPUTE, pCreateInfo->flags,
2765                              pAllocator);
2766   if (result != VK_SUCCESS) {
2767      vk_free2(&device->vk.alloc, pAllocator, pipeline);
2768      return result;
2769   }
2770
2771   anv_batch_set_storage(&pipeline->base.batch, ANV_NULL_ADDRESS,
2772                         pipeline->batch_data, sizeof(pipeline->batch_data));
2773
2774   assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);
2775   VK_FROM_HANDLE(vk_shader_module, module,  pCreateInfo->stage.module);
2776   result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,
2777                                    pCreateInfo->stage.pName,
2778                                    pCreateInfo->stage.pSpecializationInfo);
2779   if (result != VK_SUCCESS) {
2780      anv_pipeline_finish(&pipeline->base, device, pAllocator);
2781      vk_free2(&device->vk.alloc, pAllocator, pipeline);
2782      if (result == VK_PIPELINE_COMPILE_REQUIRED_EXT)
2783         *pPipeline = VK_NULL_HANDLE;
2784      return result;
2785   }
2786
2787   emit_compute_state(pipeline, device);
2788
2789   *pPipeline = anv_pipeline_to_handle(&pipeline->base);
2790
2791   return pipeline->base.batch.status;
2792}
2793
2794VkResult genX(CreateGraphicsPipelines)(
2795    VkDevice                                    _device,
2796    VkPipelineCache                             pipelineCache,
2797    uint32_t                                    count,
2798    const VkGraphicsPipelineCreateInfo*         pCreateInfos,
2799    const VkAllocationCallbacks*                pAllocator,
2800    VkPipeline*                                 pPipelines)
2801{
2802   ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
2803
2804   VkResult result = VK_SUCCESS;
2805
2806   unsigned i;
2807   for (i = 0; i < count; i++) {
2808      VkResult res = genX(graphics_pipeline_create)(_device,
2809                                                    pipeline_cache,
2810                                                    &pCreateInfos[i],
2811                                                    pAllocator, &pPipelines[i]);
2812
2813      if (res == VK_SUCCESS)
2814         continue;
2815
2816      /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EX as it
2817       * is not obvious what error should be report upon 2 different failures.
2818       * */
2819      result = res;
2820      if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)
2821         break;
2822
2823      if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
2824         break;
2825   }
2826
2827   for (; i < count; i++)
2828      pPipelines[i] = VK_NULL_HANDLE;
2829
2830   return result;
2831}
2832
2833VkResult genX(CreateComputePipelines)(
2834    VkDevice                                    _device,
2835    VkPipelineCache                             pipelineCache,
2836    uint32_t                                    count,
2837    const VkComputePipelineCreateInfo*          pCreateInfos,
2838    const VkAllocationCallbacks*                pAllocator,
2839    VkPipeline*                                 pPipelines)
2840{
2841   ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
2842
2843   VkResult result = VK_SUCCESS;
2844
2845   unsigned i;
2846   for (i = 0; i < count; i++) {
2847      VkResult res = compute_pipeline_create(_device, pipeline_cache,
2848                                             &pCreateInfos[i],
2849                                             pAllocator, &pPipelines[i]);
2850
2851      if (res == VK_SUCCESS)
2852         continue;
2853
2854      /* Bail out on the first error != VK_PIPELINE_COMPILE_REQUIRED_EX as it
2855       * is not obvious what error should be report upon 2 different failures.
2856       * */
2857      result = res;
2858      if (res != VK_PIPELINE_COMPILE_REQUIRED_EXT)
2859         break;
2860
2861      if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
2862         break;
2863   }
2864
2865   for (; i < count; i++)
2866      pPipelines[i] = VK_NULL_HANDLE;
2867
2868   return result;
2869}
2870
2871#if GFX_VERx10 >= 125
2872
2873static void
2874assert_rt_stage_index_valid(const VkRayTracingPipelineCreateInfoKHR* pCreateInfo,
2875                            uint32_t stage_idx,
2876                            VkShaderStageFlags valid_stages)
2877{
2878   if (stage_idx == VK_SHADER_UNUSED_KHR)
2879      return;
2880
2881   assert(stage_idx <= pCreateInfo->stageCount);
2882   assert(util_bitcount(pCreateInfo->pStages[stage_idx].stage) == 1);
2883   assert(pCreateInfo->pStages[stage_idx].stage & valid_stages);
2884}
2885
2886static VkResult
2887ray_tracing_pipeline_create(
2888    VkDevice                                    _device,
2889    struct anv_pipeline_cache *                 cache,
2890    const VkRayTracingPipelineCreateInfoKHR*    pCreateInfo,
2891    const VkAllocationCallbacks*                pAllocator,
2892    VkPipeline*                                 pPipeline)
2893{
2894   ANV_FROM_HANDLE(anv_device, device, _device);
2895   VkResult result;
2896
2897   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_RAY_TRACING_PIPELINE_CREATE_INFO_KHR);
2898
2899   /* Use the default pipeline cache if none is specified */
2900   if (cache == NULL && device->physical->instance->pipeline_cache_enabled)
2901      cache = &device->default_pipeline_cache;
2902
2903   VK_MULTIALLOC(ma);
2904   VK_MULTIALLOC_DECL(&ma, struct anv_ray_tracing_pipeline, pipeline, 1);
2905   VK_MULTIALLOC_DECL(&ma, struct anv_rt_shader_group, groups, pCreateInfo->groupCount);
2906   if (!vk_multialloc_zalloc2(&ma, &device->vk.alloc, pAllocator,
2907                              VK_SYSTEM_ALLOCATION_SCOPE_DEVICE))
2908      return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
2909
2910   result = anv_pipeline_init(&pipeline->base, device,
2911                              ANV_PIPELINE_RAY_TRACING, pCreateInfo->flags,
2912                              pAllocator);
2913   if (result != VK_SUCCESS) {
2914      vk_free2(&device->vk.alloc, pAllocator, pipeline);
2915      return result;
2916   }
2917
2918   pipeline->group_count = pCreateInfo->groupCount;
2919   pipeline->groups = groups;
2920
2921   ASSERTED const VkShaderStageFlags ray_tracing_stages =
2922      VK_SHADER_STAGE_RAYGEN_BIT_KHR |
2923      VK_SHADER_STAGE_ANY_HIT_BIT_KHR |
2924      VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR |
2925      VK_SHADER_STAGE_MISS_BIT_KHR |
2926      VK_SHADER_STAGE_INTERSECTION_BIT_KHR |
2927      VK_SHADER_STAGE_CALLABLE_BIT_KHR;
2928
2929   for (uint32_t i = 0; i < pCreateInfo->stageCount; i++)
2930      assert((pCreateInfo->pStages[i].stage & ~ray_tracing_stages) == 0);
2931
2932   for (uint32_t i = 0; i < pCreateInfo->groupCount; i++) {
2933      const VkRayTracingShaderGroupCreateInfoKHR *ginfo =
2934         &pCreateInfo->pGroups[i];
2935      assert_rt_stage_index_valid(pCreateInfo, ginfo->generalShader,
2936                                  VK_SHADER_STAGE_RAYGEN_BIT_KHR |
2937                                  VK_SHADER_STAGE_MISS_BIT_KHR |
2938                                  VK_SHADER_STAGE_CALLABLE_BIT_KHR);
2939      assert_rt_stage_index_valid(pCreateInfo, ginfo->closestHitShader,
2940                                  VK_SHADER_STAGE_CLOSEST_HIT_BIT_KHR);
2941      assert_rt_stage_index_valid(pCreateInfo, ginfo->anyHitShader,
2942                                  VK_SHADER_STAGE_ANY_HIT_BIT_KHR);
2943      assert_rt_stage_index_valid(pCreateInfo, ginfo->intersectionShader,
2944                                  VK_SHADER_STAGE_INTERSECTION_BIT_KHR);
2945      switch (ginfo->type) {
2946      case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR:
2947         assert(ginfo->generalShader < pCreateInfo->stageCount);
2948         assert(ginfo->anyHitShader == VK_SHADER_UNUSED_KHR);
2949         assert(ginfo->closestHitShader == VK_SHADER_UNUSED_KHR);
2950         assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
2951         break;
2952
2953      case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR:
2954         assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
2955         assert(ginfo->intersectionShader == VK_SHADER_UNUSED_KHR);
2956         break;
2957
2958      case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR:
2959         assert(ginfo->generalShader == VK_SHADER_UNUSED_KHR);
2960         break;
2961
2962      default:
2963         unreachable("Invalid ray-tracing shader group type");
2964      }
2965   }
2966
2967   result = anv_ray_tracing_pipeline_init(pipeline, device, cache,
2968                                          pCreateInfo, pAllocator);
2969   if (result != VK_SUCCESS) {
2970      anv_pipeline_finish(&pipeline->base, device, pAllocator);
2971      vk_free2(&device->vk.alloc, pAllocator, pipeline);
2972      return result;
2973   }
2974
2975   for (uint32_t i = 0; i < pipeline->group_count; i++) {
2976      struct anv_rt_shader_group *group = &pipeline->groups[i];
2977
2978      switch (group->type) {
2979      case VK_RAY_TRACING_SHADER_GROUP_TYPE_GENERAL_KHR: {
2980         struct GFX_RT_GENERAL_SBT_HANDLE sh = {};
2981         sh.General = anv_shader_bin_get_bsr(group->general, 32);
2982         GFX_RT_GENERAL_SBT_HANDLE_pack(NULL, group->handle, &sh);
2983         break;
2984      }
2985
2986      case VK_RAY_TRACING_SHADER_GROUP_TYPE_TRIANGLES_HIT_GROUP_KHR: {
2987         struct GFX_RT_TRIANGLES_SBT_HANDLE sh = {};
2988         if (group->closest_hit)
2989            sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
2990         if (group->any_hit)
2991            sh.AnyHit = anv_shader_bin_get_bsr(group->any_hit, 24);
2992         GFX_RT_TRIANGLES_SBT_HANDLE_pack(NULL, group->handle, &sh);
2993         break;
2994      }
2995
2996      case VK_RAY_TRACING_SHADER_GROUP_TYPE_PROCEDURAL_HIT_GROUP_KHR: {
2997         struct GFX_RT_PROCEDURAL_SBT_HANDLE sh = {};
2998         if (group->closest_hit)
2999            sh.ClosestHit = anv_shader_bin_get_bsr(group->closest_hit, 32);
3000         sh.Intersection = anv_shader_bin_get_bsr(group->intersection, 24);
3001         GFX_RT_PROCEDURAL_SBT_HANDLE_pack(NULL, group->handle, &sh);
3002         break;
3003      }
3004
3005      default:
3006         unreachable("Invalid shader group type");
3007      }
3008   }
3009
3010   *pPipeline = anv_pipeline_to_handle(&pipeline->base);
3011
3012   return pipeline->base.batch.status;
3013}
3014
3015VkResult
3016genX(CreateRayTracingPipelinesKHR)(
3017    VkDevice                                    _device,
3018    VkDeferredOperationKHR                      deferredOperation,
3019    VkPipelineCache                             pipelineCache,
3020    uint32_t                                    createInfoCount,
3021    const VkRayTracingPipelineCreateInfoKHR*    pCreateInfos,
3022    const VkAllocationCallbacks*                pAllocator,
3023    VkPipeline*                                 pPipelines)
3024{
3025   ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);
3026
3027   VkResult result = VK_SUCCESS;
3028
3029   unsigned i;
3030   for (i = 0; i < createInfoCount; i++) {
3031      VkResult res = ray_tracing_pipeline_create(_device, pipeline_cache,
3032                                                 &pCreateInfos[i],
3033                                                 pAllocator, &pPipelines[i]);
3034
3035      if (res == VK_SUCCESS)
3036         continue;
3037
3038      /* Bail out on the first error as it is not obvious what error should be
3039       * report upon 2 different failures. */
3040      result = res;
3041      if (result != VK_PIPELINE_COMPILE_REQUIRED_EXT)
3042         break;
3043
3044      if (pCreateInfos[i].flags & VK_PIPELINE_CREATE_EARLY_RETURN_ON_FAILURE_BIT_EXT)
3045         break;
3046   }
3047
3048   for (; i < createInfoCount; i++)
3049      pPipelines[i] = VK_NULL_HANDLE;
3050
3051   return result;
3052}
3053#endif /* GFX_VERx10 >= 125 */
3054