1/*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include <assert.h>
25
26#include "main/samplerobj.h"
27
28#include "dev/intel_device_info.h"
29#include "common/intel_sample_positions.h"
30#include "genxml/gen_macros.h"
31#include "common/intel_guardband.h"
32
33#include "main/bufferobj.h"
34#include "main/context.h"
35#include "main/enums.h"
36#include "main/macros.h"
37#include "main/state.h"
38
39#include "genX_boilerplate.h"
40
41#include "brw_context.h"
42#include "brw_cs.h"
43#include "brw_draw.h"
44#include "brw_multisample_state.h"
45#include "brw_state.h"
46#include "brw_wm.h"
47#include "brw_util.h"
48
49#include "brw_batch.h"
50#include "brw_buffer_objects.h"
51#include "brw_fbo.h"
52
53#include "main/enums.h"
54#include "main/fbobject.h"
55#include "main/framebuffer.h"
56#include "main/glformats.h"
57#include "main/shaderapi.h"
58#include "main/stencil.h"
59#include "main/transformfeedback.h"
60#include "main/varray.h"
61#include "main/viewport.h"
62#include "util/half_float.h"
63
64#if GFX_VER == 4
65static struct brw_address
66KSP(struct brw_context *brw, uint32_t offset)
67{
68   return ro_bo(brw->cache.bo, offset);
69}
70#else
71static uint32_t
72KSP(UNUSED struct brw_context *brw, uint32_t offset)
73{
74   return offset;
75}
76#endif
77
#if GFX_VER >= 7
/**
 * Emit one MI_LOAD_REGISTER_MEM packet.
 *
 * @param brw   context whose batch receives the packet
 * @param reg   value stored in the packet's RegisterAddress field
 * @param addr  value stored in the packet's MemoryAddress field
 */
static void
emit_lrm(struct brw_context *brw, uint32_t reg, struct brw_address addr)
{
   brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_MEM), load) {
      load.MemoryAddress = addr;
      load.RegisterAddress = reg;
   }
}
#endif
88
#if GFX_VER == 7
/**
 * Emit one MI_LOAD_REGISTER_IMM packet.
 *
 * @param brw  context whose batch receives the packet
 * @param reg  value stored in the packet's RegisterOffset field
 * @param imm  value stored in the packet's DataDWord field
 */
static void
emit_lri(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_IMM), load) {
      load.DataDWord = imm;
      load.RegisterOffset = reg;
   }
}
#endif
99
100/**
101 * Polygon stipple packet
102 */
103static void
104genX(upload_polygon_stipple)(struct brw_context *brw)
105{
106   struct gl_context *ctx = &brw->ctx;
107
108   /* _NEW_POLYGON */
109   if (!ctx->Polygon.StippleFlag)
110      return;
111
112   brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
113      /* Polygon stipple is provided in OpenGL order, i.e. bottom
114       * row first.  If we're rendering to a window (i.e. the
115       * default frame buffer object, 0), then we need to invert
116       * it to match our pixel layout.  But if we're rendering
117       * to a FBO (i.e. any named frame buffer object), we *don't*
118       * need to invert - we already match the layout.
119       */
120      if (ctx->DrawBuffer->FlipY) {
121         for (unsigned i = 0; i < 32; i++)
122            poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
123      } else {
124         for (unsigned i = 0; i < 32; i++)
125            poly.PatternRow[i] = ctx->PolygonStipple[i];
126      }
127   }
128}
129
130static const struct brw_tracked_state genX(polygon_stipple) = {
131   .dirty = {
132      .mesa = _NEW_POLYGON |
133              _NEW_POLYGONSTIPPLE,
134      .brw = BRW_NEW_CONTEXT,
135   },
136   .emit = genX(upload_polygon_stipple),
137};
138
139/**
140 * Polygon stipple offset packet
141 */
142static void
143genX(upload_polygon_stipple_offset)(struct brw_context *brw)
144{
145   struct gl_context *ctx = &brw->ctx;
146
147   /* _NEW_POLYGON */
148   if (!ctx->Polygon.StippleFlag)
149      return;
150
151   brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) {
152      /* _NEW_BUFFERS
153       *
154       * If we're drawing to a system window we have to invert the Y axis
155       * in order to match the OpenGL pixel coordinate system, and our
156       * offset must be matched to the window position.  If we're drawing
157       * to a user-created FBO then our native pixel coordinate system
158       * works just fine, and there's no window system to worry about.
159       */
160      if (ctx->DrawBuffer->FlipY) {
161         poly.PolygonStippleYOffset =
162            (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
163      }
164   }
165}
166
167static const struct brw_tracked_state genX(polygon_stipple_offset) = {
168   .dirty = {
169      .mesa = _NEW_BUFFERS |
170              _NEW_POLYGON,
171      .brw = BRW_NEW_CONTEXT,
172   },
173   .emit = genX(upload_polygon_stipple_offset),
174};
175
176/**
177 * Line stipple packet
178 */
179static void
180genX(upload_line_stipple)(struct brw_context *brw)
181{
182   struct gl_context *ctx = &brw->ctx;
183
184   if (!ctx->Line.StippleFlag)
185      return;
186
187   brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) {
188      line.LineStipplePattern = ctx->Line.StipplePattern;
189
190      line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor;
191      line.LineStippleRepeatCount = ctx->Line.StippleFactor;
192   }
193}
194
195static const struct brw_tracked_state genX(line_stipple) = {
196   .dirty = {
197      .mesa = _NEW_LINE,
198      .brw = BRW_NEW_CONTEXT,
199   },
200   .emit = genX(upload_line_stipple),
201};
202
203/* Constant single cliprect for framebuffer object or DRI2 drawing */
204static void
205genX(upload_drawing_rect)(struct brw_context *brw)
206{
207   struct gl_context *ctx = &brw->ctx;
208   const struct gl_framebuffer *fb = ctx->DrawBuffer;
209   const unsigned int fb_width = _mesa_geometric_width(fb);
210   const unsigned int fb_height = _mesa_geometric_height(fb);
211
212   brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
213      rect.ClippedDrawingRectangleXMax = fb_width - 1;
214      rect.ClippedDrawingRectangleYMax = fb_height - 1;
215   }
216}
217
218static const struct brw_tracked_state genX(drawing_rect) = {
219   .dirty = {
220      .mesa = _NEW_BUFFERS,
221      .brw = BRW_NEW_BLORP |
222             BRW_NEW_CONTEXT,
223   },
224   .emit = genX(upload_drawing_rect),
225};
226
227static uint32_t *
228genX(emit_vertex_buffer_state)(struct brw_context *brw,
229                               uint32_t *dw,
230                               unsigned buffer_nr,
231                               struct brw_bo *bo,
232                               unsigned start_offset,
233                               UNUSED unsigned end_offset,
234                               unsigned stride,
235                               UNUSED unsigned step_rate)
236{
237   struct GENX(VERTEX_BUFFER_STATE) buf_state = {
238      .VertexBufferIndex = buffer_nr,
239      .BufferPitch = stride,
240
241      /* The VF cache designers apparently cut corners, and made the cache
242       * only consider the bottom 32 bits of memory addresses.  If you happen
243       * to have two vertex buffers which get placed exactly 4 GiB apart and
244       * use them in back-to-back draw calls, you can get collisions.  To work
245       * around this problem, we restrict vertex buffers to the low 32 bits of
246       * the address space.
247       */
248      .BufferStartingAddress = ro_32_bo(bo, start_offset),
249#if GFX_VER >= 8
250      .BufferSize = end_offset - start_offset,
251#endif
252
253#if GFX_VER >= 7
254      .AddressModifyEnable = true,
255#endif
256
257#if GFX_VER < 8
258      .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
259      .InstanceDataStepRate = step_rate,
260#if GFX_VER >= 5
261      .EndAddress = ro_bo(bo, end_offset - 1),
262#endif
263#endif
264
265#if GFX_VER == 11
266      .MOCS = ICL_MOCS_WB,
267#elif GFX_VER == 10
268      .MOCS = CNL_MOCS_WB,
269#elif GFX_VER == 9
270      .MOCS = SKL_MOCS_WB,
271#elif GFX_VER == 8
272      .MOCS = BDW_MOCS_WB,
273#elif GFX_VER == 7
274      .MOCS = GFX7_MOCS_L3,
275#endif
276   };
277
278   GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state);
279   return dw + GENX(VERTEX_BUFFER_STATE_length);
280}
281
282UNUSED static bool
283is_passthru_format(uint32_t format)
284{
285   switch (format) {
286   case ISL_FORMAT_R64_PASSTHRU:
287   case ISL_FORMAT_R64G64_PASSTHRU:
288   case ISL_FORMAT_R64G64B64_PASSTHRU:
289   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
290      return true;
291   default:
292      return false;
293   }
294}
295
296UNUSED static int
297uploads_needed(uint32_t format,
298               bool is_dual_slot)
299{
300   if (!is_passthru_format(format))
301      return 1;
302
303   if (is_dual_slot)
304      return 2;
305
306   switch (format) {
307   case ISL_FORMAT_R64_PASSTHRU:
308   case ISL_FORMAT_R64G64_PASSTHRU:
309      return 1;
310   case ISL_FORMAT_R64G64B64_PASSTHRU:
311   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
312      return 2;
313   default:
314      unreachable("not reached");
315   }
316}
317
318/*
319 * Returns the format that we are finally going to use when upload a vertex
320 * element. It will only change if we are using *64*PASSTHRU formats, as for
321 * gen < 8 they need to be splitted on two *32*FLOAT formats.
322 *
323 * @upload points in which upload we are. Valid values are [0,1]
324 */
325static uint32_t
326downsize_format_if_needed(uint32_t format,
327                          int upload)
328{
329   assert(upload == 0 || upload == 1);
330
331   if (!is_passthru_format(format))
332      return format;
333
334   /* ISL_FORMAT_R64_PASSTHRU and ISL_FORMAT_R64G64_PASSTHRU with an upload ==
335    * 1 means that we have been forced to do 2 uploads for a size <= 2. This
336    * happens with gen < 8 and dvec3 or dvec4 vertex shader input
337    * variables. In those cases, we return ISL_FORMAT_R32_FLOAT as a way of
338    * flagging that we want to fill with zeroes this second forced upload.
339    */
340   switch (format) {
341   case ISL_FORMAT_R64_PASSTHRU:
342      return upload == 0 ? ISL_FORMAT_R32G32_FLOAT
343                         : ISL_FORMAT_R32_FLOAT;
344   case ISL_FORMAT_R64G64_PASSTHRU:
345      return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
346                         : ISL_FORMAT_R32_FLOAT;
347   case ISL_FORMAT_R64G64B64_PASSTHRU:
348      return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
349                         : ISL_FORMAT_R32G32_FLOAT;
350   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
351      return ISL_FORMAT_R32G32B32A32_FLOAT;
352   default:
353      unreachable("not reached");
354   }
355}
356
357/*
358 * Returns the number of componentes associated with a format that is used on
359 * a 64 to 32 format split. See downsize_format()
360 */
361static int
362upload_format_size(uint32_t upload_format)
363{
364   switch (upload_format) {
365   case ISL_FORMAT_R32_FLOAT:
366
367      /* downsized_format has returned this one in order to flag that we are
368       * performing a second upload which we want to have filled with
369       * zeroes. This happens with gen < 8, a size <= 2, and dvec3 or dvec4
370       * vertex shader input variables.
371       */
372
373      return 0;
374   case ISL_FORMAT_R32G32_FLOAT:
375      return 2;
376   case ISL_FORMAT_R32G32B32A32_FLOAT:
377      return 4;
378   default:
379      unreachable("not reached");
380   }
381}
382
383static UNUSED uint16_t
384pinned_bo_high_bits(struct brw_bo *bo)
385{
386   return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
387}
388
389/* The VF cache designers apparently cut corners, and made the cache key's
390 * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
391 * of the address.  If you happen to have two vertex buffers which get placed
392 * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
393 * collisions.  (These collisions can happen within a single batch.)
394 *
395 * In the soft-pin world, we'd like to assign addresses up front, and never
396 * move buffers.  So, we need to do a VF cache invalidate if the buffer for
397 * a particular VB slot has different [48:32] address bits than the last one.
398 *
399 * In the relocation world, we have no idea what the addresses will be, so
400 * we can't apply this workaround.  Instead, we tell the kernel to move it
401 * to the low 4GB regardless.
402 *
403 * This HW issue is gone on Gfx11+.
404 */
405static void
406vf_invalidate_for_vb_48bit_transitions(UNUSED struct brw_context *brw)
407{
408#if GFX_VER >= 8 && GFX_VER < 11
409   bool need_invalidate = false;
410
411   for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
412      uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);
413
414      if (high_bits != brw->vb.last_bo_high_bits[i]) {
415         need_invalidate = true;
416         brw->vb.last_bo_high_bits[i] = high_bits;
417      }
418   }
419
420   if (brw->draw.draw_params_bo) {
421      uint16_t high_bits = pinned_bo_high_bits(brw->draw.draw_params_bo);
422
423      if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers] != high_bits) {
424         need_invalidate = true;
425         brw->vb.last_bo_high_bits[brw->vb.nr_buffers] = high_bits;
426      }
427   }
428
429   if (brw->draw.derived_draw_params_bo) {
430      uint16_t high_bits = pinned_bo_high_bits(brw->draw.derived_draw_params_bo);
431
432      if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] != high_bits) {
433         need_invalidate = true;
434         brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] = high_bits;
435      }
436   }
437
438   if (need_invalidate) {
439      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL);
440   }
441#endif
442}
443
444static void
445vf_invalidate_for_ib_48bit_transition(UNUSED struct brw_context *brw)
446{
447#if GFX_VER >= 8
448   uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);
449
450   if (high_bits != brw->ib.last_bo_high_bits) {
451      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
452      brw->ib.last_bo_high_bits = high_bits;
453   }
454#endif
455}
456
457static void
458genX(emit_vertices)(struct brw_context *brw)
459{
460   const struct intel_device_info *devinfo = &brw->screen->devinfo;
461   uint32_t *dw;
462
463   brw_prepare_vertices(brw);
464   brw_prepare_shader_draw_parameters(brw);
465
466#if GFX_VER < 6
467   brw_emit_query_begin(brw);
468#endif
469
470   const struct brw_vs_prog_data *vs_prog_data =
471      brw_vs_prog_data(brw->vs.base.prog_data);
472
473#if GFX_VER >= 8
474   struct gl_context *ctx = &brw->ctx;
475   const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
476                                ctx->Polygon.BackMode != GL_FILL);
477
478   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
479      unsigned vue = brw->vb.nr_enabled;
480
481      /* The element for the edge flags must always be last, so we have to
482       * insert the SGVS before it in that case.
483       */
484      if (uses_edge_flag) {
485         assert(vue > 0);
486         vue--;
487      }
488
489      WARN_ONCE(vue >= 33,
490                "Trying to insert VID/IID past 33rd vertex element, "
491                "need to reorder the vertex attrbutes.");
492
493      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) {
494         if (vs_prog_data->uses_vertexid) {
495            vfs.VertexIDEnable = true;
496            vfs.VertexIDComponentNumber = 2;
497            vfs.VertexIDElementOffset = vue;
498         }
499
500         if (vs_prog_data->uses_instanceid) {
501            vfs.InstanceIDEnable = true;
502            vfs.InstanceIDComponentNumber = 3;
503            vfs.InstanceIDElementOffset = vue;
504         }
505      }
506
507      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
508         vfi.InstancingEnable = true;
509         vfi.VertexElementIndex = vue;
510      }
511   } else {
512      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
513   }
514#endif
515
516   const bool uses_draw_params =
517      vs_prog_data->uses_firstvertex ||
518      vs_prog_data->uses_baseinstance;
519
520   const bool uses_derived_draw_params =
521      vs_prog_data->uses_drawid ||
522      vs_prog_data->uses_is_indexed_draw;
523
524   const bool needs_sgvs_element = (uses_draw_params ||
525                                    vs_prog_data->uses_instanceid ||
526                                    vs_prog_data->uses_vertexid);
527
528   unsigned nr_elements =
529      brw->vb.nr_enabled + needs_sgvs_element + uses_derived_draw_params;
530
531#if GFX_VER < 8
532   /* If any of the formats of vb.enabled needs more that one upload, we need
533    * to add it to nr_elements
534    */
535   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
536      struct brw_vertex_element *input = brw->vb.enabled[i];
537      uint32_t format = brw_get_vertex_surface_type(brw, input->glformat);
538
539      if (uploads_needed(format, input->is_dual_slot) > 1)
540         nr_elements++;
541   }
542#endif
543
544   /* If the VS doesn't read any inputs (calculating vertex position from
545    * a state variable for some reason, for example), emit a single pad
546    * VERTEX_ELEMENT struct and bail.
547    *
548    * The stale VB state stays in place, but they don't do anything unless
549    * a VE loads from them.
550    */
551   if (nr_elements == 0) {
552      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
553                           1 + GENX(VERTEX_ELEMENT_STATE_length));
554      struct GENX(VERTEX_ELEMENT_STATE) elem = {
555         .Valid = true,
556         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
557         .Component0Control = VFCOMP_STORE_0,
558         .Component1Control = VFCOMP_STORE_0,
559         .Component2Control = VFCOMP_STORE_0,
560         .Component3Control = VFCOMP_STORE_1_FP,
561      };
562      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem);
563      return;
564   }
565
566   /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
567   const unsigned nr_buffers = brw->vb.nr_buffers +
568      uses_draw_params + uses_derived_draw_params;
569
570   vf_invalidate_for_vb_48bit_transitions(brw);
571
572   if (nr_buffers) {
573      assert(nr_buffers <= (GFX_VER >= 6 ? 33 : 17));
574
575      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS),
576                           1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers);
577
578      for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
579         const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
580         /* Prior to Haswell and Bay Trail we have to use 4-component formats
581          * to fake 3-component ones.  In particular, we do this for
582          * half-float and 8 and 16-bit integer formats.  This means that the
583          * vertex element may poke over the end of the buffer by 2 bytes.
584          */
585         const unsigned padding =
586            (GFX_VERx10 < 75 && !devinfo->is_baytrail) * 2;
587         const unsigned end = buffer->offset + buffer->size + padding;
588         dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
589                                             buffer->offset,
590                                             end,
591                                             buffer->stride,
592                                             buffer->step_rate);
593      }
594
595      if (uses_draw_params) {
596         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers,
597                                             brw->draw.draw_params_bo,
598                                             brw->draw.draw_params_offset,
599                                             brw->draw.draw_params_bo->size,
600                                             0 /* stride */,
601                                             0 /* step rate */);
602      }
603
604      if (uses_derived_draw_params) {
605         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
606                                             brw->draw.derived_draw_params_bo,
607                                             brw->draw.derived_draw_params_offset,
608                                             brw->draw.derived_draw_params_bo->size,
609                                             0 /* stride */,
610                                             0 /* step rate */);
611      }
612   }
613
614   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
615    * presumably for VertexID/InstanceID.
616    */
617#if GFX_VER >= 6
618   assert(nr_elements <= 34);
619   const struct brw_vertex_element *gfx6_edgeflag_input = NULL;
620#else
621   assert(nr_elements <= 18);
622#endif
623
624   dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
625                        1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements);
626   unsigned i;
627   for (i = 0; i < brw->vb.nr_enabled; i++) {
628      const struct brw_vertex_element *input = brw->vb.enabled[i];
629      const struct gl_vertex_format *glformat = input->glformat;
630      uint32_t format = brw_get_vertex_surface_type(brw, glformat);
631      uint32_t comp0 = VFCOMP_STORE_SRC;
632      uint32_t comp1 = VFCOMP_STORE_SRC;
633      uint32_t comp2 = VFCOMP_STORE_SRC;
634      uint32_t comp3 = VFCOMP_STORE_SRC;
635      const unsigned num_uploads = GFX_VER < 8 ?
636         uploads_needed(format, input->is_dual_slot) : 1;
637
638#if GFX_VER >= 8
639      /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
640       * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
641       * element which has edge flag enabled."
642       */
643      assert(!(is_passthru_format(format) && uses_edge_flag));
644#endif
645
646      /* The gfx4 driver expects edgeflag to come in as a float, and passes
647       * that float on to the tests in the clipper.  Mesa's current vertex
648       * attribute value for EdgeFlag is stored as a float, which works out.
649       * glEdgeFlagPointer, on the other hand, gives us an unnormalized
650       * integer ubyte.  Just rewrite that to convert to a float.
651       *
652       * Gfx6+ passes edgeflag as sideband along with the vertex, instead
653       * of in the VUE.  We have to upload it sideband as the last vertex
654       * element according to the B-Spec.
655       */
656#if GFX_VER >= 6
657      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
658         gfx6_edgeflag_input = input;
659         continue;
660      }
661#endif
662
663      for (unsigned c = 0; c < num_uploads; c++) {
664         const uint32_t upload_format = GFX_VER >= 8 ? format :
665            downsize_format_if_needed(format, c);
666         /* If we need more that one upload, the offset stride would be 128
667          * bits (16 bytes), as for previous uploads we are using the full
668          * entry. */
669         const unsigned offset = input->offset + c * 16;
670
671         const int size = (GFX_VER < 8 && is_passthru_format(format)) ?
672            upload_format_size(upload_format) : glformat->Size;
673
674         switch (size) {
675            case 0: comp0 = VFCOMP_STORE_0; FALLTHROUGH;
676            case 1: comp1 = VFCOMP_STORE_0; FALLTHROUGH;
677            case 2: comp2 = VFCOMP_STORE_0; FALLTHROUGH;
678            case 3:
679               if (GFX_VER >= 8 && glformat->Doubles) {
680                  comp3 = VFCOMP_STORE_0;
681               } else if (glformat->Integer) {
682                  comp3 = VFCOMP_STORE_1_INT;
683               } else {
684                  comp3 = VFCOMP_STORE_1_FP;
685               }
686
687               break;
688         }
689
690#if GFX_VER >= 8
691         /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
692          *
693          *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
694          *     formats, 64-bit components are stored in the URB without any
695          *     conversion. In this case, vertex elements must be written as 128
696          *     or 256 bits, with VFCOMP_STORE_0 being used to pad the output as
697          *     required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
698          *     component into the URB, Component 1 must be specified as
699          *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in
700          *     order to output a 128-bit vertex element, or Components 1-3 must
701          *     be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
702          *     element. Likewise, use of R64G64B64_PASSTHRU requires Component 3
703          *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit
704          *     vertex element."
705          */
706         if (glformat->Doubles && !input->is_dual_slot) {
707            /* Store vertex elements which correspond to double and dvec2 vertex
708             * shader inputs as 128-bit vertex elements, instead of 256-bits.
709             */
710            comp2 = VFCOMP_NOSTORE;
711            comp3 = VFCOMP_NOSTORE;
712         }
713#endif
714
715         struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
716            .VertexBufferIndex = input->buffer,
717            .Valid = true,
718            .SourceElementFormat = upload_format,
719            .SourceElementOffset = offset,
720            .Component0Control = comp0,
721            .Component1Control = comp1,
722            .Component2Control = comp2,
723            .Component3Control = comp3,
724#if GFX_VER < 5
725            .DestinationElementOffset = i * 4,
726#endif
727         };
728
729         GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
730         dw += GENX(VERTEX_ELEMENT_STATE_length);
731      }
732   }
733
734   if (needs_sgvs_element) {
735      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
736         .Valid = true,
737         .Component0Control = VFCOMP_STORE_0,
738         .Component1Control = VFCOMP_STORE_0,
739         .Component2Control = VFCOMP_STORE_0,
740         .Component3Control = VFCOMP_STORE_0,
741#if GFX_VER < 5
742         .DestinationElementOffset = i * 4,
743#endif
744      };
745
746#if GFX_VER >= 8
747      if (uses_draw_params) {
748         elem_state.VertexBufferIndex = brw->vb.nr_buffers;
749         elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
750         elem_state.Component0Control = VFCOMP_STORE_SRC;
751         elem_state.Component1Control = VFCOMP_STORE_SRC;
752      }
753#else
754      elem_state.VertexBufferIndex = brw->vb.nr_buffers;
755      elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
756      if (uses_draw_params) {
757         elem_state.Component0Control = VFCOMP_STORE_SRC;
758         elem_state.Component1Control = VFCOMP_STORE_SRC;
759      }
760
761      if (vs_prog_data->uses_vertexid)
762         elem_state.Component2Control = VFCOMP_STORE_VID;
763
764      if (vs_prog_data->uses_instanceid)
765         elem_state.Component3Control = VFCOMP_STORE_IID;
766#endif
767
768      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
769      dw += GENX(VERTEX_ELEMENT_STATE_length);
770   }
771
772   if (uses_derived_draw_params) {
773      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
774         .Valid = true,
775         .VertexBufferIndex = brw->vb.nr_buffers + 1,
776         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
777         .Component0Control = VFCOMP_STORE_SRC,
778         .Component1Control = VFCOMP_STORE_SRC,
779         .Component2Control = VFCOMP_STORE_0,
780         .Component3Control = VFCOMP_STORE_0,
781#if GFX_VER < 5
782         .DestinationElementOffset = i * 4,
783#endif
784      };
785
786      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
787      dw += GENX(VERTEX_ELEMENT_STATE_length);
788   }
789
790#if GFX_VER >= 6
791   if (gfx6_edgeflag_input) {
792      const struct gl_vertex_format *glformat = gfx6_edgeflag_input->glformat;
793      const uint32_t format = brw_get_vertex_surface_type(brw, glformat);
794
795      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
796         .Valid = true,
797         .VertexBufferIndex = gfx6_edgeflag_input->buffer,
798         .EdgeFlagEnable = true,
799         .SourceElementFormat = format,
800         .SourceElementOffset = gfx6_edgeflag_input->offset,
801         .Component0Control = VFCOMP_STORE_SRC,
802         .Component1Control = VFCOMP_STORE_0,
803         .Component2Control = VFCOMP_STORE_0,
804         .Component3Control = VFCOMP_STORE_0,
805      };
806
807      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
808      dw += GENX(VERTEX_ELEMENT_STATE_length);
809   }
810#endif
811
812#if GFX_VER >= 8
813   for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
814      const struct brw_vertex_element *input = brw->vb.enabled[i];
815      const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
816      unsigned element_index;
817
818      /* The edge flag element is reordered to be the last one in the code
819       * above so we need to compensate for that in the element indices used
820       * below.
821       */
822      if (input == gfx6_edgeflag_input)
823         element_index = nr_elements - 1;
824      else
825         element_index = j++;
826
827      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
828         vfi.VertexElementIndex = element_index;
829         vfi.InstancingEnable = buffer->step_rate != 0;
830         vfi.InstanceDataStepRate = buffer->step_rate;
831      }
832   }
833
834   if (vs_prog_data->uses_drawid) {
835      const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;
836
837      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
838         vfi.VertexElementIndex = element;
839      }
840   }
841#endif
842}
843
844static const struct brw_tracked_state genX(vertices) = {
845   .dirty = {
846      .mesa = _NEW_POLYGON,
847      .brw = BRW_NEW_BATCH |
848             BRW_NEW_BLORP |
849             BRW_NEW_VERTEX_PROGRAM |
850             BRW_NEW_VERTICES |
851             BRW_NEW_VS_PROG_DATA,
852   },
853   .emit = genX(emit_vertices),
854};
855
856static void
857genX(emit_index_buffer)(struct brw_context *brw)
858{
859   const struct _mesa_index_buffer *index_buffer = brw->ib.ib;
860
861   if (index_buffer == NULL)
862      return;
863
864   vf_invalidate_for_ib_48bit_transition(brw);
865
866   brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
867#if GFX_VERx10 < 75
868      assert(brw->ib.enable_cut_index == brw->prim_restart.enable_cut_index);
869      ib.CutIndexEnable = brw->ib.enable_cut_index;
870#endif
871      ib.IndexFormat = brw_get_index_type(1 << index_buffer->index_size_shift);
872
873      /* The VF cache designers apparently cut corners, and made the cache
874       * only consider the bottom 32 bits of memory addresses.  If you happen
875       * to have two index buffers which get placed exactly 4 GiB apart and
876       * use them in back-to-back draw calls, you can get collisions.  To work
877       * around this problem, we restrict index buffers to the low 32 bits of
878       * the address space.
879       */
880      ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
881#if GFX_VER >= 8
882      ib.MOCS = GFX_VER >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
883      ib.BufferSize = brw->ib.size;
884#else
885      ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
886#endif
887   }
888}
889
890static const struct brw_tracked_state genX(index_buffer) = {
891   .dirty = {
892      .mesa = 0,
893      .brw = BRW_NEW_BATCH |
894             BRW_NEW_BLORP |
895             BRW_NEW_INDEX_BUFFER,
896   },
897   .emit = genX(emit_index_buffer),
898};
899
900#if GFX_VERx10 >= 75
901static void
902genX(upload_cut_index)(struct brw_context *brw)
903{
904   brw_batch_emit(brw, GENX(3DSTATE_VF), vf) {
905      if (brw->prim_restart.enable_cut_index && brw->ib.ib) {
906         vf.IndexedDrawCutIndexEnable = true;
907         vf.CutIndex = brw->prim_restart.restart_index;
908      }
909   }
910}
911
912const struct brw_tracked_state genX(cut_index) = {
913   .dirty = {
914      .mesa  = _NEW_TRANSFORM,
915      .brw   = BRW_NEW_INDEX_BUFFER,
916   },
917   .emit = genX(upload_cut_index),
918};
919#endif
920
921static void
922genX(upload_vf_statistics)(struct brw_context *brw)
923{
924   brw_batch_emit(brw, GENX(3DSTATE_VF_STATISTICS), vf) {
925      vf.StatisticsEnable = true;
926   }
927}
928
929const struct brw_tracked_state genX(vf_statistics) = {
930   .dirty = {
931      .mesa  = 0,
932      .brw   = BRW_NEW_BLORP | BRW_NEW_CONTEXT,
933   },
934   .emit = genX(upload_vf_statistics),
935};
936
937#if GFX_VER >= 6
938/**
939 * Determine the appropriate attribute override value to store into the
940 * 3DSTATE_SF structure for a given fragment shader attribute.  The attribute
941 * override value contains two pieces of information: the location of the
942 * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
943 * flag indicating whether to "swizzle" the attribute based on the direction
944 * the triangle is facing.
945 *
946 * If an attribute is "swizzled", then the given VUE location is used for
947 * front-facing triangles, and the VUE location that immediately follows is
948 * used for back-facing triangles.  We use this to implement the mapping from
949 * gl_FrontColor/gl_BackColor to gl_Color.
950 *
951 * urb_entry_read_offset is the offset into the VUE at which the SF unit is
952 * being instructed to begin reading attribute data.  It can be set to a
953 * nonzero value to prevent the SF unit from wasting time reading elements of
954 * the VUE that are not needed by the fragment shader.  It is measured in
955 * 256-bit increments.
956 */
957static void
958genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
959                        const struct brw_vue_map *vue_map,
960                        int urb_entry_read_offset, int fs_attr,
961                        bool two_side_color, uint32_t *max_source_attr)
962{
963   /* Find the VUE slot for this attribute. */
964   int slot = vue_map->varying_to_slot[fs_attr];
965
966   /* Viewport and Layer are stored in the VUE header.  We need to override
967    * them to zero if earlier stages didn't write them, as GL requires that
968    * they read back as zero when not explicitly set.
969    */
970   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
971      attr->ComponentOverrideX = true;
972      attr->ComponentOverrideW = true;
973      attr->ConstantSource = CONST_0000;
974
975      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
976         attr->ComponentOverrideY = true;
977      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
978         attr->ComponentOverrideZ = true;
979
980      return;
981   }
982
983   /* If there was only a back color written but not front, use back
984    * as the color instead of undefined
985    */
986   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
987      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
988   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
989      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
990
991   if (slot == -1) {
992      /* This attribute does not exist in the VUE--that means that the vertex
993       * shader did not write to it.  This means that either:
994       *
995       * (a) This attribute is a texture coordinate, and it is going to be
996       * replaced with point coordinates (as a consequence of a call to
997       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
998       * hardware will ignore whatever attribute override we supply.
999       *
1000       * (b) This attribute is read by the fragment shader but not written by
1001       * the vertex shader, so its value is undefined.  Therefore the
1002       * attribute override we supply doesn't matter.
1003       *
1004       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
1005       * previous shader stage.
1006       *
1007       * Note that we don't have to worry about the cases where the attribute
1008       * is gl_PointCoord or is undergoing point sprite coordinate
1009       * replacement, because in those cases, this function isn't called.
1010       *
1011       * In case (c), we need to program the attribute overrides so that the
1012       * primitive ID will be stored in this slot.  In every other case, the
1013       * attribute override we supply doesn't matter.  So just go ahead and
1014       * program primitive ID in every case.
1015       */
1016      attr->ComponentOverrideW = true;
1017      attr->ComponentOverrideX = true;
1018      attr->ComponentOverrideY = true;
1019      attr->ComponentOverrideZ = true;
1020      attr->ConstantSource = PRIM_ID;
1021      return;
1022   }
1023
1024   /* Compute the location of the attribute relative to urb_entry_read_offset.
1025    * Each increment of urb_entry_read_offset represents a 256-bit value, so
1026    * it counts for two 128-bit VUE slots.
1027    */
1028   int source_attr = slot - 2 * urb_entry_read_offset;
1029   assert(source_attr >= 0 && source_attr < 32);
1030
1031   /* If we are doing two-sided color, and the VUE slot following this one
1032    * represents a back-facing color, then we need to instruct the SF unit to
1033    * do back-facing swizzling.
1034    */
1035   bool swizzling = two_side_color &&
1036      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
1037        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
1038       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
1039        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
1040
1041   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
1042   if (*max_source_attr < source_attr + swizzling)
1043      *max_source_attr = source_attr + swizzling;
1044
1045   attr->SourceAttribute = source_attr;
1046   if (swizzling)
1047      attr->SwizzleSelect = INPUTATTR_FACING;
1048}
1049
1050
1051static void
1052genX(calculate_attr_overrides)(const struct brw_context *brw,
1053                               struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
1054                               uint32_t *point_sprite_enables,
1055                               uint32_t *urb_entry_read_length,
1056                               uint32_t *urb_entry_read_offset)
1057{
1058   const struct gl_context *ctx = &brw->ctx;
1059
1060   /* _NEW_POINT */
1061   const struct gl_point_attrib *point = &ctx->Point;
1062
1063   /* BRW_NEW_FRAGMENT_PROGRAM */
1064   const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1065
1066   /* BRW_NEW_FS_PROG_DATA */
1067   const struct brw_wm_prog_data *wm_prog_data =
1068      brw_wm_prog_data(brw->wm.base.prog_data);
1069   uint32_t max_source_attr = 0;
1070
1071   *point_sprite_enables = 0;
1072
1073   int first_slot =
1074      brw_compute_first_urb_slot_required(fp->info.inputs_read,
1075                                          &brw->vue_map_geom_out);
1076
1077   /* Each URB offset packs two varying slots */
1078   assert(first_slot % 2 == 0);
1079   *urb_entry_read_offset = first_slot / 2;
1080
1081   /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
1082    * description of dw10 Point Sprite Texture Coordinate Enable:
1083    *
1084    * "This field must be programmed to zero when non-point primitives
1085    * are rendered."
1086    *
1087    * The SandyBridge PRM doesn't explicitly say that point sprite enables
1088    * must be programmed to zero when rendering non-point primitives, but
1089    * the IvyBridge PRM does, and if we don't, we get garbage.
1090    *
1091    * This is not required on Haswell, as the hardware ignores this state
1092    * when drawing non-points -- although we do still need to be careful to
1093    * correctly set the attr overrides.
1094    *
1095    * _NEW_POLYGON
1096    * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
1097    */
1098   bool drawing_points = brw_is_drawing_points(brw);
1099
1100   for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) {
1101      uint8_t attr = wm_prog_data->urb_setup_attribs[idx];
1102      int input_index = wm_prog_data->urb_setup[attr];
1103
1104      assert(0 <= input_index);
1105
1106      /* _NEW_POINT */
1107      bool point_sprite = false;
1108      if (drawing_points) {
1109         if (point->PointSprite &&
1110             (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
1111             (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
1112            point_sprite = true;
1113         }
1114
1115         if (attr == VARYING_SLOT_PNTC)
1116            point_sprite = true;
1117
1118         if (point_sprite)
1119            *point_sprite_enables |= (1 << input_index);
1120      }
1121
1122      /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
1123      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
1124
1125      if (!point_sprite) {
1126         genX(get_attr_override)(&attribute,
1127                                 &brw->vue_map_geom_out,
1128                                 *urb_entry_read_offset, attr,
1129                                 _mesa_vertex_program_two_side_enabled(ctx),
1130                                 &max_source_attr);
1131      }
1132
1133      /* The hardware can only do the overrides on 16 overrides at a
1134       * time, and the other up to 16 have to be lined up so that the
1135       * input index = the output index.  We'll need to do some
1136       * tweaking to make sure that's the case.
1137       */
1138      if (input_index < 16)
1139         attr_overrides[input_index] = attribute;
1140      else
1141         assert(attribute.SourceAttribute == input_index);
1142   }
1143
1144   /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
1145    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
1146    *
1147    * "This field should be set to the minimum length required to read the
1148    *  maximum source attribute.  The maximum source attribute is indicated
1149    *  by the maximum value of the enabled Attribute # Source Attribute if
1150    *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
1151    *  enable is not set.
1152    *  read_length = ceiling((max_source_attr + 1) / 2)
1153    *
1154    *  [errata] Corruption/Hang possible if length programmed larger than
1155    *  recommended"
1156    *
1157    * Similar text exists for Ivy Bridge.
1158    */
1159   *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
1160}
1161#endif
1162
1163/* ---------------------------------------------------------------------- */
1164
/* DEPTH_STENCIL_GENXML names the genxml struct selected for the GFX_VER
 * being compiled: 3DSTATE_WM_DEPTH_STENCIL for GFX_VER >= 8,
 * DEPTH_STENCIL_STATE for GFX_VER 6 and 7, and COLOR_CALC_STATE below that.
 */
#if GFX_VER >= 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GFX_VER >= 6
typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
#endif
1172
1173static inline void
1174set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
1175{
1176   struct gl_context *ctx = &brw->ctx;
1177
1178   /* _NEW_BUFFERS */
1179   struct brw_renderbuffer *depth_irb =
1180      brw_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
1181
1182   /* _NEW_DEPTH */
1183   struct gl_depthbuffer_attrib *depth = &ctx->Depth;
1184
1185   /* _NEW_STENCIL */
1186   struct gl_stencil_attrib *stencil = &ctx->Stencil;
1187   const int b = stencil->_BackFace;
1188
1189   if (depth->Test && depth_irb) {
1190      ds->DepthTestEnable = true;
1191      ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
1192      ds->DepthTestFunction = brw_translate_compare_func(depth->Func);
1193   }
1194
1195   if (brw->stencil_enabled) {
1196      ds->StencilTestEnable = true;
1197      ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
1198      ds->StencilTestMask = stencil->ValueMask[0] & 0xff;
1199
1200      ds->StencilTestFunction =
1201         brw_translate_compare_func(stencil->Function[0]);
1202      ds->StencilFailOp =
1203         brw_translate_stencil_op(stencil->FailFunc[0]);
1204      ds->StencilPassDepthPassOp =
1205         brw_translate_stencil_op(stencil->ZPassFunc[0]);
1206      ds->StencilPassDepthFailOp =
1207         brw_translate_stencil_op(stencil->ZFailFunc[0]);
1208
1209      ds->StencilBufferWriteEnable = brw->stencil_write_enabled;
1210
1211      if (brw->stencil_two_sided) {
1212         ds->DoubleSidedStencilEnable = true;
1213         ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
1214         ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
1215
1216         ds->BackfaceStencilTestFunction =
1217            brw_translate_compare_func(stencil->Function[b]);
1218         ds->BackfaceStencilFailOp =
1219            brw_translate_stencil_op(stencil->FailFunc[b]);
1220         ds->BackfaceStencilPassDepthPassOp =
1221            brw_translate_stencil_op(stencil->ZPassFunc[b]);
1222         ds->BackfaceStencilPassDepthFailOp =
1223            brw_translate_stencil_op(stencil->ZFailFunc[b]);
1224      }
1225
1226#if GFX_VER <= 5 || GFX_VER >= 9
1227      ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
1228      ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
1229#endif
1230   }
1231}
1232
1233#if GFX_VER >= 6
1234static void
1235genX(upload_depth_stencil_state)(struct brw_context *brw)
1236{
1237#if GFX_VER >= 8
1238   brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
1239      set_depth_stencil_bits(brw, &wmds);
1240   }
1241#else
1242   uint32_t ds_offset;
1243   brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
1244      set_depth_stencil_bits(brw, &ds);
1245   }
1246
1247   /* Now upload a pointer to the indirect state */
1248#if GFX_VER == 6
1249   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
1250      ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1251      ptr.DEPTH_STENCIL_STATEChange = true;
1252   }
1253#else
1254   brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
1255      ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
1256   }
1257#endif
1258#endif
1259}
1260
1261static const struct brw_tracked_state genX(depth_stencil_state) = {
1262   .dirty = {
1263      .mesa = _NEW_BUFFERS |
1264              _NEW_DEPTH |
1265              _NEW_STENCIL,
1266      .brw  = BRW_NEW_BLORP |
1267              (GFX_VER >= 8 ? BRW_NEW_CONTEXT
1268                            : BRW_NEW_BATCH |
1269                              BRW_NEW_STATE_BASE_ADDRESS),
1270   },
1271   .emit = genX(upload_depth_stencil_state),
1272};
1273#endif
1274
1275/* ---------------------------------------------------------------------- */
1276
1277#if GFX_VER <= 5
1278
1279static void
1280genX(upload_clip_state)(struct brw_context *brw)
1281{
1282   struct gl_context *ctx = &brw->ctx;
1283
1284   ctx->NewDriverState |= BRW_NEW_GFX4_UNIT_STATE;
1285   brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) {
1286      clip.KernelStartPointer = KSP(brw, brw->clip.prog_offset);
1287      clip.GRFRegisterCount =
1288         DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1;
1289      clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1290      clip.SingleProgramFlow = true;
1291      clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length;
1292      clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length;
1293
1294      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
1295      clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2;
1296      clip.DispatchGRFStartRegisterForURBData = 1;
1297      clip.VertexURBEntryReadOffset = 0;
1298
1299      /* BRW_NEW_URB_FENCE */
1300      clip.NumberofURBEntries = brw->urb.nr_clip_entries;
1301      clip.URBEntryAllocationSize = brw->urb.vsize - 1;
1302
1303      if (brw->urb.nr_clip_entries >= 10) {
1304         /* Half of the URB entries go to each thread, and it has to be an
1305          * even number.
1306          */
1307         assert(brw->urb.nr_clip_entries % 2 == 0);
1308
1309         /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
1310          * only 2 threads can output VUEs at a time.
1311          */
1312         clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1;
1313      } else {
1314         assert(brw->urb.nr_clip_entries >= 5);
1315         clip.MaximumNumberofThreads = 1 - 1;
1316      }
1317
1318      clip.VertexPositionSpace = VPOS_NDCSPACE;
1319      clip.UserClipFlagsMustClipEnable = true;
1320      clip.GuardbandClipTestEnable = true;
1321
1322      clip.ClipperViewportStatePointer =
1323         ro_bo(brw->batch.state.bo, brw->clip.vp_offset);
1324
1325      clip.ScreenSpaceViewportXMin = -1;
1326      clip.ScreenSpaceViewportXMax = 1;
1327      clip.ScreenSpaceViewportYMin = -1;
1328      clip.ScreenSpaceViewportYMax = 1;
1329
1330      clip.ViewportXYClipTestEnable = true;
1331      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
1332                                       ctx->Transform.DepthClampFar);
1333
1334      /* _NEW_TRANSFORM */
1335      if (GFX_VER == 5 || GFX_VERx10 == 45) {
1336         clip.UserClipDistanceClipTestEnableBitmask =
1337            ctx->Transform.ClipPlanesEnabled;
1338      } else {
1339         /* Up to 6 actual clip flags, plus the 7th for the negative RHW
1340          * workaround.
1341          */
1342         clip.UserClipDistanceClipTestEnableBitmask =
1343            (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40;
1344      }
1345
1346      if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1347         clip.APIMode = APIMODE_D3D;
1348      else
1349         clip.APIMode = APIMODE_OGL;
1350
1351      clip.GuardbandClipTestEnable = true;
1352
1353      clip.ClipMode = brw->clip.prog_data->clip_mode;
1354
1355#if GFX_VERx10 == 45
1356      clip.NegativeWClipTestEnable = true;
1357#endif
1358   }
1359}
1360
1361const struct brw_tracked_state genX(clip_state) = {
1362   .dirty = {
1363      .mesa  = _NEW_TRANSFORM |
1364               _NEW_VIEWPORT,
1365      .brw   = BRW_NEW_BATCH |
1366               BRW_NEW_BLORP |
1367               BRW_NEW_CLIP_PROG_DATA |
1368               BRW_NEW_PUSH_CONSTANT_ALLOCATION |
1369               BRW_NEW_PROGRAM_CACHE |
1370               BRW_NEW_URB_FENCE,
1371   },
1372   .emit = genX(upload_clip_state),
1373};
1374
1375#else
1376
1377static void
1378genX(upload_clip_state)(struct brw_context *brw)
1379{
1380   struct gl_context *ctx = &brw->ctx;
1381
1382   /* _NEW_BUFFERS */
1383   struct gl_framebuffer *fb = ctx->DrawBuffer;
1384
1385   /* BRW_NEW_FS_PROG_DATA */
1386   struct brw_wm_prog_data *wm_prog_data =
1387      brw_wm_prog_data(brw->wm.base.prog_data);
1388
1389   brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) {
1390      clip.StatisticsEnable = !brw->meta_in_progress;
1391
1392      if (wm_prog_data->barycentric_interp_modes &
1393          BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
1394         clip.NonPerspectiveBarycentricEnable = true;
1395
1396#if GFX_VER >= 7
1397      clip.EarlyCullEnable = true;
1398#endif
1399
1400#if GFX_VER == 7
1401      clip.FrontWinding = brw->polygon_front_bit != fb->FlipY;
1402
1403      if (ctx->Polygon.CullFlag) {
1404         switch (ctx->Polygon.CullFaceMode) {
1405         case GL_FRONT:
1406            clip.CullMode = CULLMODE_FRONT;
1407            break;
1408         case GL_BACK:
1409            clip.CullMode = CULLMODE_BACK;
1410            break;
1411         case GL_FRONT_AND_BACK:
1412            clip.CullMode = CULLMODE_BOTH;
1413            break;
1414         default:
1415            unreachable("Should not get here: invalid CullFlag");
1416         }
1417      } else {
1418         clip.CullMode = CULLMODE_NONE;
1419      }
1420#endif
1421
1422#if GFX_VER < 8
1423      clip.UserClipDistanceCullTestEnableBitmask =
1424         brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;
1425
1426      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
1427                                       ctx->Transform.DepthClampFar);
1428#endif
1429
1430      /* _NEW_LIGHT */
1431      if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
1432         clip.TriangleStripListProvokingVertexSelect = 0;
1433         clip.TriangleFanProvokingVertexSelect = 1;
1434         clip.LineStripListProvokingVertexSelect = 0;
1435      } else {
1436         clip.TriangleStripListProvokingVertexSelect = 2;
1437         clip.TriangleFanProvokingVertexSelect = 2;
1438         clip.LineStripListProvokingVertexSelect = 1;
1439      }
1440
1441      /* _NEW_TRANSFORM */
1442      clip.UserClipDistanceClipTestEnableBitmask =
1443         ctx->Transform.ClipPlanesEnabled;
1444
1445#if GFX_VER >= 8
1446      clip.ForceUserClipDistanceClipTestEnableBitmask = true;
1447#endif
1448
1449      if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
1450         clip.APIMode = APIMODE_D3D;
1451      else
1452         clip.APIMode = APIMODE_OGL;
1453
1454      clip.GuardbandClipTestEnable = true;
1455
1456      /* BRW_NEW_VIEWPORT_COUNT */
1457      const unsigned viewport_count = brw->clip.viewport_count;
1458
1459      if (ctx->RasterDiscard) {
1460         clip.ClipMode = CLIPMODE_REJECT_ALL;
1461#if GFX_VER == 6
1462         perf_debug("Rasterizer discard is currently implemented via the "
1463                    "clipper; having the GS not write primitives would "
1464                    "likely be faster.\n");
1465#endif
1466      } else {
1467         clip.ClipMode = CLIPMODE_NORMAL;
1468      }
1469
1470      clip.ClipEnable = true;
1471
1472      /* _NEW_POLYGON,
1473       * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
1474       */
1475      if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
1476         clip.ViewportXYClipTestEnable = true;
1477
1478      clip.MinimumPointWidth = 0.125;
1479      clip.MaximumPointWidth = 255.875;
1480      clip.MaximumVPIndex = viewport_count - 1;
1481      if (_mesa_geometric_layers(fb) == 0)
1482         clip.ForceZeroRTAIndexEnable = true;
1483   }
1484}
1485
1486static const struct brw_tracked_state genX(clip_state) = {
1487   .dirty = {
1488      .mesa  = _NEW_BUFFERS |
1489               _NEW_LIGHT |
1490               _NEW_POLYGON |
1491               _NEW_TRANSFORM,
1492      .brw   = BRW_NEW_BLORP |
1493               BRW_NEW_CONTEXT |
1494               BRW_NEW_FS_PROG_DATA |
1495               BRW_NEW_GS_PROG_DATA |
1496               BRW_NEW_VS_PROG_DATA |
1497               BRW_NEW_META_IN_PROGRESS |
1498               BRW_NEW_PRIMITIVE |
1499               BRW_NEW_RASTERIZER_DISCARD |
1500               BRW_NEW_TES_PROG_DATA |
1501               BRW_NEW_VIEWPORT_COUNT,
1502   },
1503   .emit = genX(upload_clip_state),
1504};
1505#endif
1506
1507/* ---------------------------------------------------------------------- */
1508
/**
 * Upload the SF (strips and fans / setup) state for the compiled GFX_VER.
 *
 * GFX_VER < 6 emits an SF_STATE unit-state structure through
 * brw_state_emit() and flags BRW_NEW_GFX4_UNIT_STATE; GFX_VER >= 6 emits a
 * 3DSTATE_SF packet through brw_batch_emit().  Either way the structure is
 * named "sf", and the bulk of the function fills in whichever fields exist
 * for the version being compiled.  On GFX_VER == 6 the attribute overrides
 * and URB read window are also programmed here via
 * genX(calculate_attr_overrides).
 */
static void
genX(upload_sf)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   float point_size;

#if GFX_VER <= 7
   /* _NEW_BUFFERS */
   bool flip_y = ctx->DrawBuffer->FlipY;
   /* Only read inside the GFX_VER >= 6 section below, hence UNUSED. */
   UNUSED const bool multisampled_fbo =
      _mesa_geometric_samples(ctx->DrawBuffer) > 1;
#endif

   /* The two preprocessor branches below each open an emit block on "sf";
    * the block is closed once, by the brace at the end of the function
    * body.
    */
#if GFX_VER < 6
   const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;

   ctx->NewDriverState |= BRW_NEW_GFX4_UNIT_STATE;

   brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
      sf.KernelStartPointer = KSP(brw, brw->sf.prog_offset);
      sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
      sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
      sf.DispatchGRFStartRegisterForURBData = 3;
      sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
      sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
      sf.NumberofURBEntries = brw->urb.nr_sf_entries;
      sf.URBEntryAllocationSize = brw->urb.sfsize - 1;

      /* STATE_PREFETCH command description describes this state as being
       * something loaded through the GPE (L2 ISC), so it's INSTRUCTION
       * domain.
       */
      sf.SetupViewportStateOffset =
         ro_bo(brw->batch.state.bo, brw->sf.vp_offset);

      sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;

      /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
      /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */

      /* Thread count is capped by the number of SF URB entries. */
      sf.MaximumNumberofThreads =
         MIN2(GFX_VER == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;

      sf.SpritePointEnable = ctx->Point.PointSprite;

      sf.DestinationOriginHorizontalBias = 0.5;
      sf.DestinationOriginVerticalBias = 0.5;
#else
   brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
      sf.StatisticsEnable = true;
#endif
      sf.ViewportTransformEnable = true;

#if GFX_VER == 7
      /* _NEW_BUFFERS */
      sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw);
#endif

#if GFX_VER <= 7
      /* _NEW_POLYGON */
      sf.FrontWinding = brw->polygon_front_bit != flip_y;
#if GFX_VER >= 6
      sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
      sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
      sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;

      switch (ctx->Polygon.FrontMode) {
         case GL_FILL:
            sf.FrontFaceFillMode = FILL_MODE_SOLID;
            break;
         case GL_LINE:
            sf.FrontFaceFillMode = FILL_MODE_WIREFRAME;
            break;
         case GL_POINT:
            sf.FrontFaceFillMode = FILL_MODE_POINT;
            break;
         default:
            unreachable("not reached");
      }

      switch (ctx->Polygon.BackMode) {
         case GL_FILL:
            sf.BackFaceFillMode = FILL_MODE_SOLID;
            break;
         case GL_LINE:
            sf.BackFaceFillMode = FILL_MODE_WIREFRAME;
            break;
         case GL_POINT:
            sf.BackFaceFillMode = FILL_MODE_POINT;
            break;
         default:
            unreachable("not reached");
      }

      if (multisampled_fbo && ctx->Multisample.Enabled)
         sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;

      sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
      sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
      sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
#endif

      sf.ScissorRectangleEnable = true;

      if (ctx->Polygon.CullFlag) {
         switch (ctx->Polygon.CullFaceMode) {
            case GL_FRONT:
               sf.CullMode = CULLMODE_FRONT;
               break;
            case GL_BACK:
               sf.CullMode = CULLMODE_BACK;
               break;
            case GL_FRONT_AND_BACK:
               sf.CullMode = CULLMODE_BOTH;
               break;
            default:
               unreachable("not reached");
         }
      } else {
         sf.CullMode = CULLMODE_NONE;
      }

#if GFX_VERx10 == 75
      sf.LineStippleEnable = ctx->Line.StippleFlag;
#endif

#endif

      /* _NEW_LINE */
#if GFX_VER == 8
      const struct intel_device_info *devinfo = &brw->screen->devinfo;

      /* Cherryview takes the line width in a separate CHVLineWidth field. */
      if (devinfo->is_cherryview)
         sf.CHVLineWidth = brw_get_line_width(brw);
      else
         sf.LineWidth = brw_get_line_width(brw);
#else
      sf.LineWidth = brw_get_line_width(brw);
#endif

      if (ctx->Line.SmoothFlag) {
         sf.LineEndCapAntialiasingRegionWidth = _10pixels;
#if GFX_VER <= 7
         sf.AntialiasingEnable = true;
#endif
      }

      /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
      point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
      /* Clamp to the hardware limits */
      sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f);

      /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
      if (use_state_point_size(brw))
         sf.PointWidthSource = State;

#if GFX_VER >= 8
      /* _NEW_POINT | _NEW_MULTISAMPLE */
      if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
          !ctx->Point.PointSprite)
         sf.SmoothPointEnable = true;
#endif

#if GFX_VER == 10
      /* _NEW_BUFFERS
       * Smooth Point Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
       */
      const bool multisampled_fbo =
         _mesa_geometric_samples(ctx->DrawBuffer) > 1;
      if (multisampled_fbo)
         sf.SmoothPointEnable = false;
#endif

#if GFX_VERx10 >= 45
      sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
#endif

      /* _NEW_LIGHT */
      if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
         sf.TriangleStripListProvokingVertexSelect = 2;
         sf.TriangleFanProvokingVertexSelect = 2;
         sf.LineStripListProvokingVertexSelect = 1;
      } else {
         /* Only the triangle fan select is written for the first-vertex
          * convention; the other two selects are not assigned here.
          */
         sf.TriangleFanProvokingVertexSelect = 1;
      }

#if GFX_VER == 6
      /* BRW_NEW_FS_PROG_DATA */
      const struct brw_wm_prog_data *wm_prog_data =
         brw_wm_prog_data(brw->wm.base.prog_data);

      sf.AttributeSwizzleEnable = true;
      sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;

      /*
       * Window coordinates in an FBO are inverted, which means point
       * sprite origin must be inverted, too.
       */
      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) {
         sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
      } else {
         sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
      }

      /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
       * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
       */
      uint32_t urb_entry_read_length;
      uint32_t urb_entry_read_offset;
      uint32_t point_sprite_enables;
      /* Fills sf.Attribute and the three outputs declared above. */
      genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables,
                                     &urb_entry_read_length,
                                     &urb_entry_read_offset);
      sf.VertexURBEntryReadLength = urb_entry_read_length;
      sf.VertexURBEntryReadOffset = urb_entry_read_offset;
      sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
      sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
#endif
   }
}
1729
1730static const struct brw_tracked_state genX(sf_state) = {
1731   .dirty = {
1732      .mesa  = _NEW_LIGHT |
1733               _NEW_LINE |
1734               _NEW_POINT |
1735               _NEW_PROGRAM |
1736               (GFX_VER >= 6 ? _NEW_MULTISAMPLE : 0) |
1737               (GFX_VER <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0) |
1738               (GFX_VER == 10 ? _NEW_BUFFERS : 0),
1739      .brw   = BRW_NEW_BLORP |
1740               BRW_NEW_VUE_MAP_GEOM_OUT |
1741               (GFX_VER <= 5 ? BRW_NEW_BATCH |
1742                               BRW_NEW_PROGRAM_CACHE |
1743                               BRW_NEW_SF_PROG_DATA |
1744                               BRW_NEW_SF_VP |
1745                               BRW_NEW_URB_FENCE
1746                             : 0) |
1747               (GFX_VER >= 6 ? BRW_NEW_CONTEXT : 0) |
1748               (GFX_VER >= 6 && GFX_VER <= 7 ?
1749                               BRW_NEW_GS_PROG_DATA |
1750                               BRW_NEW_PRIMITIVE |
1751                               BRW_NEW_TES_PROG_DATA
1752                             : 0) |
1753               (GFX_VER == 6 ? BRW_NEW_FS_PROG_DATA |
1754                               BRW_NEW_FRAGMENT_PROGRAM
1755                             : 0),
1756   },
1757   .emit = genX(upload_sf),
1758};
1759
1760/* ---------------------------------------------------------------------- */
1761
1762static bool
1763brw_color_buffer_write_enabled(struct brw_context *brw)
1764{
1765   struct gl_context *ctx = &brw->ctx;
1766   /* BRW_NEW_FRAGMENT_PROGRAM */
1767   const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1768   unsigned i;
1769
1770   /* _NEW_BUFFERS */
1771   for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
1772      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
1773      uint64_t outputs_written = fp->info.outputs_written;
1774
1775      /* _NEW_COLOR */
1776      if (rb && (outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR) ||
1777                 outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA0 + i)) &&
1778          GET_COLORMASK(ctx->Color.ColorMask, i)) {
1779         return true;
1780      }
1781   }
1782
1783   return false;
1784}
1785
1786static void
1787genX(upload_wm)(struct brw_context *brw)
1788{
1789   struct gl_context *ctx = &brw->ctx;
1790
1791   /* BRW_NEW_FS_PROG_DATA */
1792   const struct brw_wm_prog_data *wm_prog_data =
1793      brw_wm_prog_data(brw->wm.base.prog_data);
1794
1795   UNUSED bool writes_depth =
1796      wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
1797   UNUSED struct brw_stage_state *stage_state = &brw->wm.base;
1798   UNUSED const struct intel_device_info *devinfo = &brw->screen->devinfo;
1799
1800#if GFX_VER == 6
1801   /* We can't fold this into gfx6_upload_wm_push_constants(), because
1802    * according to the SNB PRM, vol 2 part 1 section 7.2.2
1803    * (3DSTATE_CONSTANT_PS [DevSNB]):
1804    *
1805    *     "[DevSNB]: This packet must be followed by WM_STATE."
1806    */
1807   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) {
1808      if (wm_prog_data->base.nr_params != 0) {
1809         wmcp.Buffer0Valid = true;
1810         /* Pointer to the WM constant buffer.  Covered by the set of
1811          * state flags from gfx6_upload_wm_push_constants.
1812          */
1813         wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
1814         wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
1815      }
1816   }
1817#endif
1818
1819#if GFX_VER >= 6
1820   brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
1821#else
1822   ctx->NewDriverState |= BRW_NEW_GFX4_UNIT_STATE;
1823   brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
1824#endif
1825
1826#if GFX_VER <= 6
1827      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
1828      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
1829      wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
1830#endif
1831
1832#if GFX_VER == 4
1833      /* On gfx4, we only have one shader kernel */
1834      if (brw_wm_state_has_ksp(wm, 0)) {
1835         assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0);
1836         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
1837         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
1838         wm.DispatchGRFStartRegisterForConstantSetupData0 =
1839            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
1840      }
1841#elif GFX_VER == 5
1842      /* On gfx5, we have multiple shader kernels but only one GRF start
1843       * register for all kernels
1844       */
1845      wm.KernelStartPointer0 = stage_state->prog_offset +
1846                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
1847      wm.KernelStartPointer1 = stage_state->prog_offset +
1848                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
1849      wm.KernelStartPointer2 = stage_state->prog_offset +
1850                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
1851
1852      wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
1853      wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
1854      wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);
1855
1856      wm.DispatchGRFStartRegisterForConstantSetupData0 =
1857         wm_prog_data->base.dispatch_grf_start_reg;
1858
1859      /* Dispatch GRF Start should be the same for all shaders on gfx5 */
1860      if (brw_wm_state_has_ksp(wm, 1)) {
1861         assert(wm_prog_data->base.dispatch_grf_start_reg ==
1862                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1));
1863      }
1864      if (brw_wm_state_has_ksp(wm, 2)) {
1865         assert(wm_prog_data->base.dispatch_grf_start_reg ==
1866                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2));
1867      }
1868#elif GFX_VER == 6
1869      /* On gfx6, we have multiple shader kernels and we no longer specify a
1870       * register count for each one.
1871       */
1872      wm.KernelStartPointer0 = stage_state->prog_offset +
1873                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
1874      wm.KernelStartPointer1 = stage_state->prog_offset +
1875                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
1876      wm.KernelStartPointer2 = stage_state->prog_offset +
1877                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);
1878
1879      wm.DispatchGRFStartRegisterForConstantSetupData0 =
1880         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
1881      wm.DispatchGRFStartRegisterForConstantSetupData1 =
1882         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
1883      wm.DispatchGRFStartRegisterForConstantSetupData2 =
1884         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
1885#endif
1886
1887#if GFX_VER <= 5
1888      wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
1889      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
1890      wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
1891      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
1892      wm.SetupURBEntryReadOffset = 0;
1893      wm.EarlyDepthTestEnable = true;
1894#endif
1895
1896#if GFX_VER >= 6
1897      wm.LineAntialiasingRegionWidth = _10pixels;
1898      wm.LineEndCapAntialiasingRegionWidth = _05pixels;
1899
1900      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
1901      wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
1902#else
1903      if (stage_state->sampler_count)
1904         wm.SamplerStatePointer =
1905            ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
1906
1907      wm.LineAntialiasingRegionWidth = _05pixels;
1908      wm.LineEndCapAntialiasingRegionWidth = _10pixels;
1909
1910      /* _NEW_POLYGON */
1911      if (ctx->Polygon.OffsetFill) {
1912         wm.GlobalDepthOffsetEnable = true;
1913         /* Something weird going on with legacy_global_depth_bias,
1914          * offset_constant, scaling and MRD.  This value passes glean
1915          * but gives some odd results elsewere (eg. the
1916          * quad-offset-units test).
1917          */
1918         wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
1919
1920         /* This is the only value that passes glean:
1921         */
1922         wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
1923      }
1924
1925      wm.DepthCoefficientURBReadOffset = 1;
1926#endif
1927
1928      /* BRW_NEW_STATS_WM */
1929      wm.StatisticsEnable = GFX_VER >= 6 || brw->stats_wm;
1930
1931#if GFX_VER < 7
1932      if (wm_prog_data->base.use_alt_mode)
1933         wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
1934
1935      wm.SamplerCount = GFX_VER == 5 ?
1936         0 : DIV_ROUND_UP(stage_state->sampler_count, 4);
1937
1938      wm.BindingTableEntryCount =
1939         wm_prog_data->base.binding_table.size_bytes / 4;
1940      wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
1941
1942#if GFX_VER == 6
1943      wm.DualSourceBlendEnable =
1944         wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
1945         ctx->Color._BlendUsesDualSrc & 0x1;
1946      wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
1947      wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
1948
1949      /* From the SNB PRM, volume 2 part 1, page 281:
1950       * "If the PS kernel does not need the Position XY Offsets
1951       * to compute a Position XY value, then this field should be
1952       * programmed to POSOFFSET_NONE."
1953       *
1954       * "SW Recommendation: If the PS kernel needs the Position Offsets
1955       * to compute a Position XY value, this field should match Position
1956       * ZW Interpolation Mode to ensure a consistent position.xyzw
1957       * computation."
1958       * We only require XY sample offsets. So, this recommendation doesn't
1959       * look useful at the moment. We might need this in future.
1960       */
1961      if (wm_prog_data->uses_pos_offset)
1962         wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
1963      else
1964         wm.PositionXYOffsetSelect = POSOFFSET_NONE;
1965#endif
1966
1967      if (wm_prog_data->base.total_scratch) {
1968         wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
1969         wm.PerThreadScratchSpace =
1970            ffs(stage_state->per_thread_scratch) - 11;
1971      }
1972
1973      wm.PixelShaderComputedDepth = writes_depth;
1974#endif
1975
1976      /* _NEW_LINE */
1977      wm.LineStippleEnable = ctx->Line.StippleFlag;
1978
1979      /* _NEW_POLYGON */
1980      wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;
1981
1982#if GFX_VER < 8
1983
1984#if GFX_VER >= 6
1985      wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
1986
1987      /* _NEW_BUFFERS */
1988      const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
1989
1990      if (multisampled_fbo) {
1991         /* _NEW_MULTISAMPLE */
1992         if (ctx->Multisample.Enabled)
1993            wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
1994         else
1995            wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
1996
1997         if (wm_prog_data->persample_dispatch)
1998            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
1999         else
2000            wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
2001      } else {
2002         wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
2003         wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
2004      }
2005#endif
2006      wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
2007      if (wm_prog_data->uses_kill ||
2008          _mesa_is_alpha_test_enabled(ctx) ||
2009          _mesa_is_alpha_to_coverage_enabled(ctx) ||
2010          (GFX_VER >= 6 && wm_prog_data->uses_omask)) {
2011         wm.PixelShaderKillsPixel = true;
2012      }
2013
2014      /* _NEW_BUFFERS | _NEW_COLOR */
2015      if (brw_color_buffer_write_enabled(brw) || writes_depth ||
2016          wm.PixelShaderKillsPixel ||
2017          (GFX_VER >= 6 && wm_prog_data->has_side_effects)) {
2018         wm.ThreadDispatchEnable = true;
2019      }
2020
2021#if GFX_VER >= 7
2022      wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
2023      wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
2024#endif
2025
2026      /* The "UAV access enable" bits are unnecessary on HSW because they only
2027       * seem to have an effect on the HW-assisted coherency mechanism which we
2028       * don't need, and the rasterization-related UAV_ONLY flag and the
2029       * DISPATCH_ENABLE bit can be set independently from it.
2030       * C.f. gfx8_upload_ps_extra().
2031       *
2032       * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
2033       * _NEW_COLOR
2034       */
2035#if GFX_VERx10 == 75
2036      if (!(brw_color_buffer_write_enabled(brw) || writes_depth) &&
2037          wm_prog_data->has_side_effects)
2038         wm.PSUAVonly = ON;
2039#endif
2040#endif
2041
2042#if GFX_VER >= 7
2043      /* BRW_NEW_FS_PROG_DATA */
2044      if (wm_prog_data->early_fragment_tests)
2045         wm.EarlyDepthStencilControl = EDSC_PREPS;
2046      else if (wm_prog_data->has_side_effects)
2047         wm.EarlyDepthStencilControl = EDSC_PSEXEC;
2048#endif
2049   }
2050
2051#if GFX_VER <= 5
2052   if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) {
2053      brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
2054         clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
2055      }
2056
2057      brw->wm.offset_clamp = ctx->Polygon.OffsetClamp;
2058   }
2059#endif
2060}
2061
2062static const struct brw_tracked_state genX(wm_state) = {
2063   .dirty = {
2064      .mesa  = _NEW_LINE |
2065               _NEW_POLYGON |
2066               (GFX_VER < 8 ? _NEW_BUFFERS |
2067                              _NEW_COLOR :
2068                              0) |
2069               (GFX_VER == 6 ? _NEW_PROGRAM_CONSTANTS : 0) |
2070               (GFX_VER < 6 ? _NEW_POLYGONSTIPPLE : 0) |
2071               (GFX_VER < 8 && GFX_VER >= 6 ? _NEW_MULTISAMPLE : 0),
2072      .brw   = BRW_NEW_BLORP |
2073               BRW_NEW_FS_PROG_DATA |
2074               (GFX_VER < 6 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2075                              BRW_NEW_FRAGMENT_PROGRAM |
2076                              BRW_NEW_PROGRAM_CACHE |
2077                              BRW_NEW_SAMPLER_STATE_TABLE |
2078                              BRW_NEW_STATS_WM
2079                            : 0) |
2080               (GFX_VER < 7 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT),
2081   },
2082   .emit = genX(upload_wm),
2083};
2084
2085/* ---------------------------------------------------------------------- */
2086
2087/* We restrict scratch buffers to the bottom 32 bits of the address space
2088 * by using rw_32_bo().
2089 *
2090 * General State Base Address is a bit broken.  If the address + size as
2091 * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
2092 * all accesses to the buffer as being out of bounds and returns zero.
2093 */
2094
/* Fill in the thread-dispatch fields of packet `pkt`.
 *
 * This expands to a plain list of statements (not a single statement), and
 * it reads `brw`, `stage_state`, `stage_prog_data` and `vue_prog_data` from
 * the scope it is expanded in.  `prefix` is pasted in front of
 * URBEntryReadLength / URBEntryReadOffset to form the field names.
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
   pkt.Enable           = true;                                           \
   pkt.StatisticsEnable = true;                                           \
                                                                          \
   pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
   pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
   pkt.BindingTableEntryCount =                                           \
      stage_prog_data->binding_table.size_bytes / 4;                      \
   /* Wa_1606682166 */                                                    \
   pkt.SamplerCount       =                                               \
      GFX_VER != 11 ?                                                     \
      DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4) :         \
      0;                                                                  \
                                                                          \
   pkt.prefix##URBEntryReadOffset = 0;                                    \
   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
   pkt.DispatchGRFStartRegisterForURBData =                               \
      stage_prog_data->dispatch_grf_start_reg;                            \
                                                                          \
   if (stage_prog_data->total_scratch) {                                  \
      pkt.PerThreadScratchSpace =                                         \
         ffs(stage_state->per_thread_scratch) - 11;                       \
      pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
   }
2119
2120static void
2121genX(upload_vs_state)(struct brw_context *brw)
2122{
2123   UNUSED struct gl_context *ctx = &brw->ctx;
2124   const struct intel_device_info *devinfo = &brw->screen->devinfo;
2125   struct brw_stage_state *stage_state = &brw->vs.base;
2126
2127   /* BRW_NEW_VS_PROG_DATA */
2128   const struct brw_vue_prog_data *vue_prog_data =
2129      brw_vue_prog_data(brw->vs.base.prog_data);
2130   const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base;
2131
2132   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
2133          vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
2134   assert(GFX_VER < 11 ||
2135          vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
2136
2137#if GFX_VER == 6
2138   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
2139    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
2140    *
2141    *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
2142    *   command that causes the VS Function Enable to toggle. Pipeline
2143    *   flush can be executed by sending a PIPE_CONTROL command with CS
2144    *   stall bit set and a post sync operation.
2145    *
2146    * We've already done such a flush at the start of state upload, so we
2147    * don't need to do another one here.
2148    */
2149   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
2150      if (stage_state->push_const_size != 0) {
2151         cvs.Buffer0Valid = true;
2152         cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
2153         cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
2154      }
2155   }
2156#endif
2157
2158   if (GFX_VER == 7 && devinfo->is_ivybridge)
2159      gfx7_emit_vs_workaround_flush(brw);
2160
2161#if GFX_VER >= 6
2162   brw_batch_emit(brw, GENX(3DSTATE_VS), vs) {
2163#else
2164   ctx->NewDriverState |= BRW_NEW_GFX4_UNIT_STATE;
2165   brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) {
2166#endif
2167      INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);
2168
2169      vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;
2170
2171#if GFX_VER < 6
2172      vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
2173      vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length;
2174      vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2;
2175
2176      vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0);
2177      vs.URBEntryAllocationSize = brw->urb.vsize - 1;
2178
2179      vs.MaximumNumberofThreads =
2180         CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1;
2181
2182      vs.StatisticsEnable = false;
2183      vs.SamplerStatePointer =
2184         ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
2185#endif
2186
2187#if GFX_VER == 5
2188      /* Force single program flow on Ironlake.  We cannot reliably get
2189       * all applications working without it.  See:
2190       * https://bugs.freedesktop.org/show_bug.cgi?id=29172
2191       *
2192       * The most notable and reliably failing application is the Humus
2193       * demo "CelShading"
2194       */
2195      vs.SingleProgramFlow = true;
2196      vs.SamplerCount = 0; /* hardware requirement */
2197#endif
2198
2199#if GFX_VER >= 8
2200      vs.SIMD8DispatchEnable =
2201         vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
2202
2203      vs.UserClipDistanceCullTestEnableBitmask =
2204         vue_prog_data->cull_distance_mask;
2205#endif
2206   }
2207
2208#if GFX_VER == 6
2209   /* Based on my reading of the simulator, the VS constants don't get
2210    * pulled into the VS FF unit until an appropriate pipeline flush
2211    * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
2212    * references to them into a little FIFO.  The flushes are common,
2213    * but don't reliably happen between this and a 3DPRIMITIVE, causing
2214    * the primitive to use the wrong constants.  Then the FIFO
2215    * containing the constant setup gets added to again on the next
2216    * constants change, and eventually when a flush does happen the
2217    * unit is overwhelmed by constant changes and dies.
2218    *
2219    * To avoid this, send a PIPE_CONTROL down the line that will
2220    * update the unit immediately loading the constants.  The flush
2221    * type bits here were those set by the STATE_BASE_ADDRESS whose
2222    * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
2223    * bug reports that led to this workaround, and may be more than
2224    * what is strictly required to avoid the issue.
2225    */
2226   brw_emit_pipe_control_flush(brw,
2227                               PIPE_CONTROL_DEPTH_STALL |
2228                               PIPE_CONTROL_INSTRUCTION_INVALIDATE |
2229                               PIPE_CONTROL_STATE_CACHE_INVALIDATE);
2230#endif
2231}
2232
2233static const struct brw_tracked_state genX(vs_state) = {
2234   .dirty = {
2235      .mesa  = (GFX_VER == 6 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0),
2236      .brw   = BRW_NEW_BATCH |
2237               BRW_NEW_BLORP |
2238               BRW_NEW_CONTEXT |
2239               BRW_NEW_VS_PROG_DATA |
2240               (GFX_VER == 6 ? BRW_NEW_VERTEX_PROGRAM : 0) |
2241               (GFX_VER <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2242                               BRW_NEW_PROGRAM_CACHE |
2243                               BRW_NEW_SAMPLER_STATE_TABLE |
2244                               BRW_NEW_URB_FENCE
2245                             : 0),
2246   },
2247   .emit = genX(upload_vs_state),
2248};
2249
2250/* ---------------------------------------------------------------------- */
2251
2252static void
2253genX(upload_cc_viewport)(struct brw_context *brw)
2254{
2255   struct gl_context *ctx = &brw->ctx;
2256
2257   /* BRW_NEW_VIEWPORT_COUNT */
2258   const unsigned viewport_count = brw->clip.viewport_count;
2259
2260   struct GENX(CC_VIEWPORT) ccv;
2261   uint32_t cc_vp_offset;
2262   uint32_t *cc_map =
2263      brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * viewport_count,
2264                      32, &cc_vp_offset);
2265
2266   for (unsigned i = 0; i < viewport_count; i++) {
2267      /* _NEW_VIEWPORT | _NEW_TRANSFORM */
2268      const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
2269      if (ctx->Transform.DepthClampNear && ctx->Transform.DepthClampFar) {
2270         ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
2271         ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
2272      } else if (ctx->Transform.DepthClampNear) {
2273         ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
2274         ccv.MaximumDepth = 0.0;
2275      } else if (ctx->Transform.DepthClampFar) {
2276         ccv.MinimumDepth = 0.0;
2277         ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
2278      } else {
2279         ccv.MinimumDepth = 0.0;
2280         ccv.MaximumDepth = 1.0;
2281      }
2282      GENX(CC_VIEWPORT_pack)(NULL, cc_map, &ccv);
2283      cc_map += GENX(CC_VIEWPORT_length);
2284   }
2285
2286#if GFX_VER >= 7
2287   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
2288      ptr.CCViewportPointer = cc_vp_offset;
2289   }
2290#elif GFX_VER == 6
2291   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2292      vp.CCViewportStateChange = 1;
2293      vp.PointertoCC_VIEWPORT = cc_vp_offset;
2294   }
2295#else
2296   brw->cc.vp_offset = cc_vp_offset;
2297   ctx->NewDriverState |= BRW_NEW_CC_VP;
2298#endif
2299}
2300
2301const struct brw_tracked_state genX(cc_vp) = {
2302   .dirty = {
2303      .mesa = _NEW_TRANSFORM |
2304              _NEW_VIEWPORT,
2305      .brw = BRW_NEW_BATCH |
2306             BRW_NEW_BLORP |
2307             BRW_NEW_VIEWPORT_COUNT,
2308   },
2309   .emit = genX(upload_cc_viewport)
2310};
2311
2312/* ---------------------------------------------------------------------- */
2313
2314static void
2315set_scissor_bits(const struct gl_context *ctx, int i,
2316                 bool flip_y, unsigned fb_width, unsigned fb_height,
2317                 struct GENX(SCISSOR_RECT) *sc)
2318{
2319   int bbox[4];
2320
2321   bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
2322   bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
2323   bbox[2] = CLAMP(ctx->ViewportArray[i].Y, 0, fb_height);
2324   bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
2325   _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
2326
2327   if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
2328      /* If the scissor was out of bounds and got clamped to 0 width/height
2329       * at the bounds, the subtraction of 1 from maximums could produce a
2330       * negative number and thus not clip anything.  Instead, just provide
2331       * a min > max scissor inside the bounds, which produces the expected
2332       * no rendering.
2333       */
2334      sc->ScissorRectangleXMin = 1;
2335      sc->ScissorRectangleXMax = 0;
2336      sc->ScissorRectangleYMin = 1;
2337      sc->ScissorRectangleYMax = 0;
2338   } else if (!flip_y) {
2339      /* texmemory: Y=0=bottom */
2340      sc->ScissorRectangleXMin = bbox[0];
2341      sc->ScissorRectangleXMax = bbox[1] - 1;
2342      sc->ScissorRectangleYMin = bbox[2];
2343      sc->ScissorRectangleYMax = bbox[3] - 1;
2344   } else {
2345      /* memory: Y=0=top */
2346      sc->ScissorRectangleXMin = bbox[0];
2347      sc->ScissorRectangleXMax = bbox[1] - 1;
2348      sc->ScissorRectangleYMin = fb_height - bbox[3];
2349      sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
2350   }
2351}
2352
2353#if GFX_VER >= 6
2354static void
2355genX(upload_scissor_state)(struct brw_context *brw)
2356{
2357   struct gl_context *ctx = &brw->ctx;
2358   const bool flip_y = ctx->DrawBuffer->FlipY;
2359   struct GENX(SCISSOR_RECT) scissor;
2360   uint32_t scissor_state_offset;
2361   const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
2362   const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
2363   uint32_t *scissor_map;
2364
2365   /* BRW_NEW_VIEWPORT_COUNT */
2366   const unsigned viewport_count = brw->clip.viewport_count;
2367   /* Wa_1409725701:
2368    *    "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
2369    *    stored as an array of up to 16 elements. The location of first
2370    *    element of the array, as specified by Pointer to SCISSOR_RECT, should
2371    *    be aligned to a 64-byte boundary.
2372    */
2373   const unsigned alignment = 64;
2374   scissor_map = brw_state_batch(
2375      brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
2376      alignment, &scissor_state_offset);
2377
2378   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */
2379
2380   /* The scissor only needs to handle the intersection of drawable and
2381    * scissor rect.  Clipping to the boundaries of static shared buffers
2382    * for front/back/depth is covered by looping over cliprects in brw_draw.c.
2383    *
2384    * Note that the hardware's coordinates are inclusive, while Mesa's min is
2385    * inclusive but max is exclusive.
2386    */
2387   for (unsigned i = 0; i < viewport_count; i++) {
2388      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor);
2389      GENX(SCISSOR_RECT_pack)(
2390         NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
2391   }
2392
2393   brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
2394      ptr.ScissorRectPointer = scissor_state_offset;
2395   }
2396}
2397
2398static const struct brw_tracked_state genX(scissor_state) = {
2399   .dirty = {
2400      .mesa = _NEW_BUFFERS |
2401              _NEW_SCISSOR |
2402              _NEW_VIEWPORT,
2403      .brw = BRW_NEW_BATCH |
2404             BRW_NEW_BLORP |
2405             BRW_NEW_VIEWPORT_COUNT,
2406   },
2407   .emit = genX(upload_scissor_state),
2408};
2409#endif
2410
2411/* ---------------------------------------------------------------------- */
2412
2413static void
2414genX(upload_sf_clip_viewport)(struct brw_context *brw)
2415{
2416   struct gl_context *ctx = &brw->ctx;
2417   float y_scale, y_bias;
2418
2419   /* BRW_NEW_VIEWPORT_COUNT */
2420   const unsigned viewport_count = brw->clip.viewport_count;
2421
2422   /* _NEW_BUFFERS */
2423   const bool flip_y = ctx->DrawBuffer->FlipY;
2424   const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
2425   const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
2426
2427#if GFX_VER >= 7
2428#define clv sfv
2429   struct GENX(SF_CLIP_VIEWPORT) sfv;
2430   uint32_t sf_clip_vp_offset;
2431   uint32_t *sf_clip_map =
2432      brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
2433                      64, &sf_clip_vp_offset);
2434#else
2435   struct GENX(SF_VIEWPORT) sfv;
2436   struct GENX(CLIP_VIEWPORT) clv;
2437   uint32_t sf_vp_offset, clip_vp_offset;
2438   uint32_t *sf_map =
2439      brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
2440                      32, &sf_vp_offset);
2441   uint32_t *clip_map =
2442      brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
2443                      32, &clip_vp_offset);
2444#endif
2445
2446   /* _NEW_BUFFERS */
2447   if (flip_y) {
2448      y_scale = -1.0;
2449      y_bias = (float)fb_height;
2450   } else {
2451      y_scale = 1.0;
2452      y_bias = 0;
2453   }
2454
2455   for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
2456      /* _NEW_VIEWPORT: Guardband Clipping */
2457      float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax;
2458      _mesa_get_viewport_xform(ctx, i, scale, translate);
2459
2460      sfv.ViewportMatrixElementm00 = scale[0];
2461      sfv.ViewportMatrixElementm11 = scale[1] * y_scale,
2462      sfv.ViewportMatrixElementm22 = scale[2],
2463      sfv.ViewportMatrixElementm30 = translate[0],
2464      sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
2465      sfv.ViewportMatrixElementm32 = translate[2],
2466      intel_calculate_guardband_size(fb_width, fb_height,
2467                                     sfv.ViewportMatrixElementm00,
2468                                     sfv.ViewportMatrixElementm11,
2469                                     sfv.ViewportMatrixElementm30,
2470                                     sfv.ViewportMatrixElementm31,
2471                                     &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
2472
2473
2474      clv.XMinClipGuardband = gb_xmin;
2475      clv.XMaxClipGuardband = gb_xmax;
2476      clv.YMinClipGuardband = gb_ymin;
2477      clv.YMaxClipGuardband = gb_ymax;
2478
2479#if GFX_VER < 6
2480      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height,
2481                       &sfv.ScissorRectangle);
2482#elif GFX_VER >= 8
2483      /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
2484       * The hardware will take the intersection of the drawing rectangle,
2485       * scissor rectangle, and the viewport extents.  However, emitting
2486       * 3DSTATE_DRAWING_RECTANGLE is expensive since it requires a full
2487       * pipeline stall so we're better off just being a little more clever
2488       * with our viewport so we can emit it once at context creation time.
2489       */
2490      const float viewport_Xmin = MAX2(ctx->ViewportArray[i].X, 0);
2491      const float viewport_Ymin = MAX2(ctx->ViewportArray[i].Y, 0);
2492      const float viewport_Xmax =
2493         MIN2(ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width, fb_width);
2494      const float viewport_Ymax =
2495         MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
2496
2497      if (flip_y) {
2498         sfv.XMinViewPort = viewport_Xmin;
2499         sfv.XMaxViewPort = viewport_Xmax - 1;
2500         sfv.YMinViewPort = fb_height - viewport_Ymax;
2501         sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
2502      } else {
2503         sfv.XMinViewPort = viewport_Xmin;
2504         sfv.XMaxViewPort = viewport_Xmax - 1;
2505         sfv.YMinViewPort = viewport_Ymin;
2506         sfv.YMaxViewPort = viewport_Ymax - 1;
2507      }
2508#endif
2509
2510#if GFX_VER >= 7
2511      GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
2512      sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
2513#else
2514      GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
2515      GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
2516      sf_map += GENX(SF_VIEWPORT_length);
2517      clip_map += GENX(CLIP_VIEWPORT_length);
2518#endif
2519   }
2520
2521#if GFX_VER >= 7
2522   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
2523      ptr.SFClipViewportPointer = sf_clip_vp_offset;
2524   }
2525#elif GFX_VER == 6
2526   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2527      vp.SFViewportStateChange = 1;
2528      vp.CLIPViewportStateChange = 1;
2529      vp.PointertoCLIP_VIEWPORT = clip_vp_offset;
2530      vp.PointertoSF_VIEWPORT = sf_vp_offset;
2531   }
2532#else
2533   brw->sf.vp_offset = sf_vp_offset;
2534   brw->clip.vp_offset = clip_vp_offset;
2535   brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
2536#endif
2537}
2538
2539static const struct brw_tracked_state genX(sf_clip_viewport) = {
2540   .dirty = {
2541      .mesa = _NEW_BUFFERS |
2542              _NEW_VIEWPORT |
2543              (GFX_VER <= 5 ? _NEW_SCISSOR : 0),
2544      .brw = BRW_NEW_BATCH |
2545             BRW_NEW_BLORP |
2546             BRW_NEW_VIEWPORT_COUNT,
2547   },
2548   .emit = genX(upload_sf_clip_viewport),
2549};
2550
2551/* ---------------------------------------------------------------------- */
2552
/**
 * Emit the geometry-shader unit state for the generation being compiled.
 *
 * For GFX_VER >= 6 the state is a 3DSTATE_GS packet emitted into the batch
 * (preceded by 3DSTATE_CONSTANT_GS on GFX_VER == 6).  For older generations
 * a GS_STATE structure is emitted through brw_state_emit(), which is handed
 * &brw->ff_gs.state_offset, and BRW_NEW_GFX4_UNIT_STATE is flagged.
 *
 * Three mutually exclusive cases are programmed inside the packet:
 *  - "active": a geometry program is bound (only possible on GFX_VER >= 6);
 *  - !active but brw->ff_gs.prog_active (only compiled for GFX_VER <= 6);
 *  - neither: only a couple of fields are set for GFX_VER < 8.
 */
static void
genX(upload_gs_state)(struct brw_context *brw)
{
   UNUSED struct gl_context *ctx = &brw->ctx;
   UNUSED const struct intel_device_info *devinfo = &brw->screen->devinfo;
   const struct brw_stage_state *stage_state = &brw->gs.base;
   const struct gl_program *gs_prog = brw->programs[MESA_SHADER_GEOMETRY];
   /* BRW_NEW_GEOMETRY_PROGRAM */
   bool active = GFX_VER >= 6 && gs_prog;

   /* BRW_NEW_GS_PROG_DATA */
   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
   UNUSED const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(stage_prog_data);
#if GFX_VER >= 7
   const struct brw_gs_prog_data *gs_prog_data =
      brw_gs_prog_data(stage_prog_data);
#endif

#if GFX_VER == 6
   /* Gfx6 only: point the GS at its push constants when a geometry program
    * is bound and it has any.  The read length field is size minus one.
    */
   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
      if (active && stage_state->push_const_size != 0) {
         cgs.Buffer0Valid = true;
         cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
         cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
      }
   }
#endif

#if GFX_VERx10 == 70
   /**
    * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
    * Geometry > Geometry Shader > State:
    *
    *     "Note: Because of corruption in IVB:GT2, software needs to flush the
    *     whole fixed function pipeline when the GS enable changes value in
    *     the 3DSTATE_GS."
    *
    * The hardware architects have clarified that in this context "flush the
    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
    * Stall" bit set.
    */
   if (devinfo->gt == 2 && brw->gs.enabled != active)
      gfx7_emit_cs_stall_flush(brw);
#endif

   /* The two alternatives below open the same "gs" block; it is closed by
    * the single brace after the MaximumVPIndex assignment further down.
    */
#if GFX_VER >= 6
   brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
#else
   ctx->NewDriverState |= BRW_NEW_GFX4_UNIT_STATE;
   brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) {
#endif

#if GFX_VER >= 6
      /* Case 1: a geometry program is bound. */
      if (active) {
         INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);

#if GFX_VER >= 7
         gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
         gs.OutputTopology = gs_prog_data->output_topology;
         gs.ControlDataHeaderSize =
            gs_prog_data->control_data_header_size_hwords;

         gs.InstanceControl = gs_prog_data->invocations - 1;
         gs.DispatchMode = vue_prog_data->dispatch_mode;

         gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;

         gs.ControlDataFormat = gs_prog_data->control_data_format;
#endif

         /* Note: the meaning of the GFX7_GS_REORDER_TRAILING bit changes between
          * Ivy Bridge and Haswell.
          *
          * On Ivy Bridge, setting this bit causes the vertices of a triangle
          * strip to be delivered to the geometry shader in an order that does
          * not strictly follow the OpenGL spec, but preserves triangle
          * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
          * the geometry shader sees triangles:
          *
          * (1, 2, 3), (2, 4, 3), (3, 4, 5)
          *
          * (Clearing the bit is even worse, because it fails to preserve
          * orientation).
          *
          * Triangle strips with adjacency always ordered in a way that preserves
          * triangle orientation but does not strictly follow the OpenGL spec,
          * regardless of the setting of this bit.
          *
          * On Haswell, both triangle strips and triangle strips with adjacency
          * are always ordered in a way that preserves triangle orientation.
          * Setting this bit causes the ordering to strictly follow the OpenGL
          * spec.
          *
          * So in either case we want to set the bit.  Unfortunately on Ivy
          * Bridge this will get the order close to correct but not perfect.
          */
         gs.ReorderMode = TRAILING;
         /* GFX_VER == 8 programs half of max_gs_threads (minus one); every
          * other generation programs max_gs_threads minus one.
          */
         gs.MaximumNumberofThreads =
            GFX_VER == 8 ? (devinfo->max_gs_threads / 2 - 1)
                         : (devinfo->max_gs_threads - 1);

#if GFX_VER < 7
         gs.SOStatisticsEnable = true;
         if (gs_prog->info.has_transform_feedback_varyings)
            gs.SVBIPayloadEnable = _mesa_is_xfb_active_and_unpaused(ctx);

         /* GFX6_GS_SPF_MODE and GFX6_GS_VECTOR_MASK_ENABLE are enabled as it
          * was previously done for gfx6.
          *
          * TODO: test with both disabled to see if the HW is behaving
          * as expected, like in gfx7.
          */
         gs.SingleProgramFlow = true;
         gs.VectorMaskEnable = true;
#endif

#if GFX_VER >= 8
         gs.ExpectedVertexCount = gs_prog_data->vertices_in;

         /* -1 marks "no static vertex count"; anything else enables
          * StaticOutput with that count.
          */
         if (gs_prog_data->static_vertex_count != -1) {
            gs.StaticOutput = true;
            gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
         }
         gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;

         gs.UserClipDistanceCullTestEnableBitmask =
            vue_prog_data->cull_distance_mask;

         /* Output length is the VUE slot count in pairs, less the write
          * offset, and is programmed as at least 1.
          */
         const int urb_entry_write_offset = 1;
         const uint32_t urb_entry_output_length =
            DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
            urb_entry_write_offset;

         gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
         gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
#endif
      }
#endif

#if GFX_VER <= 6
      /* Case 2: no geometry program, but the ff_gs program is active. */
      if (!active && brw->ff_gs.prog_active) {
         /* In gfx6, transform feedback for the VS stage is done with an
          * ad-hoc GS program. This function provides the needed 3DSTATE_GS
          * for this.
          */
         gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
         gs.SingleProgramFlow = true;
         gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1;
         gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;

#if GFX_VER <= 5
         gs.GRFRegisterCount =
            DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1;
         /* BRW_NEW_URB_FENCE */
         gs.NumberofURBEntries = brw->urb.nr_gs_entries;
         gs.URBEntryAllocationSize = brw->urb.vsize - 1;
         gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 1 : 0;
         gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
#else
         gs.Enable = true;
         gs.VectorMaskEnable = true;
         gs.SVBIPayloadEnable = true;
         gs.SVBIPostIncrementEnable = true;
         gs.SVBIPostIncrementValue =
            brw->ff_gs.prog_data->svbi_postincrement_value;
         gs.SOStatisticsEnable = true;
         gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
#endif
      }
#endif
      /* Case 3: neither a geometry program nor the ff_gs program. */
      if (!active && !brw->ff_gs.prog_active) {
#if GFX_VER < 8
         gs.DispatchGRFStartRegisterForURBData = 1;
#if GFX_VER >= 7
         gs.IncludeVertexHandles = true;
#endif
#endif
      }

      /* Fields set regardless of which case applied. */
#if GFX_VER >= 6
      gs.StatisticsEnable = true;
#endif
#if GFX_VER == 5 || GFX_VER == 6
      gs.RenderingEnabled = true;
#endif
#if GFX_VER <= 5
      gs.MaximumVPIndex = brw->clip.viewport_count - 1;
#endif
   }

   /* NOTE(review): brw->gs.enabled is read by the GFX_VERx10 == 70
    * workaround above, but this function only writes it for GFX_VER == 6.
    * Confirm that it is kept up to date elsewhere for Ivy Bridge; otherwise
    * the "enable changed" comparison is made against a stale value.
    */
#if GFX_VER == 6
   brw->gs.enabled = active;
#endif
}
2748
2749static const struct brw_tracked_state genX(gs_state) = {
2750   .dirty = {
2751      .mesa  = (GFX_VER == 6 ? _NEW_PROGRAM_CONSTANTS : 0),
2752      .brw   = BRW_NEW_BATCH |
2753               BRW_NEW_BLORP |
2754               (GFX_VER <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
2755                               BRW_NEW_PROGRAM_CACHE |
2756                               BRW_NEW_URB_FENCE |
2757                               BRW_NEW_VIEWPORT_COUNT
2758                             : 0) |
2759               (GFX_VER >= 6 ? BRW_NEW_CONTEXT |
2760                               BRW_NEW_GEOMETRY_PROGRAM |
2761                               BRW_NEW_GS_PROG_DATA
2762                             : 0) |
2763               (GFX_VER < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0),
2764   },
2765   .emit = genX(upload_gs_state),
2766};
2767
2768/* ---------------------------------------------------------------------- */
2769
2770UNUSED static GLenum
2771fix_dual_blend_alpha_to_one(GLenum function)
2772{
2773   switch (function) {
2774   case GL_SRC1_ALPHA:
2775      return GL_ONE;
2776
2777   case GL_ONE_MINUS_SRC1_ALPHA:
2778      return GL_ZERO;
2779   }
2780
2781   return function;
2782}
2783
/* Short aliases for the GL -> hardware blend factor / blend equation
 * translation helpers, used by set_blend_entry_bits().
 */
#define blend_factor(x) brw_translate_blend_factor(x)
#define blend_eqn(x) brw_translate_blend_equation(x)
2786
2787/**
2788 * Modify blend function to force destination alpha to 1.0
2789 *
2790 * If \c function specifies a blend function that uses destination alpha,
2791 * replace it with a function that hard-wires destination alpha to 1.0.  This
2792 * is used when rendering to xRGB targets.
2793 */
2794static GLenum
2795brw_fix_xRGB_alpha(GLenum function)
2796{
2797   switch (function) {
2798   case GL_DST_ALPHA:
2799      return GL_ONE;
2800
2801   case GL_ONE_MINUS_DST_ALPHA:
2802   case GL_SRC_ALPHA_SATURATE:
2803      return GL_ZERO;
2804   }
2805
2806   return function;
2807}
2808
/* Structure that carries the blend fields filled in by
 * set_blend_entry_bits(): BLEND_STATE_ENTRY on GFX_VER >= 6 and
 * COLOR_CALC_STATE on older generations.  The typedef lets that helper take
 * a single pointer type on every generation.
 */
#if GFX_VER >= 6
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif
2814
/**
 * Fill in the logic-op or blend fields of \p entry for color draw buffer
 * \p i from the current GL color state.
 *
 * Exactly one of two paths is taken: if color logic op is enabled, the
 * logic-op fields may be set; otherwise, if blending is enabled for this
 * buffer (and no advanced blend mode is in use, and on GFX_VER >= 6 the
 * buffer is not an integer buffer), the blend factors and equations are
 * set.  If neither applies, \p entry is left untouched.
 *
 * \param brw           driver context
 * \param entry         structure to fill in
 * \param i             index of the color draw buffer
 * \param alpha_to_one  when true and buffer \p i uses dual-source blending
 *                      (GFX_VER >= 6), the SRC1 alpha factors are replaced
 *                      via fix_dual_blend_alpha_to_one()
 *
 * \return true if the final alpha factors or equation differ from the RGB
 *         ones; false otherwise (including whenever the blend path is not
 *         taken).
 */
UNUSED static bool
set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
                     bool alpha_to_one)
{
   struct gl_context *ctx = &brw->ctx;

   /* _NEW_BUFFERS */
   const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];

   bool independent_alpha_blend = false;

   /* Used for implementing the following bit of GL_EXT_texture_integer:
    * "Per-fragment operations that require floating-point color
    *  components, including multisample alpha operations, alpha test,
    *  blending, and dithering, have no effect when the corresponding
    *  colors are written to an integer color buffer."
    */
   const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);

   /* On GFX_VER >= 6 only bit i of BlendEnabled is tested; on older
    * generations the whole mask is tested.
    */
   const unsigned blend_enabled = GFX_VER >= 6 ?
      ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;

   /* _NEW_COLOR */
   if (ctx->Color.ColorLogicOpEnabled) {
      /* With no renderbuffer bound, the datatype is taken to be
       * GL_UNSIGNED_NORMALIZED.
       */
      GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
         : GL_UNSIGNED_NORMALIZED;
      WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
                rb_type != GL_UNSIGNED_NORMALIZED &&
                rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
                "renderbuffer\n",
                _mesa_enum_to_string(ctx->Color.LogicOp),
                _mesa_enum_to_string(rb_type));
      /* Before GFX_VER 8 the logic op is only programmed for
       * unsigned-normalized buffers.
       */
      if (GFX_VER >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
         entry->LogicOpEnable = true;
         entry->LogicOpFunction = ctx->Color._LogicOp;
      }
   } else if (blend_enabled &&
              ctx->Color._AdvancedBlendMode == BLEND_NONE
              && (GFX_VER <= 5 || !integer)) {
      GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
      GLenum eqA = ctx->Color.Blend[i].EquationA;
      GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
      GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
      GLenum srcA = ctx->Color.Blend[i].SrcA;
      GLenum dstA = ctx->Color.Blend[i].DstA;

      /* For MIN/MAX equations both factors are forced to GL_ONE. */
      if (eqRGB == GL_MIN || eqRGB == GL_MAX)
         srcRGB = dstRGB = GL_ONE;

      if (eqA == GL_MIN || eqA == GL_MAX)
         srcA = dstA = GL_ONE;

      /* Due to hardware limitations, the destination may have information
       * in an alpha channel even when the format specifies no alpha
       * channel. In order to avoid getting any incorrect blending due to
       * that alpha channel, coerce the blend factors to values that will
       * not read the alpha channel, but will instead use the correct
       * implicit value for alpha.
       */
      if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
                                               GL_TEXTURE_ALPHA_TYPE)) {
         srcRGB = brw_fix_xRGB_alpha(srcRGB);
         srcA = brw_fix_xRGB_alpha(srcA);
         dstRGB = brw_fix_xRGB_alpha(dstRGB);
         dstA = brw_fix_xRGB_alpha(dstA);
      }

      /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
       * "If Dual Source Blending is enabled, this bit must be disabled."
       *
       * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
       * and leave it enabled anyway.
       */
      if (GFX_VER >= 6 && ctx->Color._BlendUsesDualSrc & (1 << i) && alpha_to_one) {
         srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
         srcA = fix_dual_blend_alpha_to_one(srcA);
         dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
         dstA = fix_dual_blend_alpha_to_one(dstA);
      }

      /* BRW_NEW_FS_PROG_DATA */
      const struct brw_wm_prog_data *wm_prog_data =
         brw_wm_prog_data(brw->wm.base.prog_data);

      /* The Dual Source Blending documentation says:
       *
       * "If SRC1 is included in a src/dst blend factor and
       * a DualSource RT Write message is not used, results
       * are UNDEFINED. (This reflects the same restriction in DX APIs,
       * where undefined results are produced if “o1” is not written
       * by a PS – there are no default values defined).
       * If SRC1 is not included in a src/dst blend factor,
       * dual source blending must be disabled."
       *
       * There is no way to gracefully fix this undefined situation
       * so we just disable the blending to prevent possible issues.
       *
       * Note that the test below looks at bit 0 of _BlendUsesDualSrc,
       * not bit i.
       */
      entry->ColorBufferBlendEnable =
         !(ctx->Color._BlendUsesDualSrc & 0x1) || wm_prog_data->dual_src_blend;

      entry->DestinationBlendFactor = blend_factor(dstRGB);
      entry->SourceBlendFactor = blend_factor(srcRGB);
      entry->DestinationAlphaBlendFactor = blend_factor(dstA);
      entry->SourceAlphaBlendFactor = blend_factor(srcA);
      entry->ColorBlendFunction = blend_eqn(eqRGB);
      entry->AlphaBlendFunction = blend_eqn(eqA);

      /* Compared after all of the factor rewrites above. */
      if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
         independent_alpha_blend = true;
   }

   return independent_alpha_blend;
}
2928
2929#if GFX_VER >= 6
/**
 * Build the blend state in memory obtained from brw_state_batch() and emit
 * the packet that points the hardware at it.
 *
 * Layout written into blend_map:
 *  - GFX_VER >= 8: one BLEND_STATE header at blend_map[0], then one
 *    BLEND_STATE_ENTRY per draw buffer at blend_map[1 + i * 2];
 *  - GFX_VER 6-7: one BLEND_STATE_ENTRY per draw buffer at blend_map[i * 2],
 *    each of which also carries the alpha-to-coverage / alpha-test / dither
 *    bits (via the "#define blend entry" alias below).
 *
 * The pointer is emitted with 3DSTATE_CC_STATE_POINTERS for GFX_VER < 7 and
 * with 3DSTATE_BLEND_STATE_POINTERS otherwise.
 */
static void
genX(upload_blend_state)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   int size;

   /* We need at least one BLEND_STATE written, because we might do
    * thread dispatch even if _NumColorDrawBuffers is 0 (for example
    * for computed depth or alpha test), which will do an FB write
    * with render target 0, which will reference BLEND_STATE[0] for
    * alpha test enable.
    */
   int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
   if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
      nr_draw_buffers = 1;

   /* The *_length values are multiplied by 4 to get a byte size. */
   size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers;
#if GFX_VER >= 8
   size += GENX(BLEND_STATE_length) * 4;
#endif

   uint32_t *blend_map;
   blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);

   /* GFX_VER >= 8 opens a plain scope around a single "blend" header;
    * older generations open the per-draw-buffer loop here instead and alias
    * "blend" to the loop's entry.  Either way the brace opened here is
    * closed by the final brace before BLEND_STATE_pack below.
    */
#if GFX_VER >= 8
   struct GENX(BLEND_STATE) blend = { 0 };
   {
#else
   for (int i = 0; i < nr_draw_buffers; i++) {
      struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#define blend entry
#endif
      /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
       * "If drawbuffer zero is not NONE and the buffer it references has an
       * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
       * operations are skipped."
       */
      if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
         /* _NEW_MULTISAMPLE */
         if (_mesa_is_multisample_enabled(ctx)) {
            if (ctx->Multisample.SampleAlphaToCoverage) {
               blend.AlphaToCoverageEnable = true;
               blend.AlphaToCoverageDitherEnable = GFX_VER >= 7;
            }
            if (ctx->Multisample.SampleAlphaToOne)
               blend.AlphaToOneEnable = true;
         }

         /* _NEW_COLOR */
         if (ctx->Color.AlphaEnabled) {
            blend.AlphaTestEnable = true;
            blend.AlphaTestFunction =
               brw_translate_compare_func(ctx->Color.AlphaFunc);
         }

         if (ctx->Color.DitherFlag) {
            blend.ColorDitherEnable = true;
         }
      }

      /* GFX_VER >= 8 runs the per-draw-buffer loop here; older generations
       * are already inside it and only open a plain scope.
       */
#if GFX_VER >= 8
      for (int i = 0; i < nr_draw_buffers; i++) {
         struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#else
      {
#endif
         /* OR-accumulate so that one buffer needing independent alpha
          * blending is enough to set the bit.
          */
         blend.IndependentAlphaBlendEnable =
            set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
            blend.IndependentAlphaBlendEnable;

         /* See section 8.1.6 "Pre-Blend Color Clamping" of the
          * SandyBridge PRM Volume 2 Part 1 for HW requirements.
          *
          * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
          * clamping in the fragment shader.  For its clamping of
          * blending, the spec says:
          *
          *     "RESOLVED: For fixed-point color buffers, the inputs and
          *      the result of the blending equation are clamped.  For
          *      floating-point color buffers, no clamping occurs."
          *
          * So, generally, we want clamping to the render target's range.
          * And, good news, the hardware tables for both pre- and
          * post-blend color clamping are either ignored, or any are
          * allowed, or clamping is required but RT range clamping is a
          * valid option.
          */
         entry.PreBlendColorClampEnable = true;
         entry.PostBlendColorClampEnable = true;
         entry.ColorClampRange = COLORCLAMP_RTFORMAT;

         /* The hardware fields are write *disables*, hence the negation of
          * each color-mask bit.
          */
         entry.WriteDisableRed   = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 0);
         entry.WriteDisableGreen = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 1);
         entry.WriteDisableBlue  = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 2);
         entry.WriteDisableAlpha = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 3);

         /* Entries are two dwords apart; on GFX_VER >= 8 they start one
          * dword in, after the BLEND_STATE header.
          */
#if GFX_VER >= 8
         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
#else
         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
#endif
      }
   }

#if GFX_VER >= 8
   GENX(BLEND_STATE_pack)(NULL, blend_map, &blend);
#endif

#if GFX_VER < 7
   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
      ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset;
      ptr.BLEND_STATEChange = true;
   }
#else
   brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
      ptr.BlendStatePointer = brw->cc.blend_state_offset;
#if GFX_VER >= 8
      ptr.BlendStatePointerValid = true;
#endif
   }
#endif
}
3052
3053UNUSED static const struct brw_tracked_state genX(blend_state) = {
3054   .dirty = {
3055      .mesa = _NEW_BUFFERS |
3056              _NEW_COLOR |
3057              _NEW_MULTISAMPLE,
3058      .brw = BRW_NEW_BATCH |
3059             BRW_NEW_BLORP |
3060             BRW_NEW_FS_PROG_DATA |
3061             BRW_NEW_STATE_BASE_ADDRESS,
3062   },
3063   .emit = genX(upload_blend_state),
3064};
3065#endif
3066
3067/* ---------------------------------------------------------------------- */
3068
3069#if GFX_VER >= 7
/* Per-stage value that genX(upload_push_constant_packets) writes into
 * _3DCommandSubOpcode, so that one 3DSTATE_CONSTANT_VS template can be
 * emitted for every graphics stage.  Indexed by MESA_SHADER_* stage.
 */
UNUSED static const uint32_t push_constant_opcodes[] = {
   [MESA_SHADER_VERTEX]                      = 21,
   [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
   [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
   [MESA_SHADER_GEOMETRY]                    = 22,
   [MESA_SHADER_FRAGMENT]                    = 23,
   [MESA_SHADER_COMPUTE]                     = 0,
};
3078
/**
 * Emit a 3DSTATE_CONSTANT_* packet for every graphics stage (vertex through
 * fragment) whose push_constants_dirty flag is set, and clear the flag.
 *
 * The packet is emitted with the 3DSTATE_CONSTANT_VS layout and its
 * sub-opcode is patched from push_constant_opcodes[] for the stage.
 *
 *  - GFX_VERx10 >= 75: non-empty UBO ranges, then the regular push constant
 *    buffer, are packed into buffer slots from slot 3 downwards.
 *  - Otherwise: only slot 0 is programmed, with the MOCS value ORed into the
 *    buffer offset.
 *
 * If the stage has no prog_data the packet is emitted with only the
 * sub-opcode set.
 */
static void
genX(upload_push_constant_packets)(struct brw_context *brw)
{
   const struct intel_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;

   UNUSED uint32_t mocs = GFX_VER < 8 ? GFX7_MOCS_L3 : 0;

   /* Listed in stage order so the array can be indexed by the loop's
    * stage counter.
    */
   struct brw_stage_state *stage_states[] = {
      &brw->vs.base,
      &brw->tcs.base,
      &brw->tes.base,
      &brw->gs.base,
      &brw->wm.base,
   };

   /* GFX_VERx10 == 70, excluding Baytrail: emit the VS workaround flush
    * before the vertex stage's constants are re-emitted.
    */
   if (GFX_VERx10 == 70 && !devinfo->is_baytrail &&
       stage_states[MESA_SHADER_VERTEX]->push_constants_dirty)
      gfx7_emit_vs_workaround_flush(brw);

   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
      struct brw_stage_state *stage_state = stage_states[stage];
      UNUSED struct gl_program *prog = ctx->_Shader->CurrentProgram[stage];

      if (!stage_state->push_constants_dirty)
         continue;

      brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
         pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
         if (stage_state->prog_data) {
#if GFX_VERx10 >= 75
            /* The Skylake PRM contains the following restriction:
             *
             *    "The driver must ensure The following case does not occur
             *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
             *     buffer 3 read length equal to zero committed followed by a
             *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
             *     zero committed."
             *
             * To avoid this, we program the buffers in the highest slots.
             * This way, slot 0 is only used if slot 3 is also used.
             */
            /* n is the next free slot, counting down from 3. */
            int n = 3;

            for (int i = 3; i >= 0; i--) {
               const struct brw_ubo_range *range =
                  &stage_state->prog_data->ubo_ranges[i];

               /* Empty ranges do not consume a slot. */
               if (range->length == 0)
                  continue;

               const struct gl_uniform_block *block =
                  prog->sh.UniformBlocks[range->block];
               const struct gl_buffer_binding *binding =
                  &ctx->UniformBufferBindings[block->Binding];

               /* An unbound UBO is reported through the GL debug output
                * and skipped without consuming a slot.
                */
               if (!binding->BufferObject) {
                  static unsigned msg_id = 0;
                  _mesa_gl_debugf(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
                                  MESA_DEBUG_TYPE_UNDEFINED,
                                  MESA_DEBUG_SEVERITY_HIGH,
                                  "UBO %d unbound, %s shader uniform data "
                                  "will be undefined.",
                                  range->block,
                                  _mesa_shader_stage_to_string(stage));
                  continue;
               }

               assert(binding->Offset % 32 == 0);

               struct brw_bo *bo = brw_bufferobj_buffer(brw,
                  brw_buffer_object(binding->BufferObject),
                  binding->Offset, range->length * 32, false);

               /* range->start and range->length are scaled by 32 when
                * converted to byte offsets / sizes here.
                */
               pkt.ConstantBody.ReadLength[n] = range->length;
               pkt.ConstantBody.Buffer[n] =
                  ro_bo(bo, range->start * 32 + binding->Offset);
               n--;
            }

            /* The regular push constants take the next slot below the UBO
             * ranges; the assert checks that one is still available.
             */
            if (stage_state->push_const_size > 0) {
               assert(n >= 0);
               pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size;
               pkt.ConstantBody.Buffer[n] =
                  ro_bo(stage_state->push_const_bo,
                        stage_state->push_const_offset);
            }
#else
            pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
            pkt.ConstantBody.Buffer[0].offset =
               stage_state->push_const_offset | mocs;
#endif
         }
      }

      stage_state->push_constants_dirty = false;
      /* On GFX_VER >= 9, re-emitting constants also flags
       * BRW_NEW_SURFACES.
       */
      brw->ctx.NewDriverState |= GFX_VER >= 9 ? BRW_NEW_SURFACES : 0;
   }
}
3178
3179const struct brw_tracked_state genX(push_constant_packets) = {
3180   .dirty = {
3181      .mesa  = 0,
3182      .brw   = BRW_NEW_DRAW_CALL,
3183   },
3184   .emit = genX(upload_push_constant_packets),
3185};
3186#endif
3187
3188#if GFX_VER >= 6
3189static void
3190genX(upload_vs_push_constants)(struct brw_context *brw)
3191{
3192   struct brw_stage_state *stage_state = &brw->vs.base;
3193
3194   /* BRW_NEW_VERTEX_PROGRAM */
3195   const struct gl_program *vp = brw->programs[MESA_SHADER_VERTEX];
3196   /* BRW_NEW_VS_PROG_DATA */
3197   const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
3198
3199   gfx6_upload_push_constants(brw, vp, prog_data, stage_state);
3200}
3201
3202static const struct brw_tracked_state genX(vs_push_constants) = {
3203   .dirty = {
3204      .mesa  = _NEW_PROGRAM_CONSTANTS |
3205               _NEW_TRANSFORM,
3206      .brw   = BRW_NEW_BATCH |
3207               BRW_NEW_BLORP |
3208               BRW_NEW_VERTEX_PROGRAM |
3209               BRW_NEW_VS_PROG_DATA,
3210   },
3211   .emit = genX(upload_vs_push_constants),
3212};
3213
3214static void
3215genX(upload_gs_push_constants)(struct brw_context *brw)
3216{
3217   struct brw_stage_state *stage_state = &brw->gs.base;
3218
3219   /* BRW_NEW_GEOMETRY_PROGRAM */
3220   const struct gl_program *gp = brw->programs[MESA_SHADER_GEOMETRY];
3221
3222   /* BRW_NEW_GS_PROG_DATA */
3223   struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
3224
3225   gfx6_upload_push_constants(brw, gp, prog_data, stage_state);
3226}
3227
3228static const struct brw_tracked_state genX(gs_push_constants) = {
3229   .dirty = {
3230      .mesa  = _NEW_PROGRAM_CONSTANTS |
3231               _NEW_TRANSFORM,
3232      .brw   = BRW_NEW_BATCH |
3233               BRW_NEW_BLORP |
3234               BRW_NEW_GEOMETRY_PROGRAM |
3235               BRW_NEW_GS_PROG_DATA,
3236   },
3237   .emit = genX(upload_gs_push_constants),
3238};
3239
3240static void
3241genX(upload_wm_push_constants)(struct brw_context *brw)
3242{
3243   struct brw_stage_state *stage_state = &brw->wm.base;
3244   /* BRW_NEW_FRAGMENT_PROGRAM */
3245   const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3246   /* BRW_NEW_FS_PROG_DATA */
3247   const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
3248
3249   gfx6_upload_push_constants(brw, fp, prog_data, stage_state);
3250}
3251
3252static const struct brw_tracked_state genX(wm_push_constants) = {
3253   .dirty = {
3254      .mesa  = _NEW_PROGRAM_CONSTANTS,
3255      .brw   = BRW_NEW_BATCH |
3256               BRW_NEW_BLORP |
3257               BRW_NEW_FRAGMENT_PROGRAM |
3258               BRW_NEW_FS_PROG_DATA,
3259   },
3260   .emit = genX(upload_wm_push_constants),
3261};
3262#endif
3263
3264/* ---------------------------------------------------------------------- */
3265
3266#if GFX_VER >= 6
3267static unsigned
3268genX(determine_sample_mask)(struct brw_context *brw)
3269{
3270   struct gl_context *ctx = &brw->ctx;
3271   float coverage = 1.0f;
3272   float coverage_invert = false;
3273   unsigned sample_mask = ~0u;
3274
3275   /* BRW_NEW_NUM_SAMPLES */
3276   unsigned num_samples = brw->num_samples;
3277
3278   if (_mesa_is_multisample_enabled(ctx)) {
3279      if (ctx->Multisample.SampleCoverage) {
3280         coverage = ctx->Multisample.SampleCoverageValue;
3281         coverage_invert = ctx->Multisample.SampleCoverageInvert;
3282      }
3283      if (ctx->Multisample.SampleMask) {
3284         sample_mask = ctx->Multisample.SampleMaskValue;
3285      }
3286   }
3287
3288   if (num_samples > 1) {
3289      int coverage_int = (int) (num_samples * coverage + 0.5f);
3290      uint32_t coverage_bits = (1 << coverage_int) - 1;
3291      if (coverage_invert)
3292         coverage_bits ^= (1 << num_samples) - 1;
3293      return coverage_bits & sample_mask;
3294   } else {
3295      return 1;
3296   }
3297}
3298
3299static void
3300genX(emit_3dstate_multisample2)(struct brw_context *brw,
3301                                unsigned num_samples)
3302{
3303   unsigned log2_samples = ffs(num_samples) - 1;
3304
3305   brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) {
3306      multi.PixelLocation = CENTER;
3307      multi.NumberofMultisamples = log2_samples;
3308#if GFX_VER == 6
3309      INTEL_SAMPLE_POS_4X(multi.Sample);
3310#elif GFX_VER == 7
3311      switch (num_samples) {
3312      case 1:
3313         INTEL_SAMPLE_POS_1X(multi.Sample);
3314         break;
3315      case 2:
3316         INTEL_SAMPLE_POS_2X(multi.Sample);
3317         break;
3318      case 4:
3319         INTEL_SAMPLE_POS_4X(multi.Sample);
3320         break;
3321      case 8:
3322         INTEL_SAMPLE_POS_8X(multi.Sample);
3323         break;
3324      default:
3325         break;
3326      }
3327#endif
3328   }
3329}
3330
3331static void
3332genX(upload_multisample_state)(struct brw_context *brw)
3333{
3334   assert(brw->num_samples > 0 && brw->num_samples <= 16);
3335
3336   genX(emit_3dstate_multisample2)(brw, brw->num_samples);
3337
3338   brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) {
3339      sm.SampleMask = genX(determine_sample_mask)(brw);
3340   }
3341}
3342
3343static const struct brw_tracked_state genX(multisample_state) = {
3344   .dirty = {
3345      .mesa = _NEW_MULTISAMPLE |
3346              (GFX_VER == 10 ? _NEW_BUFFERS : 0),
3347      .brw = BRW_NEW_BLORP |
3348             BRW_NEW_CONTEXT |
3349             BRW_NEW_NUM_SAMPLES,
3350   },
3351   .emit = genX(upload_multisample_state)
3352};
3353#endif
3354
3355/* ---------------------------------------------------------------------- */
3356
3357static void
3358genX(upload_color_calc_state)(struct brw_context *brw)
3359{
3360   struct gl_context *ctx = &brw->ctx;
3361
3362   brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
3363#if GFX_VER <= 5
3364      cc.IndependentAlphaBlendEnable =
3365         set_blend_entry_bits(brw, &cc, 0, false);
3366      set_depth_stencil_bits(brw, &cc);
3367
3368      if (ctx->Color.AlphaEnabled &&
3369          ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
3370         cc.AlphaTestEnable = true;
3371         cc.AlphaTestFunction =
3372            brw_translate_compare_func(ctx->Color.AlphaFunc);
3373      }
3374
3375      cc.ColorDitherEnable = ctx->Color.DitherFlag;
3376
3377      cc.StatisticsEnable = brw->stats_wm;
3378
3379      cc.CCViewportStatePointer =
3380         ro_bo(brw->batch.state.bo, brw->cc.vp_offset);
3381#else
3382      /* _NEW_COLOR */
3383      cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
3384      cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
3385      cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
3386      cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
3387
3388#if GFX_VER < 9
3389      /* _NEW_STENCIL */
3390      cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
3391      cc.BackfaceStencilReferenceValue =
3392         _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
3393#endif
3394
3395#endif
3396
3397      /* _NEW_COLOR */
3398      UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
3399                               ctx->Color.AlphaRef);
3400   }
3401
3402#if GFX_VER >= 6
3403   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
3404      ptr.ColorCalcStatePointer = brw->cc.state_offset;
3405#if GFX_VER != 7
3406      ptr.ColorCalcStatePointerValid = true;
3407#endif
3408   }
3409#else
3410   brw->ctx.NewDriverState |= BRW_NEW_GFX4_UNIT_STATE;
3411#endif
3412}
3413
3414UNUSED static const struct brw_tracked_state genX(color_calc_state) = {
3415   .dirty = {
3416      .mesa = _NEW_COLOR |
3417              _NEW_STENCIL |
3418              (GFX_VER <= 5 ? _NEW_BUFFERS |
3419                              _NEW_DEPTH
3420                            : 0),
3421      .brw = BRW_NEW_BATCH |
3422             BRW_NEW_BLORP |
3423             (GFX_VER <= 5 ? BRW_NEW_CC_VP |
3424                             BRW_NEW_STATS_WM
3425                           : BRW_NEW_CC_STATE |
3426                             BRW_NEW_STATE_BASE_ADDRESS),
3427   },
3428   .emit = genX(upload_color_calc_state),
3429};
3430
3431
3432/* ---------------------------------------------------------------------- */
3433
3434#if GFX_VERx10 == 75
3435static void
3436genX(upload_color_calc_and_blend_state)(struct brw_context *brw)
3437{
3438   genX(upload_blend_state)(brw);
3439   genX(upload_color_calc_state)(brw);
3440}
3441
3442/* On Haswell when BLEND_STATE is emitted CC_STATE should also be re-emitted,
3443 * this workarounds the flickering shadows in several games.
3444 */
3445static const struct brw_tracked_state genX(cc_and_blend_state) = {
3446   .dirty = {
3447      .mesa = _NEW_BUFFERS |
3448              _NEW_COLOR |
3449              _NEW_STENCIL |
3450              _NEW_MULTISAMPLE,
3451      .brw = BRW_NEW_BATCH |
3452             BRW_NEW_BLORP |
3453             BRW_NEW_CC_STATE |
3454             BRW_NEW_FS_PROG_DATA |
3455             BRW_NEW_STATE_BASE_ADDRESS,
3456   },
3457   .emit = genX(upload_color_calc_and_blend_state),
3458};
3459#endif
3460
3461/* ---------------------------------------------------------------------- */
3462
3463#if GFX_VER >= 7
3464static void
3465genX(upload_sbe)(struct brw_context *brw)
3466{
3467   struct gl_context *ctx = &brw->ctx;
3468   /* BRW_NEW_FRAGMENT_PROGRAM */
3469   UNUSED const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3470   /* BRW_NEW_FS_PROG_DATA */
3471   const struct brw_wm_prog_data *wm_prog_data =
3472      brw_wm_prog_data(brw->wm.base.prog_data);
3473#if GFX_VER >= 8
3474   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
3475#else
3476#define attr_overrides sbe.Attribute
3477#endif
3478   uint32_t urb_entry_read_length;
3479   uint32_t urb_entry_read_offset;
3480   uint32_t point_sprite_enables;
3481
3482   brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
3483      sbe.AttributeSwizzleEnable = true;
3484      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
3485
3486      /* _NEW_BUFFERS */
3487      bool flip_y = ctx->DrawBuffer->FlipY;
3488
3489      /* _NEW_POINT
3490       *
3491       * Window coordinates in an FBO are inverted, which means point
3492       * sprite origin must be inverted.
3493       */
3494      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y)
3495         sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
3496      else
3497         sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
3498
3499      /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
3500       * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
3501       * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
3502       * BRW_NEW_VUE_MAP_GEOM_OUT
3503       */
3504      genX(calculate_attr_overrides)(brw,
3505                                     attr_overrides,
3506                                     &point_sprite_enables,
3507                                     &urb_entry_read_length,
3508                                     &urb_entry_read_offset);
3509
3510      /* Typically, the URB entry read length and offset should be programmed
3511       * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active
3512       * stage which produces geometry.  However, we don't know the proper
3513       * value until we call calculate_attr_overrides().
3514       *
3515       * To fit with our existing code, we override the inherited values and
3516       * specify it here directly, as we did on previous generations.
3517       */
3518      sbe.VertexURBEntryReadLength = urb_entry_read_length;
3519      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
3520      sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
3521      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
3522
3523#if GFX_VER >= 8
3524      sbe.ForceVertexURBEntryReadLength = true;
3525      sbe.ForceVertexURBEntryReadOffset = true;
3526#endif
3527
3528#if GFX_VER >= 9
3529      /* prepare the active component dwords */
3530      for (int i = 0; i < 32; i++)
3531         sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
3532#endif
3533   }
3534
3535#if GFX_VER >= 8
3536   brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
3537      for (int i = 0; i < 16; i++)
3538         sbes.Attribute[i] = attr_overrides[i];
3539   }
3540#endif
3541
3542#undef attr_overrides
3543}
3544
3545static const struct brw_tracked_state genX(sbe_state) = {
3546   .dirty = {
3547      .mesa  = _NEW_BUFFERS |
3548               _NEW_LIGHT |
3549               _NEW_POINT |
3550               _NEW_POLYGON |
3551               _NEW_PROGRAM,
3552      .brw   = BRW_NEW_BLORP |
3553               BRW_NEW_CONTEXT |
3554               BRW_NEW_FRAGMENT_PROGRAM |
3555               BRW_NEW_FS_PROG_DATA |
3556               BRW_NEW_GS_PROG_DATA |
3557               BRW_NEW_TES_PROG_DATA |
3558               BRW_NEW_VUE_MAP_GEOM_OUT |
3559               (GFX_VER == 7 ? BRW_NEW_PRIMITIVE
3560                             : 0),
3561   },
3562   .emit = genX(upload_sbe),
3563};
3564#endif
3565
3566/* ---------------------------------------------------------------------- */
3567
3568#if GFX_VER >= 7
3569/**
3570 * Outputs the 3DSTATE_SO_DECL_LIST command.
3571 *
3572 * The data output is a series of 64-bit entries containing a SO_DECL per
3573 * stream.  We only have one stream of rendering coming out of the GS unit, so
3574 * we only emit stream 0 (low 16 bits) SO_DECLs.
3575 */
3576static void
3577genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
3578                                  const struct brw_vue_map *vue_map)
3579{
3580   struct gl_context *ctx = &brw->ctx;
3581   /* BRW_NEW_TRANSFORM_FEEDBACK */
3582   struct gl_transform_feedback_object *xfb_obj =
3583      ctx->TransformFeedback.CurrentObject;
3584   const struct gl_transform_feedback_info *linked_xfb_info =
3585      xfb_obj->program->sh.LinkedTransformFeedback;
3586   struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
3587   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3588   int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3589   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
3590   int max_decls = 0;
3591   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);
3592
3593   memset(so_decl, 0, sizeof(so_decl));
3594
3595   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
3596    * command feels strange -- each dword pair contains a SO_DECL per stream.
3597    */
3598   for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
3599      const struct gl_transform_feedback_output *output =
3600         &linked_xfb_info->Outputs[i];
3601      const int buffer = output->OutputBuffer;
3602      const int varying = output->OutputRegister;
3603      const unsigned stream_id = output->StreamId;
3604      assert(stream_id < MAX_VERTEX_STREAMS);
3605
3606      buffer_mask[stream_id] |= 1 << buffer;
3607
3608      assert(vue_map->varying_to_slot[varying] >= 0);
3609
3610      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
3611       * array.  Instead, it simply increments DstOffset for the following
3612       * input by the number of components that should be skipped.
3613       *
3614       * Our hardware is unusual in that it requires us to program SO_DECLs
3615       * for fake "hole" components, rather than simply taking the offset
3616       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
3617       * program as many size = 4 holes as we can, then a final hole to
3618       * accommodate the final 1, 2, or 3 remaining.
3619       */
3620      int skip_components = output->DstOffset - next_offset[buffer];
3621
3622      while (skip_components > 0) {
3623         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3624            .HoleFlag = 1,
3625            .OutputBufferSlot = output->OutputBuffer,
3626            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
3627         };
3628         skip_components -= 4;
3629      }
3630
3631      next_offset[buffer] = output->DstOffset + output->NumComponents;
3632
3633      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
3634         .OutputBufferSlot = output->OutputBuffer,
3635         .RegisterIndex = vue_map->varying_to_slot[varying],
3636         .ComponentMask =
3637            ((1 << output->NumComponents) - 1) << output->ComponentOffset,
3638      };
3639
3640      if (decls[stream_id] > max_decls)
3641         max_decls = decls[stream_id];
3642   }
3643
3644   uint32_t *dw;
3645   dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
3646                        .StreamtoBufferSelects0 = buffer_mask[0],
3647                        .StreamtoBufferSelects1 = buffer_mask[1],
3648                        .StreamtoBufferSelects2 = buffer_mask[2],
3649                        .StreamtoBufferSelects3 = buffer_mask[3],
3650                        .NumEntries0 = decls[0],
3651                        .NumEntries1 = decls[1],
3652                        .NumEntries2 = decls[2],
3653                        .NumEntries3 = decls[3]);
3654
3655   for (int i = 0; i < max_decls; i++) {
3656      GENX(SO_DECL_ENTRY_pack)(
3657         brw, dw + 2 + i * 2,
3658         &(struct GENX(SO_DECL_ENTRY)) {
3659            .Stream0Decl = so_decl[0][i],
3660            .Stream1Decl = so_decl[1][i],
3661            .Stream2Decl = so_decl[2][i],
3662            .Stream3Decl = so_decl[3][i],
3663         });
3664   }
3665}
3666
3667static void
3668genX(upload_3dstate_so_buffers)(struct brw_context *brw)
3669{
3670   struct gl_context *ctx = &brw->ctx;
3671   /* BRW_NEW_TRANSFORM_FEEDBACK */
3672   struct gl_transform_feedback_object *xfb_obj =
3673      ctx->TransformFeedback.CurrentObject;
3674#if GFX_VER < 8
3675   const struct gl_transform_feedback_info *linked_xfb_info =
3676      xfb_obj->program->sh.LinkedTransformFeedback;
3677#else
3678   struct brw_transform_feedback_object *brw_obj =
3679      (struct brw_transform_feedback_object *) xfb_obj;
3680   uint32_t mocs_wb = GFX_VER >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
3681#endif
3682
3683   /* Set up the up to 4 output buffers.  These are the ranges defined in the
3684    * gl_transform_feedback_object.
3685    */
3686   for (int i = 0; i < 4; i++) {
3687      struct brw_buffer_object *bufferobj =
3688         brw_buffer_object(xfb_obj->Buffers[i]);
3689      uint32_t start = xfb_obj->Offset[i];
3690      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
3691      uint32_t const size = end - start;
3692
3693      if (!bufferobj || !size) {
3694         brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3695            sob.SOBufferIndex = i;
3696         }
3697         continue;
3698      }
3699
3700      assert(start % 4 == 0);
3701      struct brw_bo *bo =
3702         brw_bufferobj_buffer(brw, bufferobj, start, size, true);
3703      assert(end <= bo->size);
3704
3705      brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
3706         sob.SOBufferIndex = i;
3707
3708         sob.SurfaceBaseAddress = rw_bo(bo, start);
3709#if GFX_VER < 8
3710         sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
3711         sob.SurfaceEndAddress = rw_bo(bo, end);
3712#else
3713         sob.SOBufferEnable = true;
3714         sob.StreamOffsetWriteEnable = true;
3715         sob.StreamOutputBufferOffsetAddressEnable = true;
3716         sob.MOCS = mocs_wb;
3717
3718         sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
3719         sob.StreamOutputBufferOffsetAddress =
3720            rw_bo(brw_obj->offset_bo, i * sizeof(uint32_t));
3721
3722         if (brw_obj->zero_offsets) {
3723            /* Zero out the offset and write that to offset_bo */
3724            sob.StreamOffset = 0;
3725         } else {
3726            /* Use offset_bo as the "Stream Offset." */
3727            sob.StreamOffset = 0xFFFFFFFF;
3728         }
3729#endif
3730      }
3731   }
3732
3733#if GFX_VER >= 8
3734   brw_obj->zero_offsets = false;
3735#endif
3736}
3737
3738static bool
3739query_active(struct gl_query_object *q)
3740{
3741   return q && q->Active;
3742}
3743
3744static void
3745genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
3746                               const struct brw_vue_map *vue_map)
3747{
3748   struct gl_context *ctx = &brw->ctx;
3749   /* BRW_NEW_TRANSFORM_FEEDBACK */
3750   struct gl_transform_feedback_object *xfb_obj =
3751      ctx->TransformFeedback.CurrentObject;
3752
3753   brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
3754      if (active) {
3755         int urb_entry_read_offset = 0;
3756         int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
3757            urb_entry_read_offset;
3758
3759         sos.SOFunctionEnable = true;
3760         sos.SOStatisticsEnable = true;
3761
3762         /* BRW_NEW_RASTERIZER_DISCARD */
3763         if (ctx->RasterDiscard) {
3764            if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
3765               sos.RenderingDisable = true;
3766            } else {
3767               perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
3768                          "query active relies on the clipper.\n");
3769            }
3770         }
3771
3772         /* _NEW_LIGHT */
3773         if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
3774            sos.ReorderMode = TRAILING;
3775
3776#if GFX_VER < 8
3777         sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
3778         sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
3779         sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
3780         sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
3781#else
3782         const struct gl_transform_feedback_info *linked_xfb_info =
3783            xfb_obj->program->sh.LinkedTransformFeedback;
3784         /* Set buffer pitches; 0 means unbound. */
3785         if (xfb_obj->Buffers[0])
3786            sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
3787         if (xfb_obj->Buffers[1])
3788            sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
3789         if (xfb_obj->Buffers[2])
3790            sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
3791         if (xfb_obj->Buffers[3])
3792            sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
3793#endif
3794
3795         /* We always read the whole vertex.  This could be reduced at some
3796          * point by reading less and offsetting the register index in the
3797          * SO_DECLs.
3798          */
3799         sos.Stream0VertexReadOffset = urb_entry_read_offset;
3800         sos.Stream0VertexReadLength = urb_entry_read_length - 1;
3801         sos.Stream1VertexReadOffset = urb_entry_read_offset;
3802         sos.Stream1VertexReadLength = urb_entry_read_length - 1;
3803         sos.Stream2VertexReadOffset = urb_entry_read_offset;
3804         sos.Stream2VertexReadLength = urb_entry_read_length - 1;
3805         sos.Stream3VertexReadOffset = urb_entry_read_offset;
3806         sos.Stream3VertexReadLength = urb_entry_read_length - 1;
3807      }
3808   }
3809}
3810
3811static void
3812genX(upload_sol)(struct brw_context *brw)
3813{
3814   struct gl_context *ctx = &brw->ctx;
3815   /* BRW_NEW_TRANSFORM_FEEDBACK */
3816   bool active = _mesa_is_xfb_active_and_unpaused(ctx);
3817
3818   if (active) {
3819      genX(upload_3dstate_so_buffers)(brw);
3820
3821      /* BRW_NEW_VUE_MAP_GEOM_OUT */
3822      genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
3823   }
3824
3825   /* Finally, set up the SOL stage.  This command must always follow updates to
3826    * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
3827    * MMIO register updates (current performed by the kernel at each batch
3828    * emit).
3829    */
3830   genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
3831}
3832
3833static const struct brw_tracked_state genX(sol_state) = {
3834   .dirty = {
3835      .mesa  = _NEW_LIGHT,
3836      .brw   = BRW_NEW_BATCH |
3837               BRW_NEW_BLORP |
3838               BRW_NEW_RASTERIZER_DISCARD |
3839               BRW_NEW_VUE_MAP_GEOM_OUT |
3840               BRW_NEW_TRANSFORM_FEEDBACK,
3841   },
3842   .emit = genX(upload_sol),
3843};
3844#endif
3845
3846/* ---------------------------------------------------------------------- */
3847
3848#if GFX_VER >= 7
3849static void
3850genX(upload_ps)(struct brw_context *brw)
3851{
3852   UNUSED const struct gl_context *ctx = &brw->ctx;
3853   UNUSED const struct intel_device_info *devinfo = &brw->screen->devinfo;
3854
3855   /* BRW_NEW_FS_PROG_DATA */
3856   const struct brw_wm_prog_data *prog_data =
3857      brw_wm_prog_data(brw->wm.base.prog_data);
3858   const struct brw_stage_state *stage_state = &brw->wm.base;
3859
3860#if GFX_VER < 8
3861#endif
3862
3863   brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
3864      /* Initialize the execution mask with VMask.  Otherwise, derivatives are
3865       * incorrect for subspans where some of the pixels are unlit.  We believe
3866       * the bit just didn't take effect in previous generations.
3867       */
3868      ps.VectorMaskEnable = GFX_VER >= 8;
3869
3870      /* Wa_1606682166:
3871       * "Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
3872       * Disable the Sampler state prefetch functionality in the SARB by
3873       * programming 0xB000[30] to '1'."
3874       */
3875      ps.SamplerCount = GFX_VER == 11 ?
3876         0 : DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
3877
3878      /* BRW_NEW_FS_PROG_DATA */
3879      ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4;
3880
3881      if (prog_data->base.use_alt_mode)
3882         ps.FloatingPointMode = Alternate;
3883
3884      /* Haswell requires the sample mask to be set in this packet as well as
3885       * in 3DSTATE_SAMPLE_MASK; the values should match.
3886       */
3887
3888      /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
3889#if GFX_VERx10 == 75
3890      ps.SampleMask = genX(determine_sample_mask(brw));
3891#endif
3892
3893      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
3894       * for pre Gfx11 and 128 for gfx11+; On gfx11+ If a programmed value is
3895       * k, it implies 2(k+1) threads. It implicitly scales for different GT
3896       * levels (which have some # of PSDs).
3897       *
3898       * In Gfx8 the format is U8-2 whereas in Gfx9+ it is U9-1.
3899       */
3900#if GFX_VER >= 9
3901      ps.MaximumNumberofThreadsPerPSD = 64 - 1;
3902#elif GFX_VER >= 8
3903      ps.MaximumNumberofThreadsPerPSD = 64 - 2;
3904#else
3905      ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
3906#endif
3907
3908      if (prog_data->base.nr_params > 0 ||
3909          prog_data->base.ubo_ranges[0].length > 0)
3910         ps.PushConstantEnable = true;
3911
3912#if GFX_VER < 8
3913      /* From the IVB PRM, volume 2 part 1, page 287:
3914       * "This bit is inserted in the PS payload header and made available to
3915       * the DataPort (either via the message header or via header bypass) to
3916       * indicate that oMask data (one or two phases) is included in Render
3917       * Target Write messages. If present, the oMask data is used to mask off
3918       * samples."
3919       */
3920      ps.oMaskPresenttoRenderTarget = prog_data->uses_omask;
3921
3922      /* The hardware wedges if you have this bit set but don't turn on any
3923       * dual source blend factors.
3924       *
3925       * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
3926       */
3927      ps.DualSourceBlendEnable = prog_data->dual_src_blend &&
3928                                 (ctx->Color.BlendEnabled & 1) &&
3929                                 ctx->Color._BlendUsesDualSrc & 0x1;
3930
3931      /* BRW_NEW_FS_PROG_DATA */
3932      ps.AttributeEnable = (prog_data->num_varying_inputs != 0);
3933#endif
3934
3935      /* From the documentation for this packet:
3936       * "If the PS kernel does not need the Position XY Offsets to
3937       *  compute a Position Value, then this field should be programmed
3938       *  to POSOFFSET_NONE."
3939       *
3940       * "SW Recommendation: If the PS kernel needs the Position Offsets
3941       *  to compute a Position XY value, this field should match Position
3942       *  ZW Interpolation Mode to ensure a consistent position.xyzw
3943       *  computation."
3944       *
3945       * We only require XY sample offsets. So, this recommendation doesn't
3946       * look useful at the moment. We might need this in future.
3947       */
3948      if (prog_data->uses_pos_offset)
3949         ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
3950      else
3951         ps.PositionXYOffsetSelect = POSOFFSET_NONE;
3952
3953      ps._8PixelDispatchEnable = prog_data->dispatch_8;
3954      ps._16PixelDispatchEnable = prog_data->dispatch_16;
3955      ps._32PixelDispatchEnable = prog_data->dispatch_32;
3956
3957      /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
3958       *
3959       *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
3960       *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
3961       *
3962       * Since 16x MSAA is first introduced on SKL, we don't need to apply
3963       * the workaround on any older hardware.
3964       *
3965       * BRW_NEW_NUM_SAMPLES
3966       */
3967      if (GFX_VER >= 9 && !prog_data->persample_dispatch &&
3968          brw->num_samples == 16) {
3969         assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
3970         ps._32PixelDispatchEnable = false;
3971      }
3972
3973      ps.DispatchGRFStartRegisterForConstantSetupData0 =
3974         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
3975      ps.DispatchGRFStartRegisterForConstantSetupData1 =
3976         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
3977      ps.DispatchGRFStartRegisterForConstantSetupData2 =
3978         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
3979
3980      ps.KernelStartPointer0 = stage_state->prog_offset +
3981                               brw_wm_prog_data_prog_offset(prog_data, ps, 0);
3982      ps.KernelStartPointer1 = stage_state->prog_offset +
3983                               brw_wm_prog_data_prog_offset(prog_data, ps, 1);
3984      ps.KernelStartPointer2 = stage_state->prog_offset +
3985                               brw_wm_prog_data_prog_offset(prog_data, ps, 2);
3986
3987      if (prog_data->base.total_scratch) {
3988         ps.ScratchSpaceBasePointer =
3989            rw_32_bo(stage_state->scratch_bo,
3990                     ffs(stage_state->per_thread_scratch) - 11);
3991      }
3992   }
3993}
3994
3995static const struct brw_tracked_state genX(ps_state) = {
3996   .dirty = {
3997      .mesa  = _NEW_MULTISAMPLE |
3998               (GFX_VER < 8 ? _NEW_BUFFERS |
3999                              _NEW_COLOR
4000                            : 0),
4001      .brw   = BRW_NEW_BATCH |
4002               BRW_NEW_BLORP |
4003               BRW_NEW_FS_PROG_DATA |
4004               (GFX_VER >= 9 ? BRW_NEW_NUM_SAMPLES : 0),
4005   },
4006   .emit = genX(upload_ps),
4007};
4008#endif
4009
4010/* ---------------------------------------------------------------------- */
4011
4012#if GFX_VER >= 7
4013static void
4014genX(upload_hs_state)(struct brw_context *brw)
4015{
4016   const struct intel_device_info *devinfo = &brw->screen->devinfo;
4017   struct brw_stage_state *stage_state = &brw->tcs.base;
4018   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
4019   const struct brw_vue_prog_data *vue_prog_data =
4020      brw_vue_prog_data(stage_prog_data);
4021
4022   /* BRW_NEW_TES_PROG_DATA */
4023   struct brw_tcs_prog_data *tcs_prog_data =
4024      brw_tcs_prog_data(stage_prog_data);
4025
4026   if (!tcs_prog_data) {
4027      brw_batch_emit(brw, GENX(3DSTATE_HS), hs);
4028   } else {
4029      brw_batch_emit(brw, GENX(3DSTATE_HS), hs) {
4030         INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);
4031
4032         hs.InstanceCount = tcs_prog_data->instances - 1;
4033         hs.IncludeVertexHandles = true;
4034
4035         hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
4036
4037#if GFX_VER >= 9
4038         hs.DispatchMode = vue_prog_data->dispatch_mode;
4039         hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id;
4040#endif
4041      }
4042   }
4043}
4044
4045static const struct brw_tracked_state genX(hs_state) = {
4046   .dirty = {
4047      .mesa  = 0,
4048      .brw   = BRW_NEW_BATCH |
4049               BRW_NEW_BLORP |
4050               BRW_NEW_TCS_PROG_DATA |
4051               BRW_NEW_TESS_PROGRAMS,
4052   },
4053   .emit = genX(upload_hs_state),
4054};
4055
4056static void
4057genX(upload_ds_state)(struct brw_context *brw)
4058{
4059   const struct intel_device_info *devinfo = &brw->screen->devinfo;
4060   const struct brw_stage_state *stage_state = &brw->tes.base;
4061   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
4062
4063   /* BRW_NEW_TES_PROG_DATA */
4064   const struct brw_tes_prog_data *tes_prog_data =
4065      brw_tes_prog_data(stage_prog_data);
4066   const struct brw_vue_prog_data *vue_prog_data =
4067      brw_vue_prog_data(stage_prog_data);
4068
4069   if (!tes_prog_data) {
4070      brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
4071   } else {
4072      assert(GFX_VER < 11 ||
4073             vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
4074
4075      brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
4076         INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
4077
4078        ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
4079        ds.ComputeWCoordinateEnable =
4080           tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
4081
4082#if GFX_VER >= 8
4083        if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
4084           ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
4085        ds.UserClipDistanceCullTestEnableBitmask =
4086            vue_prog_data->cull_distance_mask;
4087#endif
4088      }
4089   }
4090}
4091
4092static const struct brw_tracked_state genX(ds_state) = {
4093   .dirty = {
4094      .mesa  = 0,
4095      .brw   = BRW_NEW_BATCH |
4096               BRW_NEW_BLORP |
4097               BRW_NEW_TESS_PROGRAMS |
4098               BRW_NEW_TES_PROG_DATA,
4099   },
4100   .emit = genX(upload_ds_state),
4101};
4102
4103/* ---------------------------------------------------------------------- */
4104
4105static void
4106upload_te_state(struct brw_context *brw)
4107{
4108   /* BRW_NEW_TESS_PROGRAMS */
4109   bool active = brw->programs[MESA_SHADER_TESS_EVAL];
4110
4111   /* BRW_NEW_TES_PROG_DATA */
4112   const struct brw_tes_prog_data *tes_prog_data =
4113      brw_tes_prog_data(brw->tes.base.prog_data);
4114
4115   if (active) {
4116      brw_batch_emit(brw, GENX(3DSTATE_TE), te) {
4117         te.Partitioning = tes_prog_data->partitioning;
4118         te.OutputTopology = tes_prog_data->output_topology;
4119         te.TEDomain = tes_prog_data->domain;
4120         te.TEEnable = true;
4121         te.MaximumTessellationFactorOdd = 63.0;
4122         te.MaximumTessellationFactorNotOdd = 64.0;
4123      }
4124   } else {
4125      brw_batch_emit(brw, GENX(3DSTATE_TE), te);
4126   }
4127}
4128
4129static const struct brw_tracked_state genX(te_state) = {
4130   .dirty = {
4131      .mesa  = 0,
4132      .brw   = BRW_NEW_BLORP |
4133               BRW_NEW_CONTEXT |
4134               BRW_NEW_TES_PROG_DATA |
4135               BRW_NEW_TESS_PROGRAMS,
4136   },
4137   .emit = upload_te_state,
4138};
4139
4140/* ---------------------------------------------------------------------- */
4141
4142static void
4143genX(upload_tes_push_constants)(struct brw_context *brw)
4144{
4145   struct brw_stage_state *stage_state = &brw->tes.base;
4146   /* BRW_NEW_TESS_PROGRAMS */
4147   const struct gl_program *tep = brw->programs[MESA_SHADER_TESS_EVAL];
4148
4149   /* BRW_NEW_TES_PROG_DATA */
4150   const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
4151   gfx6_upload_push_constants(brw, tep, prog_data, stage_state);
4152}
4153
4154static const struct brw_tracked_state genX(tes_push_constants) = {
4155   .dirty = {
4156      .mesa  = _NEW_PROGRAM_CONSTANTS,
4157      .brw   = BRW_NEW_BATCH |
4158               BRW_NEW_BLORP |
4159               BRW_NEW_TESS_PROGRAMS |
4160               BRW_NEW_TES_PROG_DATA,
4161   },
4162   .emit = genX(upload_tes_push_constants),
4163};
4164
4165static void
4166genX(upload_tcs_push_constants)(struct brw_context *brw)
4167{
4168   struct brw_stage_state *stage_state = &brw->tcs.base;
4169   /* BRW_NEW_TESS_PROGRAMS */
4170   const struct gl_program *tcp = brw->programs[MESA_SHADER_TESS_CTRL];
4171
4172   /* BRW_NEW_TCS_PROG_DATA */
4173   const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
4174
4175   gfx6_upload_push_constants(brw, tcp, prog_data, stage_state);
4176}
4177
4178static const struct brw_tracked_state genX(tcs_push_constants) = {
4179   .dirty = {
4180      .mesa  = _NEW_PROGRAM_CONSTANTS,
4181      .brw   = BRW_NEW_BATCH |
4182               BRW_NEW_BLORP |
4183               BRW_NEW_DEFAULT_TESS_LEVELS |
4184               BRW_NEW_TESS_PROGRAMS |
4185               BRW_NEW_TCS_PROG_DATA,
4186   },
4187   .emit = genX(upload_tcs_push_constants),
4188};
4189
4190#endif
4191
4192/* ---------------------------------------------------------------------- */
4193
4194#if GFX_VER >= 7
4195static void
4196genX(upload_cs_push_constants)(struct brw_context *brw)
4197{
4198   struct brw_stage_state *stage_state = &brw->cs.base;
4199
4200   /* BRW_NEW_COMPUTE_PROGRAM */
4201   const struct gl_program *cp = brw->programs[MESA_SHADER_COMPUTE];
4202
4203   if (cp) {
4204      /* BRW_NEW_CS_PROG_DATA */
4205      struct brw_cs_prog_data *cs_prog_data =
4206         brw_cs_prog_data(brw->cs.base.prog_data);
4207
4208      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4209      brw_upload_cs_push_constants(brw, cp, cs_prog_data, stage_state);
4210   }
4211}
4212
4213const struct brw_tracked_state genX(cs_push_constants) = {
4214   .dirty = {
4215      .mesa = _NEW_PROGRAM_CONSTANTS,
4216      .brw = BRW_NEW_BATCH |
4217             BRW_NEW_BLORP |
4218             BRW_NEW_COMPUTE_PROGRAM |
4219             BRW_NEW_CS_PROG_DATA,
4220   },
4221   .emit = genX(upload_cs_push_constants),
4222};
4223
4224/**
4225 * Creates a new CS constant buffer reflecting the current CS program's
4226 * constants, if needed by the CS program.
4227 */
4228static void
4229genX(upload_cs_pull_constants)(struct brw_context *brw)
4230{
4231   struct brw_stage_state *stage_state = &brw->cs.base;
4232
4233   /* BRW_NEW_COMPUTE_PROGRAM */
4234   struct brw_program *cp =
4235      (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE];
4236
4237   /* BRW_NEW_CS_PROG_DATA */
4238   const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data;
4239
4240   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4241   /* _NEW_PROGRAM_CONSTANTS */
4242   brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program,
4243                             stage_state, prog_data);
4244}
4245
4246const struct brw_tracked_state genX(cs_pull_constants) = {
4247   .dirty = {
4248      .mesa = _NEW_PROGRAM_CONSTANTS,
4249      .brw = BRW_NEW_BATCH |
4250             BRW_NEW_BLORP |
4251             BRW_NEW_COMPUTE_PROGRAM |
4252             BRW_NEW_CS_PROG_DATA,
4253   },
4254   .emit = genX(upload_cs_pull_constants),
4255};
4256
4257static void
4258genX(upload_cs_state)(struct brw_context *brw)
4259{
4260   if (!brw->cs.base.prog_data)
4261      return;
4262
4263   uint32_t offset;
4264   uint32_t *desc = (uint32_t*) brw_state_batch(
4265      brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64,
4266      &offset);
4267
4268   struct brw_stage_state *stage_state = &brw->cs.base;
4269   struct brw_stage_prog_data *prog_data = stage_state->prog_data;
4270   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
4271   const struct intel_device_info *devinfo = &brw->screen->devinfo;
4272
4273   const struct brw_cs_dispatch_info dispatch =
4274      brw_cs_get_dispatch_info(devinfo, cs_prog_data, brw->compute.group_size);
4275
4276   if (INTEL_DEBUG(DEBUG_SHADER_TIME)) {
4277      brw_emit_buffer_surface_state(
4278         brw, &stage_state->surf_offset[
4279                 prog_data->binding_table.shader_time_start],
4280         brw->shader_time.bo, 0, ISL_FORMAT_RAW,
4281         brw->shader_time.bo->size, 1,
4282         RELOC_WRITE);
4283   }
4284
4285   uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
4286                                    32, &stage_state->bind_bo_offset);
4287
4288   /* The MEDIA_VFE_STATE documentation for Gfx8+ says:
4289    *
4290    * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
4291    *  the only bits that are changed are scoreboard related: Scoreboard
4292    *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
4293    *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
4294    *
4295    * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
4296    * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
4297    */
4298   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL);
4299
4300   brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) {
4301      if (prog_data->total_scratch) {
4302         uint32_t per_thread_scratch_value;
4303
4304         if (GFX_VER >= 8) {
4305            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
4306             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
4307             */
4308            per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 11;
4309         } else if (GFX_VERx10 == 75) {
4310            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
4311             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
4312             */
4313            per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 12;
4314         } else {
4315            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
4316             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
4317             */
4318            per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
4319         }
4320         vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
4321         vfe.PerThreadScratchSpace = per_thread_scratch_value;
4322      }
4323
4324      vfe.MaximumNumberofThreads =
4325         devinfo->max_cs_threads * devinfo->subslice_total - 1;
4326      vfe.NumberofURBEntries = GFX_VER >= 8 ? 2 : 0;
4327#if GFX_VER < 11
4328      vfe.ResetGatewayTimer =
4329         Resettingrelativetimerandlatchingtheglobaltimestamp;
4330#endif
4331#if GFX_VER < 9
4332      vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
4333#endif
4334#if GFX_VER == 7
4335      vfe.GPGPUMode = true;
4336#endif
4337
4338      /* We are uploading duplicated copies of push constant uniforms for each
4339       * thread. Although the local id data needs to vary per thread, it won't
4340       * change for other uniform data. Unfortunately this duplication is
4341       * required for gfx7. As of Haswell, this duplication can be avoided,
4342       * but this older mechanism with duplicated data continues to work.
4343       *
4344       * FINISHME: As of Haswell, we could make use of the
4345       * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
4346       * field to only store one copy of uniform data.
4347       *
4348       * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
4349       * which is described in the GPGPU_WALKER command and in the Broadwell
4350       * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
4351       * Operations => GPGPU Mode => Indirect Payload Storage.
4352       *
4353       * Note: The constant data is built in brw_upload_cs_push_constants
4354       * below.
4355       */
4356      vfe.URBEntryAllocationSize = GFX_VER >= 8 ? 2 : 0;
4357
4358      const uint32_t vfe_curbe_allocation =
4359         ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads +
4360               cs_prog_data->push.cross_thread.regs, 2);
4361      vfe.CURBEAllocationSize = vfe_curbe_allocation;
4362   }
4363
4364   const unsigned push_const_size =
4365      brw_cs_push_const_total_size(cs_prog_data, dispatch.threads);
4366   if (push_const_size > 0) {
4367      brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
4368         curbe.CURBETotalDataLength = ALIGN(push_const_size, 64);
4369         curbe.CURBEDataStartAddress = stage_state->push_const_offset;
4370      }
4371   }
4372
4373   /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
4374   memcpy(bind, stage_state->surf_offset,
4375          prog_data->binding_table.size_bytes);
4376   const uint64_t ksp = brw->cs.base.prog_offset +
4377                        brw_cs_prog_data_prog_offset(cs_prog_data,
4378                                                     dispatch.simd_size);
4379   const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
4380      .KernelStartPointer = ksp,
4381      .SamplerStatePointer = stage_state->sampler_offset,
4382      /* Wa_1606682166 */
4383      .SamplerCount = GFX_VER == 11 ? 0 :
4384                      DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
4385      .BindingTablePointer = stage_state->bind_bo_offset,
4386      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
4387      .NumberofThreadsinGPGPUThreadGroup = dispatch.threads,
4388      .SharedLocalMemorySize = encode_slm_size(GFX_VER,
4389                                               prog_data->total_shared),
4390      .BarrierEnable = cs_prog_data->uses_barrier,
4391#if GFX_VERx10 >= 75
4392      .CrossThreadConstantDataReadLength =
4393         cs_prog_data->push.cross_thread.regs,
4394#endif
4395   };
4396
4397   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd);
4398
4399   brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
4400      load.InterfaceDescriptorTotalLength =
4401         GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
4402      load.InterfaceDescriptorDataStartAddress = offset;
4403   }
4404}
4405
4406static const struct brw_tracked_state genX(cs_state) = {
4407   .dirty = {
4408      .mesa = _NEW_PROGRAM_CONSTANTS,
4409      .brw = BRW_NEW_BATCH |
4410             BRW_NEW_BLORP |
4411             BRW_NEW_CS_PROG_DATA |
4412             BRW_NEW_SAMPLER_STATE_TABLE |
4413             BRW_NEW_SURFACES,
4414   },
4415   .emit = genX(upload_cs_state)
4416};
4417
/* Register offsets that prepare_indirect_gpgpu_walker() loads with the
 * three consecutive dwords read from the indirect work-group-count buffer.
 */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

/* Register offsets of the MI_PREDICATE sources.  The code programming them
 * treats each as 64 bits wide: low dword at +0, high dword at +4.
 */
#define MI_PREDICATE_SRC0  0x2400
#define MI_PREDICATE_SRC1  0x2408
4424
4425static void
4426prepare_indirect_gpgpu_walker(struct brw_context *brw)
4427{
4428   GLintptr indirect_offset = brw->compute.num_work_groups_offset;
4429   struct brw_bo *bo = brw->compute.num_work_groups_bo;
4430
4431   emit_lrm(brw, GPGPU_DISPATCHDIMX, ro_bo(bo, indirect_offset + 0));
4432   emit_lrm(brw, GPGPU_DISPATCHDIMY, ro_bo(bo, indirect_offset + 4));
4433   emit_lrm(brw, GPGPU_DISPATCHDIMZ, ro_bo(bo, indirect_offset + 8));
4434
4435#if GFX_VER <= 7
4436   /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
4437   emit_lri(brw, MI_PREDICATE_SRC0 + 4, 0);
4438   emit_lri(brw, MI_PREDICATE_SRC1    , 0);
4439   emit_lri(brw, MI_PREDICATE_SRC1 + 4, 0);
4440
4441   /* Load compute_dispatch_indirect_x_size into SRC0 */
4442   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 0));
4443
4444   /* predicate = (compute_dispatch_indirect_x_size == 0); */
4445   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4446      mip.LoadOperation    = LOAD_LOAD;
4447      mip.CombineOperation = COMBINE_SET;
4448      mip.CompareOperation = COMPARE_SRCS_EQUAL;
4449   }
4450
4451   /* Load compute_dispatch_indirect_y_size into SRC0 */
4452   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 4));
4453
4454   /* predicate |= (compute_dispatch_indirect_y_size == 0); */
4455   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4456      mip.LoadOperation    = LOAD_LOAD;
4457      mip.CombineOperation = COMBINE_OR;
4458      mip.CompareOperation = COMPARE_SRCS_EQUAL;
4459   }
4460
4461   /* Load compute_dispatch_indirect_z_size into SRC0 */
4462   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 8));
4463
4464   /* predicate |= (compute_dispatch_indirect_z_size == 0); */
4465   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4466      mip.LoadOperation    = LOAD_LOAD;
4467      mip.CombineOperation = COMBINE_OR;
4468      mip.CompareOperation = COMPARE_SRCS_EQUAL;
4469   }
4470
4471   /* predicate = !predicate; */
4472#define COMPARE_FALSE                           1
4473   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
4474      mip.LoadOperation    = LOAD_LOADINV;
4475      mip.CombineOperation = COMBINE_OR;
4476      mip.CompareOperation = COMPARE_FALSE;
4477   }
4478#endif
4479}
4480
4481static void
4482genX(emit_gpgpu_walker)(struct brw_context *brw)
4483{
4484   const GLuint *num_groups = brw->compute.num_work_groups;
4485
4486   bool indirect = brw->compute.num_work_groups_bo != NULL;
4487   if (indirect)
4488      prepare_indirect_gpgpu_walker(brw);
4489
4490   const struct brw_cs_dispatch_info dispatch =
4491      brw_cs_get_dispatch_info(&brw->screen->devinfo,
4492                               brw_cs_prog_data(brw->cs.base.prog_data),
4493                               brw->compute.group_size);
4494
4495   brw_batch_emit(brw, GENX(GPGPU_WALKER), ggw) {
4496      ggw.IndirectParameterEnable      = indirect;
4497      ggw.PredicateEnable              = GFX_VER <= 7 && indirect;
4498      ggw.SIMDSize                     = dispatch.simd_size / 16;
4499      ggw.ThreadDepthCounterMaximum    = 0;
4500      ggw.ThreadHeightCounterMaximum   = 0;
4501      ggw.ThreadWidthCounterMaximum    = dispatch.threads - 1;
4502      ggw.ThreadGroupIDXDimension      = num_groups[0];
4503      ggw.ThreadGroupIDYDimension      = num_groups[1];
4504      ggw.ThreadGroupIDZDimension      = num_groups[2];
4505      ggw.RightExecutionMask           = dispatch.right_mask;
4506      ggw.BottomExecutionMask          = 0xffffffff;
4507   }
4508
4509   brw_batch_emit(brw, GENX(MEDIA_STATE_FLUSH), msf);
4510}
4511
4512#endif
4513
4514/* ---------------------------------------------------------------------- */
4515
4516#if GFX_VER >= 8
4517static void
4518genX(upload_raster)(struct brw_context *brw)
4519{
4520   const struct gl_context *ctx = &brw->ctx;
4521
4522   /* _NEW_BUFFERS */
4523   const bool flip_y = ctx->DrawBuffer->FlipY;
4524
4525   /* _NEW_POLYGON */
4526   const struct gl_polygon_attrib *polygon = &ctx->Polygon;
4527
4528   /* _NEW_POINT */
4529   const struct gl_point_attrib *point = &ctx->Point;
4530
4531   brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
4532      if (brw->polygon_front_bit != flip_y)
4533         raster.FrontWinding = CounterClockwise;
4534
4535      if (polygon->CullFlag) {
4536         switch (polygon->CullFaceMode) {
4537         case GL_FRONT:
4538            raster.CullMode = CULLMODE_FRONT;
4539            break;
4540         case GL_BACK:
4541            raster.CullMode = CULLMODE_BACK;
4542            break;
4543         case GL_FRONT_AND_BACK:
4544            raster.CullMode = CULLMODE_BOTH;
4545            break;
4546         default:
4547            unreachable("not reached");
4548         }
4549      } else {
4550         raster.CullMode = CULLMODE_NONE;
4551      }
4552
4553      raster.SmoothPointEnable = point->SmoothFlag;
4554
4555      raster.DXMultisampleRasterizationEnable =
4556         _mesa_is_multisample_enabled(ctx);
4557
4558      raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill;
4559      raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine;
4560      raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint;
4561
4562      switch (polygon->FrontMode) {
4563      case GL_FILL:
4564         raster.FrontFaceFillMode = FILL_MODE_SOLID;
4565         break;
4566      case GL_LINE:
4567         raster.FrontFaceFillMode = FILL_MODE_WIREFRAME;
4568         break;
4569      case GL_POINT:
4570         raster.FrontFaceFillMode = FILL_MODE_POINT;
4571         break;
4572      default:
4573         unreachable("not reached");
4574      }
4575
4576      switch (polygon->BackMode) {
4577      case GL_FILL:
4578         raster.BackFaceFillMode = FILL_MODE_SOLID;
4579         break;
4580      case GL_LINE:
4581         raster.BackFaceFillMode = FILL_MODE_WIREFRAME;
4582         break;
4583      case GL_POINT:
4584         raster.BackFaceFillMode = FILL_MODE_POINT;
4585         break;
4586      default:
4587         unreachable("not reached");
4588      }
4589
4590      /* _NEW_LINE */
4591      raster.AntialiasingEnable = ctx->Line.SmoothFlag;
4592
4593#if GFX_VER == 10
4594      /* _NEW_BUFFERS
4595       * Antialiasing Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
4596       */
4597      const bool multisampled_fbo =
4598         _mesa_geometric_samples(ctx->DrawBuffer) > 1;
4599      if (multisampled_fbo)
4600         raster.AntialiasingEnable = false;
4601#endif
4602
4603      /* _NEW_SCISSOR */
4604      raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
4605
4606      /* _NEW_TRANSFORM */
4607#if GFX_VER < 9
4608      if (!(ctx->Transform.DepthClampNear &&
4609            ctx->Transform.DepthClampFar))
4610         raster.ViewportZClipTestEnable = true;
4611#endif
4612
4613#if GFX_VER >= 9
4614      if (!ctx->Transform.DepthClampNear)
4615         raster.ViewportZNearClipTestEnable = true;
4616
4617      if (!ctx->Transform.DepthClampFar)
4618         raster.ViewportZFarClipTestEnable = true;
4619#endif
4620
4621      /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
4622#if GFX_VER >= 9
4623      raster.ConservativeRasterizationEnable =
4624         ctx->IntelConservativeRasterization;
4625#endif
4626
4627      raster.GlobalDepthOffsetClamp = polygon->OffsetClamp;
4628      raster.GlobalDepthOffsetScale = polygon->OffsetFactor;
4629
4630      raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2;
4631   }
4632}
4633
4634static const struct brw_tracked_state genX(raster_state) = {
4635   .dirty = {
4636      .mesa  = _NEW_BUFFERS |
4637               _NEW_LINE |
4638               _NEW_MULTISAMPLE |
4639               _NEW_POINT |
4640               _NEW_POLYGON |
4641               _NEW_SCISSOR |
4642               _NEW_TRANSFORM,
4643      .brw   = BRW_NEW_BLORP |
4644               BRW_NEW_CONTEXT |
4645               BRW_NEW_CONSERVATIVE_RASTERIZATION,
4646   },
4647   .emit = genX(upload_raster),
4648};
4649#endif
4650
4651/* ---------------------------------------------------------------------- */
4652
4653#if GFX_VER >= 8
4654static void
4655genX(upload_ps_extra)(struct brw_context *brw)
4656{
4657   UNUSED struct gl_context *ctx = &brw->ctx;
4658
4659   const struct brw_wm_prog_data *prog_data =
4660      brw_wm_prog_data(brw->wm.base.prog_data);
4661
4662   brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) {
4663      psx.PixelShaderValid = true;
4664      psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
4665      psx.PixelShaderKillsPixel = prog_data->uses_kill;
4666      psx.AttributeEnable = prog_data->num_varying_inputs != 0;
4667      psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth;
4668      psx.PixelShaderUsesSourceW = prog_data->uses_src_w;
4669      psx.PixelShaderIsPerSample = prog_data->persample_dispatch;
4670
4671      /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
4672      if (prog_data->uses_sample_mask) {
4673#if GFX_VER >= 9
4674         if (prog_data->post_depth_coverage)
4675            psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
4676         else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
4677            psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
4678         else
4679            psx.InputCoverageMaskState = ICMS_NORMAL;
4680#else
4681         psx.PixelShaderUsesInputCoverageMask = true;
4682#endif
4683      }
4684
4685      psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
4686#if GFX_VER >= 9
4687      psx.PixelShaderPullsBary = prog_data->pulls_bary;
4688      psx.PixelShaderComputesStencil = prog_data->computed_stencil;
4689#endif
4690
4691      /* The stricter cross-primitive coherency guarantees that the hardware
4692       * gives us with the "Accesses UAV" bit set for at least one shader stage
4693       * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
4694       * are redundant within the current image, atomic counter and SSBO GL
4695       * APIs, which all have very loose ordering and coherency requirements
4696       * and generally rely on the application to insert explicit barriers when
4697       * a shader invocation is expected to see the memory writes performed by
4698       * the invocations of some previous primitive.  Regardless of the value
4699       * of "UAV coherency required", the "Accesses UAV" bits will implicitly
4700       * cause an in most cases useless DC flush when the lowermost stage with
4701       * the bit set finishes execution.
4702       *
4703       * It would be nice to disable it, but in some cases we can't because on
4704       * Gfx8+ it also has an influence on rasterization via the PS UAV-only
4705       * signal (which could be set independently from the coherency mechanism
4706       * in the 3DSTATE_WM command on Gfx7), and because in some cases it will
4707       * determine whether the hardware skips execution of the fragment shader
4708       * or not via the ThreadDispatchEnable signal.  However if we know that
4709       * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
4710       * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
4711       * difference so we may just disable it here.
4712       *
4713       * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't
4714       * take into account KillPixels when no depth or stencil writes are
4715       * enabled.  In order for occlusion queries to work correctly with no
4716       * attachments, we need to force-enable here.
4717       *
4718       * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
4719       * _NEW_COLOR
4720       */
4721      if ((prog_data->has_side_effects || prog_data->uses_kill) &&
4722          !brw_color_buffer_write_enabled(brw))
4723         psx.PixelShaderHasUAV = true;
4724   }
4725}
4726
4727const struct brw_tracked_state genX(ps_extra) = {
4728   .dirty = {
4729      .mesa  = _NEW_BUFFERS | _NEW_COLOR,
4730      .brw   = BRW_NEW_BLORP |
4731               BRW_NEW_CONTEXT |
4732               BRW_NEW_FRAGMENT_PROGRAM |
4733               BRW_NEW_FS_PROG_DATA |
4734               BRW_NEW_CONSERVATIVE_RASTERIZATION,
4735   },
4736   .emit = genX(upload_ps_extra),
4737};
4738#endif
4739
4740/* ---------------------------------------------------------------------- */
4741
4742#if GFX_VER >= 8
4743static void
4744genX(upload_ps_blend)(struct brw_context *brw)
4745{
4746   struct gl_context *ctx = &brw->ctx;
4747
4748   /* _NEW_BUFFERS */
4749   struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
4750   const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1;
4751
4752   /* _NEW_COLOR */
4753   struct gl_colorbuffer_attrib *color = &ctx->Color;
4754
4755   brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) {
4756      /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
4757      pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);
4758
4759      bool alpha_to_one = false;
4760
4761      if (!buffer0_is_integer) {
4762         /* _NEW_MULTISAMPLE */
4763
4764         if (_mesa_is_multisample_enabled(ctx)) {
4765            pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
4766            alpha_to_one = ctx->Multisample.SampleAlphaToOne;
4767         }
4768
4769         pb.AlphaTestEnable = color->AlphaEnabled;
4770      }
4771
4772      /* Used for implementing the following bit of GL_EXT_texture_integer:
4773       * "Per-fragment operations that require floating-point color
4774       *  components, including multisample alpha operations, alpha test,
4775       *  blending, and dithering, have no effect when the corresponding
4776       *  colors are written to an integer color buffer."
4777       *
4778       * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
4779       * "If drawbuffer zero is not NONE and the buffer it references has an
4780       *  integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
4781       *  operations are skipped."
4782       */
4783      if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) {
4784         GLenum eqRGB = color->Blend[0].EquationRGB;
4785         GLenum eqA = color->Blend[0].EquationA;
4786         GLenum srcRGB = color->Blend[0].SrcRGB;
4787         GLenum dstRGB = color->Blend[0].DstRGB;
4788         GLenum srcA = color->Blend[0].SrcA;
4789         GLenum dstA = color->Blend[0].DstA;
4790
4791         if (eqRGB == GL_MIN || eqRGB == GL_MAX)
4792            srcRGB = dstRGB = GL_ONE;
4793
4794         if (eqA == GL_MIN || eqA == GL_MAX)
4795            srcA = dstA = GL_ONE;
4796
4797         /* Due to hardware limitations, the destination may have information
4798          * in an alpha channel even when the format specifies no alpha
4799          * channel. In order to avoid getting any incorrect blending due to
4800          * that alpha channel, coerce the blend factors to values that will
4801          * not read the alpha channel, but will instead use the correct
4802          * implicit value for alpha.
4803          */
4804         if (!_mesa_base_format_has_channel(rb->_BaseFormat,
4805                                            GL_TEXTURE_ALPHA_TYPE)) {
4806            srcRGB = brw_fix_xRGB_alpha(srcRGB);
4807            srcA = brw_fix_xRGB_alpha(srcA);
4808            dstRGB = brw_fix_xRGB_alpha(dstRGB);
4809            dstA = brw_fix_xRGB_alpha(dstA);
4810         }
4811
4812         /* Alpha to One doesn't work with Dual Color Blending.  Override
4813          * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
4814          */
4815         if (alpha_to_one && color->_BlendUsesDualSrc & 0x1) {
4816            srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
4817            srcA = fix_dual_blend_alpha_to_one(srcA);
4818            dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
4819            dstA = fix_dual_blend_alpha_to_one(dstA);
4820         }
4821
4822         /* BRW_NEW_FS_PROG_DATA */
4823         const struct brw_wm_prog_data *wm_prog_data =
4824            brw_wm_prog_data(brw->wm.base.prog_data);
4825
4826         /* The Dual Source Blending documentation says:
4827          *
4828          * "If SRC1 is included in a src/dst blend factor and
4829          * a DualSource RT Write message is not used, results
4830          * are UNDEFINED. (This reflects the same restriction in DX APIs,
4831          * where undefined results are produced if “o1” is not written
4832          * by a PS – there are no default values defined).
4833          * If SRC1 is not included in a src/dst blend factor,
4834          * dual source blending must be disabled."
4835          *
4836          * There is no way to gracefully fix this undefined situation
4837          * so we just disable the blending to prevent possible issues.
4838          */
4839         pb.ColorBufferBlendEnable =
4840            !(color->_BlendUsesDualSrc & 0x1) || wm_prog_data->dual_src_blend;
4841         pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
4842         pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
4843         pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
4844         pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB);
4845
4846         pb.IndependentAlphaBlendEnable =
4847            srcA != srcRGB || dstA != dstRGB || eqA != eqRGB;
4848      }
4849   }
4850}
4851
4852static const struct brw_tracked_state genX(ps_blend) = {
4853   .dirty = {
4854      .mesa = _NEW_BUFFERS |
4855              _NEW_COLOR |
4856              _NEW_MULTISAMPLE,
4857      .brw = BRW_NEW_BLORP |
4858             BRW_NEW_CONTEXT |
4859             BRW_NEW_FRAGMENT_PROGRAM |
4860             BRW_NEW_FS_PROG_DATA,
4861   },
4862   .emit = genX(upload_ps_blend)
4863};
4864#endif
4865
4866/* ---------------------------------------------------------------------- */
4867
4868#if GFX_VER >= 8
4869static void
4870genX(emit_vf_topology)(struct brw_context *brw)
4871{
4872   brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) {
4873      vftopo.PrimitiveTopologyType = brw->primitive;
4874   }
4875}
4876
4877static const struct brw_tracked_state genX(vf_topology) = {
4878   .dirty = {
4879      .mesa = 0,
4880      .brw = BRW_NEW_BLORP |
4881             BRW_NEW_PRIMITIVE,
4882   },
4883   .emit = genX(emit_vf_topology),
4884};
4885#endif
4886
4887/* ---------------------------------------------------------------------- */
4888
4889#if GFX_VER >= 7
4890static void
4891genX(emit_mi_report_perf_count)(struct brw_context *brw,
4892                                struct brw_bo *bo,
4893                                uint32_t offset_in_bytes,
4894                                uint32_t report_id)
4895{
4896   brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
4897      mi_rpc.MemoryAddress = ggtt_bo(bo, offset_in_bytes);
4898      mi_rpc.ReportID = report_id;
4899   }
4900}
4901#endif
4902
4903/* ---------------------------------------------------------------------- */
4904
4905/**
4906 * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
4907 */
4908static void
4909genX(emit_sampler_state_pointers_xs)(UNUSED struct brw_context *brw,
4910                                     UNUSED struct brw_stage_state *stage_state)
4911{
4912#if GFX_VER >= 7
4913   static const uint16_t packet_headers[] = {
4914      [MESA_SHADER_VERTEX] = 43,
4915      [MESA_SHADER_TESS_CTRL] = 44,
4916      [MESA_SHADER_TESS_EVAL] = 45,
4917      [MESA_SHADER_GEOMETRY] = 46,
4918      [MESA_SHADER_FRAGMENT] = 47,
4919   };
4920
4921   /* Ivybridge requires a workaround flush before VS packets. */
4922   if (GFX_VERx10 == 70 &&
4923       stage_state->stage == MESA_SHADER_VERTEX) {
4924      gfx7_emit_vs_workaround_flush(brw);
4925   }
4926
4927   brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
4928      ptr._3DCommandSubOpcode = packet_headers[stage_state->stage];
4929      ptr.PointertoVSSamplerState = stage_state->sampler_offset;
4930   }
4931#endif
4932}
4933
4934UNUSED static bool
4935has_component(mesa_format format, int i)
4936{
4937   if (_mesa_is_format_color_format(format))
4938      return _mesa_format_has_color_component(format, i);
4939
4940   /* depth and stencil have only one component */
4941   return i == 0;
4942}
4943
4944/**
4945 * Upload SAMPLER_BORDER_COLOR_STATE.
4946 */
4947static void
4948genX(upload_default_color)(struct brw_context *brw,
4949                           const struct gl_sampler_object *sampler,
4950                           UNUSED mesa_format format,
4951                           GLenum base_format,
4952                           bool is_integer_format, bool is_stencil_sampling,
4953                           uint32_t *sdc_offset)
4954{
4955   union gl_color_union color;
4956
4957   switch (base_format) {
4958   case GL_DEPTH_COMPONENT:
4959      /* GL specs that border color for depth textures is taken from the
4960       * R channel, while the hardware uses A.  Spam R into all the
4961       * channels for safety.
4962       */
4963      color.ui[0] = sampler->Attrib.state.border_color.ui[0];
4964      color.ui[1] = sampler->Attrib.state.border_color.ui[0];
4965      color.ui[2] = sampler->Attrib.state.border_color.ui[0];
4966      color.ui[3] = sampler->Attrib.state.border_color.ui[0];
4967      break;
4968   case GL_ALPHA:
4969      color.ui[0] = 0u;
4970      color.ui[1] = 0u;
4971      color.ui[2] = 0u;
4972      color.ui[3] = sampler->Attrib.state.border_color.ui[3];
4973      break;
4974   case GL_INTENSITY:
4975      color.ui[0] = sampler->Attrib.state.border_color.ui[0];
4976      color.ui[1] = sampler->Attrib.state.border_color.ui[0];
4977      color.ui[2] = sampler->Attrib.state.border_color.ui[0];
4978      color.ui[3] = sampler->Attrib.state.border_color.ui[0];
4979      break;
4980   case GL_LUMINANCE:
4981      color.ui[0] = sampler->Attrib.state.border_color.ui[0];
4982      color.ui[1] = sampler->Attrib.state.border_color.ui[0];
4983      color.ui[2] = sampler->Attrib.state.border_color.ui[0];
4984      color.ui[3] = float_as_int(1.0);
4985      break;
4986   case GL_LUMINANCE_ALPHA:
4987      color.ui[0] = sampler->Attrib.state.border_color.ui[0];
4988      color.ui[1] = sampler->Attrib.state.border_color.ui[0];
4989      color.ui[2] = sampler->Attrib.state.border_color.ui[0];
4990      color.ui[3] = sampler->Attrib.state.border_color.ui[3];
4991      break;
4992   default:
4993      color.ui[0] = sampler->Attrib.state.border_color.ui[0];
4994      color.ui[1] = sampler->Attrib.state.border_color.ui[1];
4995      color.ui[2] = sampler->Attrib.state.border_color.ui[2];
4996      color.ui[3] = sampler->Attrib.state.border_color.ui[3];
4997      break;
4998   }
4999
5000   /* In some cases we use an RGBA surface format for GL RGB textures,
5001    * where we've initialized the A channel to 1.0.  We also have to set
5002    * the border color alpha to 1.0 in that case.
5003    */
5004   if (base_format == GL_RGB)
5005      color.ui[3] = float_as_int(1.0);
5006
5007   int alignment = 32;
5008   if (GFX_VER >= 8) {
5009      alignment = 64;
5010   } else if (GFX_VERx10 == 75 && (is_integer_format || is_stencil_sampling)) {
5011      alignment = 512;
5012   }
5013
5014   uint32_t *sdc = brw_state_batch(
5015      brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t),
5016      alignment, sdc_offset);
5017
5018   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
5019
5020#define ASSIGN(dst, src) \
5021   do {                  \
5022      dst = src;         \
5023   } while (0)
5024
5025#define ASSIGNu16(dst, src) \
5026   do {                     \
5027      dst = (uint16_t)src;  \
5028   } while (0)
5029
5030#define ASSIGNu8(dst, src) \
5031   do {                    \
5032      dst = (uint8_t)src;  \
5033   } while (0)
5034
5035#define BORDER_COLOR_ATTR(macro, _color_type, src)              \
5036   macro(state.BorderColor ## _color_type ## Red, src[0]);   \
5037   macro(state.BorderColor ## _color_type ## Green, src[1]);   \
5038   macro(state.BorderColor ## _color_type ## Blue, src[2]);   \
5039   macro(state.BorderColor ## _color_type ## Alpha, src[3]);
5040
5041#if GFX_VER >= 8
5042   /* On Broadwell, the border color is represented as four 32-bit floats,
5043    * integers, or unsigned values, interpreted according to the surface
5044    * format.  This matches the sampler->BorderColor union exactly; just
5045    * memcpy the values.
5046    */
5047   BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui);
5048#elif GFX_VERx10 == 75
5049   if (is_integer_format || is_stencil_sampling) {
5050      bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
5051      const int bits_per_channel =
5052         _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
5053
5054      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
5055       * "If any color channel is missing from the surface format,
5056       *  corresponding border color should be programmed as zero and if
5057       *  alpha channel is missing, corresponding Alpha border color should
5058       *  be programmed as 1."
5059       */
5060      unsigned c[4] = { 0, 0, 0, 1 };
5061      for (int i = 0; i < 4; i++) {
5062         if (has_component(format, i))
5063            c[i] = color.ui[i];
5064      }
5065
5066      switch (bits_per_channel) {
5067      case 8:
5068         /* Copy RGBA in order. */
5069         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
5070         break;
5071      case 10:
5072         /* R10G10B10A2_UINT is treated like a 16-bit format. */
5073      case 16:
5074         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
5075         break;
5076      case 32:
5077         if (base_format == GL_RG) {
5078            /* Careful inspection of the tables reveals that for RG32 formats,
5079             * the green channel needs to go where blue normally belongs.
5080             */
5081            state.BorderColor32bitRed = c[0];
5082            state.BorderColor32bitBlue = c[1];
5083            state.BorderColor32bitAlpha = 1;
5084         } else {
5085            /* Copy RGBA in order. */
5086            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
5087         }
5088         break;
5089      default:
5090         assert(!"Invalid number of bits per channel in integer format.");
5091         break;
5092      }
5093   } else {
5094      BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5095   }
5096#elif GFX_VER == 5 || GFX_VER == 6
5097   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f);
5098   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f);
5099   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f);
5100
5101#define MESA_FLOAT_TO_HALF(dst, src) \
5102   dst = _mesa_float_to_half(src);
5103
5104   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f);
5105
5106#undef MESA_FLOAT_TO_HALF
5107
5108   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
5109   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
5110   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
5111   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
5112
5113   BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5114#elif GFX_VER == 4
5115   BORDER_COLOR_ATTR(ASSIGN, , color.f);
5116#else
5117   BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5118#endif
5119
5120#undef ASSIGN
5121#undef BORDER_COLOR_ATTR
5122
5123   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state);
5124}
5125
5126static uint32_t
5127translate_wrap_mode(GLenum wrap, UNUSED bool using_nearest)
5128{
5129   switch (wrap) {
5130   case GL_REPEAT:
5131      return TCM_WRAP;
5132   case GL_CLAMP:
5133#if GFX_VER >= 8
5134      /* GL_CLAMP is the weird mode where coordinates are clamped to
5135       * [0.0, 1.0], so linear filtering of coordinates outside of
5136       * [0.0, 1.0] give you half edge texel value and half border
5137       * color.
5138       *
5139       * Gfx8+ supports this natively.
5140       */
5141      return TCM_HALF_BORDER;
5142#else
5143      /* On Gfx4-7.5, we clamp the coordinates in the fragment shader
5144       * and set clamp_border here, which gets the result desired.
5145       * We just use clamp(_to_edge) for nearest, because for nearest
5146       * clamping to 1.0 gives border color instead of the desired
5147       * edge texels.
5148       */
5149      if (using_nearest)
5150         return TCM_CLAMP;
5151      else
5152         return TCM_CLAMP_BORDER;
5153#endif
5154   case GL_CLAMP_TO_EDGE:
5155      return TCM_CLAMP;
5156   case GL_CLAMP_TO_BORDER:
5157      return TCM_CLAMP_BORDER;
5158   case GL_MIRRORED_REPEAT:
5159      return TCM_MIRROR;
5160   case GL_MIRROR_CLAMP_TO_EDGE:
5161      return TCM_MIRROR_ONCE;
5162   default:
5163      return TCM_WRAP;
5164   }
5165}
5166
5167/**
5168 * Return true if the given wrap mode requires the border color to exist.
5169 */
5170static bool
5171wrap_mode_needs_border_color(unsigned wrap_mode)
5172{
5173#if GFX_VER >= 8
5174   return wrap_mode == TCM_CLAMP_BORDER ||
5175          wrap_mode == TCM_HALF_BORDER;
5176#else
5177   return wrap_mode == TCM_CLAMP_BORDER;
5178#endif
5179}
5180
5181/**
5182 * Sets the sampler state for a single unit based off of the sampler key
5183 * entry.
5184 */
5185static void
5186genX(update_sampler_state)(struct brw_context *brw,
5187                           GLenum target, bool tex_cube_map_seamless,
5188                           GLfloat tex_unit_lod_bias,
5189                           mesa_format format, GLenum base_format,
5190                           const struct gl_texture_object *texObj,
5191                           const struct gl_sampler_object *sampler,
5192                           uint32_t *sampler_state)
5193{
5194   struct GENX(SAMPLER_STATE) samp_st = { 0 };
5195
5196   /* Select min and mip filters. */
5197   switch (sampler->Attrib.MinFilter) {
5198   case GL_NEAREST:
5199      samp_st.MinModeFilter = MAPFILTER_NEAREST;
5200      samp_st.MipModeFilter = MIPFILTER_NONE;
5201      break;
5202   case GL_LINEAR:
5203      samp_st.MinModeFilter = MAPFILTER_LINEAR;
5204      samp_st.MipModeFilter = MIPFILTER_NONE;
5205      break;
5206   case GL_NEAREST_MIPMAP_NEAREST:
5207      samp_st.MinModeFilter = MAPFILTER_NEAREST;
5208      samp_st.MipModeFilter = MIPFILTER_NEAREST;
5209      break;
5210   case GL_LINEAR_MIPMAP_NEAREST:
5211      samp_st.MinModeFilter = MAPFILTER_LINEAR;
5212      samp_st.MipModeFilter = MIPFILTER_NEAREST;
5213      break;
5214   case GL_NEAREST_MIPMAP_LINEAR:
5215      samp_st.MinModeFilter = MAPFILTER_NEAREST;
5216      samp_st.MipModeFilter = MIPFILTER_LINEAR;
5217      break;
5218   case GL_LINEAR_MIPMAP_LINEAR:
5219      samp_st.MinModeFilter = MAPFILTER_LINEAR;
5220      samp_st.MipModeFilter = MIPFILTER_LINEAR;
5221      break;
5222   default:
5223      unreachable("not reached");
5224   }
5225
5226   /* Select mag filter. */
5227   samp_st.MagModeFilter = sampler->Attrib.MagFilter == GL_LINEAR ?
5228      MAPFILTER_LINEAR : MAPFILTER_NEAREST;
5229
5230   /* Enable anisotropic filtering if desired. */
5231   samp_st.MaximumAnisotropy = RATIO21;
5232
5233   if (sampler->Attrib.MaxAnisotropy > 1.0f) {
5234      if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
5235         samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
5236      if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
5237         samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
5238
5239      if (sampler->Attrib.MaxAnisotropy > 2.0f) {
5240         samp_st.MaximumAnisotropy =
5241            MIN2((sampler->Attrib.MaxAnisotropy - 2) / 2, RATIO161);
5242      }
5243   }
5244
5245   /* Set address rounding bits if not using nearest filtering. */
5246   if (samp_st.MinModeFilter != MAPFILTER_NEAREST) {
5247      samp_st.UAddressMinFilterRoundingEnable = true;
5248      samp_st.VAddressMinFilterRoundingEnable = true;
5249      samp_st.RAddressMinFilterRoundingEnable = true;
5250   }
5251
5252   if (samp_st.MagModeFilter != MAPFILTER_NEAREST) {
5253      samp_st.UAddressMagFilterRoundingEnable = true;
5254      samp_st.VAddressMagFilterRoundingEnable = true;
5255      samp_st.RAddressMagFilterRoundingEnable = true;
5256   }
5257
5258   bool either_nearest =
5259      sampler->Attrib.MinFilter == GL_NEAREST || sampler->Attrib.MagFilter == GL_NEAREST;
5260   unsigned wrap_s = translate_wrap_mode(sampler->Attrib.WrapS, either_nearest);
5261   unsigned wrap_t = translate_wrap_mode(sampler->Attrib.WrapT, either_nearest);
5262   unsigned wrap_r = translate_wrap_mode(sampler->Attrib.WrapR, either_nearest);
5263
5264   if (target == GL_TEXTURE_CUBE_MAP ||
5265       target == GL_TEXTURE_CUBE_MAP_ARRAY) {
5266      /* Cube maps must use the same wrap mode for all three coordinate
5267       * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
5268       *
5269       * Ivybridge and Baytrail seem to have problems with CUBE mode and
5270       * integer formats.  Fall back to CLAMP for now.
5271       */
5272      if ((tex_cube_map_seamless || sampler->Attrib.CubeMapSeamless) &&
5273          !(GFX_VERx10 == 70 && texObj->_IsIntegerFormat)) {
5274         wrap_s = TCM_CUBE;
5275         wrap_t = TCM_CUBE;
5276         wrap_r = TCM_CUBE;
5277      } else {
5278         wrap_s = TCM_CLAMP;
5279         wrap_t = TCM_CLAMP;
5280         wrap_r = TCM_CLAMP;
5281      }
5282   } else if (target == GL_TEXTURE_1D) {
5283      /* There's a bug in 1D texture sampling - it actually pays
5284       * attention to the wrap_t value, though it should not.
5285       * Override the wrap_t value here to GL_REPEAT to keep
5286       * any nonexistent border pixels from floating in.
5287       */
5288      wrap_t = TCM_WRAP;
5289   }
5290
5291   samp_st.TCXAddressControlMode = wrap_s;
5292   samp_st.TCYAddressControlMode = wrap_t;
5293   samp_st.TCZAddressControlMode = wrap_r;
5294
5295   samp_st.ShadowFunction =
5296      sampler->Attrib.CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ?
5297      brw_translate_shadow_compare_func(sampler->Attrib.CompareFunc) : 0;
5298
5299#if GFX_VER >= 7
5300   /* Set shadow function. */
5301   samp_st.AnisotropicAlgorithm =
5302      samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ?
5303      EWAApproximation : LEGACY;
5304#endif
5305
5306#if GFX_VER >= 6
5307   samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE;
5308#endif
5309
5310   const float hw_max_lod = GFX_VER >= 7 ? 14 : 13;
5311   samp_st.MinLOD = CLAMP(sampler->Attrib.MinLod, 0, hw_max_lod);
5312   samp_st.MaxLOD = CLAMP(sampler->Attrib.MaxLod, 0, hw_max_lod);
5313   samp_st.TextureLODBias =
5314      CLAMP(tex_unit_lod_bias + sampler->Attrib.LodBias, -16, 15);
5315
5316#if GFX_VER == 6
5317   samp_st.BaseMipLevel =
5318      CLAMP(texObj->Attrib.MinLevel + texObj->Attrib.BaseLevel, 0, hw_max_lod);
5319   samp_st.MinandMagStateNotEqual =
5320      samp_st.MinModeFilter != samp_st.MagModeFilter;
5321#endif
5322
5323   /* Upload the border color if necessary.  If not, just point it at
5324    * offset 0 (the start of the batch) - the color should be ignored,
5325    * but that address won't fault in case something reads it anyway.
5326    */
5327   uint32_t border_color_offset = 0;
5328   if (wrap_mode_needs_border_color(wrap_s) ||
5329       wrap_mode_needs_border_color(wrap_t) ||
5330       wrap_mode_needs_border_color(wrap_r)) {
5331      genX(upload_default_color)(brw, sampler, format, base_format,
5332                                 texObj->_IsIntegerFormat,
5333                                 texObj->StencilSampling,
5334                                 &border_color_offset);
5335   }
5336#if GFX_VER < 6
5337      samp_st.BorderColorPointer =
5338         ro_bo(brw->batch.state.bo, border_color_offset);
5339#else
5340      samp_st.BorderColorPointer = border_color_offset;
5341#endif
5342
5343#if GFX_VER >= 8
5344   samp_st.LODPreClampMode = CLAMP_MODE_OGL;
5345#else
5346   samp_st.LODPreClampEnable = true;
5347#endif
5348
5349   GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st);
5350}
5351
5352static void
5353update_sampler_state(struct brw_context *brw,
5354                     int unit,
5355                     uint32_t *sampler_state)
5356{
5357   struct gl_context *ctx = &brw->ctx;
5358   const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
5359   const struct gl_texture_object *texObj = texUnit->_Current;
5360   const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
5361
5362   /* These don't use samplers at all. */
5363   if (texObj->Target == GL_TEXTURE_BUFFER)
5364      return;
5365
5366   struct gl_texture_image *firstImage = texObj->Image[0][texObj->Attrib.BaseLevel];
5367   genX(update_sampler_state)(brw, texObj->Target,
5368                              ctx->Texture.CubeMapSeamless,
5369                              texUnit->LodBias,
5370                              firstImage->TexFormat, firstImage->_BaseFormat,
5371                              texObj, sampler,
5372                              sampler_state);
5373}
5374
5375static void
5376genX(upload_sampler_state_table)(struct brw_context *brw,
5377                                 struct gl_program *prog,
5378                                 struct brw_stage_state *stage_state)
5379{
5380   struct gl_context *ctx = &brw->ctx;
5381   uint32_t sampler_count = stage_state->sampler_count;
5382
5383   GLbitfield SamplersUsed = prog->SamplersUsed;
5384
5385   if (sampler_count == 0)
5386      return;
5387
5388   /* SAMPLER_STATE is 4 DWords on all platforms. */
5389   const int dwords = GENX(SAMPLER_STATE_length);
5390   const int size_in_bytes = dwords * sizeof(uint32_t);
5391
5392   uint32_t *sampler_state = brw_state_batch(brw,
5393                                             sampler_count * size_in_bytes,
5394                                             32, &stage_state->sampler_offset);
5395   /* memset(sampler_state, 0, sampler_count * size_in_bytes); */
5396
5397   for (unsigned s = 0; s < sampler_count; s++) {
5398      if (SamplersUsed & (1 << s)) {
5399         const unsigned unit = prog->SamplerUnits[s];
5400         if (ctx->Texture.Unit[unit]._Current) {
5401            update_sampler_state(brw, unit, sampler_state);
5402         }
5403      }
5404
5405      sampler_state += dwords;
5406   }
5407
5408   if (GFX_VER >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
5409      /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
5410      genX(emit_sampler_state_pointers_xs)(brw, stage_state);
5411   } else {
5412      /* Flag that the sampler state table pointer has changed; later atoms
5413       * will handle it.
5414       */
5415      brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE;
5416   }
5417}
5418
5419static void
5420genX(upload_fs_samplers)(struct brw_context *brw)
5421{
5422   /* BRW_NEW_FRAGMENT_PROGRAM */
5423   struct gl_program *fs = brw->programs[MESA_SHADER_FRAGMENT];
5424   genX(upload_sampler_state_table)(brw, fs, &brw->wm.base);
5425}
5426
5427static const struct brw_tracked_state genX(fs_samplers) = {
5428   .dirty = {
5429      .mesa = _NEW_TEXTURE,
5430      .brw = BRW_NEW_BATCH |
5431             BRW_NEW_BLORP |
5432             BRW_NEW_FRAGMENT_PROGRAM,
5433   },
5434   .emit = genX(upload_fs_samplers),
5435};
5436
5437static void
5438genX(upload_vs_samplers)(struct brw_context *brw)
5439{
5440   /* BRW_NEW_VERTEX_PROGRAM */
5441   struct gl_program *vs = brw->programs[MESA_SHADER_VERTEX];
5442   genX(upload_sampler_state_table)(brw, vs, &brw->vs.base);
5443}
5444
5445static const struct brw_tracked_state genX(vs_samplers) = {
5446   .dirty = {
5447      .mesa = _NEW_TEXTURE,
5448      .brw = BRW_NEW_BATCH |
5449             BRW_NEW_BLORP |
5450             BRW_NEW_VERTEX_PROGRAM,
5451   },
5452   .emit = genX(upload_vs_samplers),
5453};
5454
#if GFX_VER >= 6
/**
 * Upload the sampler state table for the geometry shader stage, if a
 * geometry program is present.
 */
static void
genX(upload_gs_samplers)(struct brw_context *brw)
{
   /* BRW_NEW_GEOMETRY_PROGRAM */
   struct gl_program *prog = brw->programs[MESA_SHADER_GEOMETRY];

   if (prog)
      genX(upload_sampler_state_table)(brw, prog, &brw->gs.base);
}

/* State atom: re-upload geometry samplers on texture, batch, BLORP or
 * geometry program changes.
 */
static const struct brw_tracked_state genX(gs_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_BATCH | BRW_NEW_BLORP,
   },
   .emit = genX(upload_gs_samplers),
};
#endif
5478
#if GFX_VER >= 7
/**
 * Upload the sampler state table for the tessellation control stage, if a
 * tessellation control program is present.
 */
static void
genX(upload_tcs_samplers)(struct brw_context *brw)
{
   /* BRW_NEW_TESS_PROGRAMS */
   struct gl_program *prog = brw->programs[MESA_SHADER_TESS_CTRL];

   if (prog)
      genX(upload_sampler_state_table)(brw, prog, &brw->tcs.base);
}

/* State atom: re-upload tessellation control samplers on texture, batch,
 * BLORP or tessellation program changes.
 */
static const struct brw_tracked_state genX(tcs_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_TESS_PROGRAMS | BRW_NEW_BATCH | BRW_NEW_BLORP,
   },
   .emit = genX(upload_tcs_samplers),
};
#endif
5501
#if GFX_VER >= 7
/**
 * Upload the sampler state table for the tessellation evaluation stage, if
 * a tessellation evaluation program is present.
 */
static void
genX(upload_tes_samplers)(struct brw_context *brw)
{
   /* BRW_NEW_TESS_PROGRAMS */
   struct gl_program *prog = brw->programs[MESA_SHADER_TESS_EVAL];

   if (prog)
      genX(upload_sampler_state_table)(brw, prog, &brw->tes.base);
}

/* State atom: re-upload tessellation evaluation samplers on texture, batch,
 * BLORP or tessellation program changes.
 */
static const struct brw_tracked_state genX(tes_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_TESS_PROGRAMS | BRW_NEW_BATCH | BRW_NEW_BLORP,
   },
   .emit = genX(upload_tes_samplers),
};
#endif
5524
#if GFX_VER >= 7
/**
 * Upload the sampler state table for the compute stage, if a compute
 * program is present.
 */
static void
genX(upload_cs_samplers)(struct brw_context *brw)
{
   /* BRW_NEW_COMPUTE_PROGRAM */
   struct gl_program *prog = brw->programs[MESA_SHADER_COMPUTE];

   if (prog)
      genX(upload_sampler_state_table)(brw, prog, &brw->cs.base);
}

/* State atom: re-upload compute samplers on texture, batch, BLORP or
 * compute program changes.  Unlike the other sampler atoms in this file
 * this one has external linkage.
 */
const struct brw_tracked_state genX(cs_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_COMPUTE_PROGRAM | BRW_NEW_BATCH | BRW_NEW_BLORP,
   },
   .emit = genX(upload_cs_samplers),
};
#endif
5547
5548/* ---------------------------------------------------------------------- */
5549
5550#if GFX_VER <= 5
5551
5552static void genX(upload_blend_constant_color)(struct brw_context *brw)
5553{
5554   struct gl_context *ctx = &brw->ctx;
5555
5556   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
5557      blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
5558      blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
5559      blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
5560      blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
5561   }
5562}
5563
5564static const struct brw_tracked_state genX(blend_constant_color) = {
5565   .dirty = {
5566      .mesa = _NEW_COLOR,
5567      .brw = BRW_NEW_CONTEXT |
5568             BRW_NEW_BLORP,
5569   },
5570   .emit = genX(upload_blend_constant_color)
5571};
5572#endif
5573
5574/* ---------------------------------------------------------------------- */
5575
/**
 * Install the state atom lists for this hardware generation.
 *
 * Selects, at compile time, the render_atoms[] table matching GFX_VER and
 * hands it to brw_copy_pipeline_atoms() for BRW_RENDER_PIPELINE.  On
 * GFX_VER >= 7 it also registers compute_atoms[] for BRW_COMPUTE_PIPELINE
 * and fills in the emit_mi_report_perf_count / emit_compute_walker vtbl
 * hooks.
 *
 * The inline comments in the tables record ordering constraints between
 * atoms, so entries should not be reordered casually.
 */
void
genX(init_atoms)(struct brw_context *brw)
{
#if GFX_VER < 6
   /* Render atoms for Gfx4/5. */
   static const struct brw_tracked_state *render_atoms[] =
   {
      &genX(vf_statistics),

      /* Once all the programs are done, we know how large urb entry
       * sizes need to be and can decide if we need to change the urb
       * layout.
       */
      &brw_curbe_offsets,
      &brw_recalculate_urb_fence,

      &genX(cc_vp),
      &genX(color_calc_state),

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_wm_pull_constants,
      &brw_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,
      &brw_vs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),

      /* These set up state for brw_psp_urb_cbs */
      &genX(wm_state),
      &genX(sf_clip_viewport),
      &genX(sf_state),
      &genX(vs_state), /* always required, enabled or not */
      &genX(clip_state),
      &genX(gs_state),

      /* Command packets:
       */
      &brw_binding_table_pointers,
      &genX(blend_constant_color),

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &brw_psp_urb_cbs,

      &genX(drawing_rect),
      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),

      &brw_constant_buffer
   };
#elif GFX_VER == 6
   /* Render atoms for Gfx6. */
   static const struct brw_tracked_state *render_atoms[] =
   {
      &genX(vf_statistics),

      &genX(sf_clip_viewport),

      /* Command packets: */

      &genX(cc_vp),

      &gfx6_urb,
      &genX(blend_state),         /* must do before cc unit */
      &genX(color_calc_state),    /* must do before cc unit */
      &genX(depth_stencil_state), /* must do before cc unit */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_state */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gfx6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,
      &gfx6_sol_surface,
      &brw_vs_binding_table,
      &gfx6_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(gs_samplers),
      &gfx6_sampler_state,
      &genX(multisample_state),

      &genX(vs_state),
      &genX(gs_state),
      &genX(clip_state),
      &genX(sf_state),
      &genX(wm_state),

      &genX(scissor_state),

      &gfx6_binding_table_pointers,

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),
   };
#elif GFX_VER == 7
   /* Render atoms for Gfx7 and Gfx7.5; the GFX_VERx10 == 75 conditionals
    * below select the Haswell-only differences.
    */
   static const struct brw_tracked_state *render_atoms[] =
   {
      &genX(vf_statistics),

      /* Command packets: */

      &genX(cc_vp),
      &genX(sf_clip_viewport),

      &gfx7_l3_state,
      &gfx7_push_constant_space,
      &gfx7_urb,
#if GFX_VERx10 == 75
      &genX(cc_and_blend_state),
#else
      &genX(blend_state),         /* must do before cc unit */
      &genX(color_calc_state),    /* must do before cc unit */
#endif
      &genX(depth_stencil_state), /* must do before cc unit */

      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(tcs_push_constants),
      &genX(tes_push_constants),
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_tcs_pull_constants,
      &brw_tcs_ubo_surfaces,
      &brw_tes_pull_constants,
      &brw_tes_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gfx6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,

      &genX(push_constant_packets),

      &brw_vs_binding_table,
      &brw_tcs_binding_table,
      &brw_tes_binding_table,
      &brw_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(tcs_samplers),
      &genX(tes_samplers),
      &genX(gs_samplers),
      &genX(multisample_state),

      &genX(vs_state),
      &genX(hs_state),
      &genX(te_state),
      &genX(ds_state),
      &genX(gs_state),
      &genX(sol_state),
      &genX(clip_state),
      &genX(sbe_state),
      &genX(sf_state),
      &genX(wm_state),
      &genX(ps_state),

      &genX(scissor_state),

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &brw_indices, /* must come before brw_vertices */
      &genX(index_buffer),
      &genX(vertices),

#if GFX_VERx10 == 75
      &genX(cut_index),
#endif
   };
#elif GFX_VER >= 8
   /* Render atoms for Gfx8 and later. */
   static const struct brw_tracked_state *render_atoms[] =
   {
      &genX(vf_statistics),

      &genX(cc_vp),
      &genX(sf_clip_viewport),

      &gfx7_l3_state,
      &gfx7_push_constant_space,
      &gfx7_urb,
      &genX(blend_state),
      &genX(color_calc_state),

      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(tcs_push_constants),
      &genX(tes_push_constants),
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_tcs_pull_constants,
      &brw_tcs_ubo_surfaces,
      &brw_tes_pull_constants,
      &brw_tes_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gfx6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,

      &genX(push_constant_packets),

      &brw_vs_binding_table,
      &brw_tcs_binding_table,
      &brw_tes_binding_table,
      &brw_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(tcs_samplers),
      &genX(tes_samplers),
      &genX(gs_samplers),
      &genX(multisample_state),

      &genX(vs_state),
      &genX(hs_state),
      &genX(te_state),
      &genX(ds_state),
      &genX(gs_state),
      &genX(sol_state),
      &genX(clip_state),
      &genX(raster_state),
      &genX(sbe_state),
      &genX(sf_state),
      &genX(ps_blend),
      &genX(ps_extra),
      &genX(ps_state),
      &genX(depth_stencil_state),
      &genX(wm_state),

      &genX(scissor_state),

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &genX(vf_topology),

      &brw_indices,
      &genX(index_buffer),
      &genX(vertices),

      &genX(cut_index),
      &gfx8_pma_fix,
   };
#endif

   /* The selected table must fit in the context's render atom array. */
   STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms));
   brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
                           render_atoms, ARRAY_SIZE(render_atoms));

#if GFX_VER >= 7
   /* Compute pipeline atoms; only registered on Gfx7+. */
   static const struct brw_tracked_state *compute_atoms[] =
   {
      &gfx7_l3_state,
      &brw_cs_image_surfaces,
      &genX(cs_push_constants),
      &genX(cs_pull_constants),
      &brw_cs_ubo_surfaces,
      &brw_cs_texture_surfaces,
      &brw_cs_work_groups_surface,
      &genX(cs_samplers),
      &genX(cs_state),
   };

   /* Likewise, the compute table must fit in the context's array. */
   STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
   brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
                           compute_atoms, ARRAY_SIZE(compute_atoms));

   /* Hook up the generation-specific emit helpers. */
   brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
   brw->vtbl.emit_compute_walker = genX(emit_gpgpu_walker);
#endif

   /* Sanity check: the device's generation must match the one this
    * translation unit was compiled for.
    */
   assert(brw->screen->devinfo.verx10 == GFX_VERx10);
}
5923