1/*
2 * Copyright © 2017 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include <assert.h>
25
26#include "dev/gen_device_info.h"
27#include "common/gen_sample_positions.h"
28#include "genxml/gen_macros.h"
29
30#include "main/bufferobj.h"
31#include "main/context.h"
32#include "main/enums.h"
33#include "main/macros.h"
34#include "main/state.h"
35
36#include "genX_boilerplate.h"
37
38#include "brw_context.h"
39#include "brw_draw.h"
40#include "brw_multisample_state.h"
41#include "brw_state.h"
42#include "brw_wm.h"
43#include "brw_util.h"
44
45#include "intel_batchbuffer.h"
46#include "intel_buffer_objects.h"
47#include "intel_fbo.h"
48
49#include "main/enums.h"
50#include "main/fbobject.h"
51#include "main/framebuffer.h"
52#include "main/glformats.h"
53#include "main/samplerobj.h"
54#include "main/shaderapi.h"
55#include "main/stencil.h"
56#include "main/transformfeedback.h"
57#include "main/varray.h"
58#include "main/viewport.h"
59#include "util/half_float.h"
60
#if GEN_GEN == 4
/* Kernel start pointer: on Gen4 program offsets are relocations into the
 * shader cache BO (brw->cache.bo), so a full brw_address is returned.
 */
static struct brw_address
KSP(struct brw_context *brw, uint32_t offset)
{
   return ro_bo(brw->cache.bo, offset);
}
#else
/* Kernel start pointer: on Gen5+ a plain 32-bit offset suffices. */
static uint32_t
KSP(UNUSED struct brw_context *brw, uint32_t offset)
{
   return offset;
}
#endif
74
#if GEN_GEN >= 7
/* Emit MI_LOAD_REGISTER_MEM: load the MMIO register at `reg` from the
 * memory location `addr` (Gen7+ only).
 */
MAYBE_UNUSED static void
emit_lrm(struct brw_context *brw, uint32_t reg, struct brw_address addr)
{
   brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_MEM), lrm) {
      lrm.RegisterAddress  = reg;
      lrm.MemoryAddress    = addr;
   }
}
#endif
85
/* Emit MI_LOAD_REGISTER_IMM: write the immediate `imm` into the MMIO
 * register at `reg`.
 */
MAYBE_UNUSED static void
emit_lri(struct brw_context *brw, uint32_t reg, uint32_t imm)
{
   brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_IMM), lri) {
      lri.RegisterOffset   = reg;
      lri.DataDWord        = imm;
   }
}
94
#if GEN_IS_HASWELL || GEN_GEN >= 8
/* Emit MI_LOAD_REGISTER_REG: copy the MMIO register `src` into `dst`
 * (available on HSW and Gen8+).
 */
MAYBE_UNUSED static void
emit_lrr(struct brw_context *brw, uint32_t dst, uint32_t src)
{
   brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_REG), lrr) {
      lrr.SourceRegisterAddress        = src;
      lrr.DestinationRegisterAddress   = dst;
   }
}
#endif
105
106/**
107 * Polygon stipple packet
108 */
109static void
110genX(upload_polygon_stipple)(struct brw_context *brw)
111{
112   struct gl_context *ctx = &brw->ctx;
113
114   /* _NEW_POLYGON */
115   if (!ctx->Polygon.StippleFlag)
116      return;
117
118   brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) {
119      /* Polygon stipple is provided in OpenGL order, i.e. bottom
120       * row first.  If we're rendering to a window (i.e. the
121       * default frame buffer object, 0), then we need to invert
122       * it to match our pixel layout.  But if we're rendering
123       * to a FBO (i.e. any named frame buffer object), we *don't*
124       * need to invert - we already match the layout.
125       */
126      if (ctx->DrawBuffer->FlipY) {
127         for (unsigned i = 0; i < 32; i++)
128            poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */
129      } else {
130         for (unsigned i = 0; i < 32; i++)
131            poly.PatternRow[i] = ctx->PolygonStipple[i];
132      }
133   }
134}
135
/* State atom: re-emits the polygon stipple pattern when the polygon state,
 * the stipple pattern, or the context changes.
 */
static const struct brw_tracked_state genX(polygon_stipple) = {
   .dirty = {
      .mesa = _NEW_POLYGON |
              _NEW_POLYGONSTIPPLE,
      .brw = BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_polygon_stipple),
};
144
145/**
146 * Polygon stipple offset packet
147 */
148static void
149genX(upload_polygon_stipple_offset)(struct brw_context *brw)
150{
151   struct gl_context *ctx = &brw->ctx;
152
153   /* _NEW_POLYGON */
154   if (!ctx->Polygon.StippleFlag)
155      return;
156
157   brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) {
158      /* _NEW_BUFFERS
159       *
160       * If we're drawing to a system window we have to invert the Y axis
161       * in order to match the OpenGL pixel coordinate system, and our
162       * offset must be matched to the window position.  If we're drawing
163       * to a user-created FBO then our native pixel coordinate system
164       * works just fine, and there's no window system to worry about.
165       */
166      if (ctx->DrawBuffer->FlipY) {
167         poly.PolygonStippleYOffset =
168            (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31;
169      }
170   }
171}
172
/* State atom: re-emits the stipple origin whenever the framebuffer (size /
 * window-vs-FBO) or polygon state changes, or on a new context.
 */
static const struct brw_tracked_state genX(polygon_stipple_offset) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_POLYGON,
      .brw = BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_polygon_stipple_offset),
};
181
182/**
183 * Line stipple packet
184 */
185static void
186genX(upload_line_stipple)(struct brw_context *brw)
187{
188   struct gl_context *ctx = &brw->ctx;
189
190   if (!ctx->Line.StippleFlag)
191      return;
192
193   brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) {
194      line.LineStipplePattern = ctx->Line.StipplePattern;
195
196      line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor;
197      line.LineStippleRepeatCount = ctx->Line.StippleFactor;
198   }
199}
200
/* State atom: re-emits line stipple state on line-state or context changes. */
static const struct brw_tracked_state genX(line_stipple) = {
   .dirty = {
      .mesa = _NEW_LINE,
      .brw = BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_line_stipple),
};
208
209/* Constant single cliprect for framebuffer object or DRI2 drawing */
210static void
211genX(upload_drawing_rect)(struct brw_context *brw)
212{
213   struct gl_context *ctx = &brw->ctx;
214   const struct gl_framebuffer *fb = ctx->DrawBuffer;
215   const unsigned int fb_width = _mesa_geometric_width(fb);
216   const unsigned int fb_height = _mesa_geometric_height(fb);
217
218   brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) {
219      rect.ClippedDrawingRectangleXMax = fb_width - 1;
220      rect.ClippedDrawingRectangleYMax = fb_height - 1;
221   }
222}
223
/* State atom: re-emits the drawing rectangle when the framebuffer changes,
 * after BLORP clobbers state, or on a new context.
 */
static const struct brw_tracked_state genX(drawing_rect) = {
   .dirty = {
      .mesa = _NEW_BUFFERS,
      .brw = BRW_NEW_BLORP |
             BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_drawing_rect),
};
232
/**
 * Pack one VERTEX_BUFFER_STATE for slot @buffer_nr into the open
 * 3DSTATE_VERTEX_BUFFERS packet at @dw and return the pointer just past it.
 *
 * @end_offset and @step_rate are only consumed on some generations, hence
 * MAYBE_UNUSED.
 */
static uint32_t *
genX(emit_vertex_buffer_state)(struct brw_context *brw,
                               uint32_t *dw,
                               unsigned buffer_nr,
                               struct brw_bo *bo,
                               unsigned start_offset,
                               MAYBE_UNUSED unsigned end_offset,
                               unsigned stride,
                               MAYBE_UNUSED unsigned step_rate)
{
   struct GENX(VERTEX_BUFFER_STATE) buf_state = {
      .VertexBufferIndex = buffer_nr,
      .BufferPitch = stride,

      /* The VF cache designers apparently cut corners, and made the cache
       * only consider the bottom 32 bits of memory addresses.  If you happen
       * to have two vertex buffers which get placed exactly 4 GiB apart and
       * use them in back-to-back draw calls, you can get collisions.  To work
       * around this problem, we restrict vertex buffers to the low 32 bits of
       * the address space.
       */
      .BufferStartingAddress = ro_32_bo(bo, start_offset),
#if GEN_GEN >= 8
      /* Gen8+ takes an explicit size instead of an end address. */
      .BufferSize = end_offset - start_offset,
#endif

#if GEN_GEN >= 7
      .AddressModifyEnable = true,
#endif

#if GEN_GEN < 8
      .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA,
      .InstanceDataStepRate = step_rate,
#if GEN_GEN >= 5
      /* EndAddress is inclusive, hence the -1. */
      .EndAddress = ro_bo(bo, end_offset - 1),
#endif
#endif

      /* Per-generation write-back cacheable MOCS value. */
#if GEN_GEN == 11
      .MOCS = ICL_MOCS_WB,
#elif GEN_GEN == 10
      .MOCS = CNL_MOCS_WB,
#elif GEN_GEN == 9
      .MOCS = SKL_MOCS_WB,
#elif GEN_GEN == 8
      .MOCS = BDW_MOCS_WB,
#elif GEN_GEN == 7
      .MOCS = GEN7_MOCS_L3,
#endif
   };

   GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state);
   return dw + GENX(VERTEX_BUFFER_STATE_length);
}
287
288UNUSED static bool
289is_passthru_format(uint32_t format)
290{
291   switch (format) {
292   case ISL_FORMAT_R64_PASSTHRU:
293   case ISL_FORMAT_R64G64_PASSTHRU:
294   case ISL_FORMAT_R64G64B64_PASSTHRU:
295   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
296      return true;
297   default:
298      return false;
299   }
300}
301
302UNUSED static int
303uploads_needed(uint32_t format,
304               bool is_dual_slot)
305{
306   if (!is_passthru_format(format))
307      return 1;
308
309   if (is_dual_slot)
310      return 2;
311
312   switch (format) {
313   case ISL_FORMAT_R64_PASSTHRU:
314   case ISL_FORMAT_R64G64_PASSTHRU:
315      return 1;
316   case ISL_FORMAT_R64G64B64_PASSTHRU:
317   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
318      return 2;
319   default:
320      unreachable("not reached");
321   }
322}
323
324/*
325 * Returns the format that we are finally going to use when upload a vertex
326 * element. It will only change if we are using *64*PASSTHRU formats, as for
327 * gen < 8 they need to be splitted on two *32*FLOAT formats.
328 *
329 * @upload points in which upload we are. Valid values are [0,1]
330 */
331static uint32_t
332downsize_format_if_needed(uint32_t format,
333                          int upload)
334{
335   assert(upload == 0 || upload == 1);
336
337   if (!is_passthru_format(format))
338      return format;
339
340   /* ISL_FORMAT_R64_PASSTHRU and ISL_FORMAT_R64G64_PASSTHRU with an upload ==
341    * 1 means that we have been forced to do 2 uploads for a size <= 2. This
342    * happens with gen < 8 and dvec3 or dvec4 vertex shader input
343    * variables. In those cases, we return ISL_FORMAT_R32_FLOAT as a way of
344    * flagging that we want to fill with zeroes this second forced upload.
345    */
346   switch (format) {
347   case ISL_FORMAT_R64_PASSTHRU:
348      return upload == 0 ? ISL_FORMAT_R32G32_FLOAT
349                         : ISL_FORMAT_R32_FLOAT;
350   case ISL_FORMAT_R64G64_PASSTHRU:
351      return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
352                         : ISL_FORMAT_R32_FLOAT;
353   case ISL_FORMAT_R64G64B64_PASSTHRU:
354      return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT
355                         : ISL_FORMAT_R32G32_FLOAT;
356   case ISL_FORMAT_R64G64B64A64_PASSTHRU:
357      return ISL_FORMAT_R32G32B32A32_FLOAT;
358   default:
359      unreachable("not reached");
360   }
361}
362
363/*
364 * Returns the number of componentes associated with a format that is used on
365 * a 64 to 32 format split. See downsize_format()
366 */
367static int
368upload_format_size(uint32_t upload_format)
369{
370   switch (upload_format) {
371   case ISL_FORMAT_R32_FLOAT:
372
373      /* downsized_format has returned this one in order to flag that we are
374       * performing a second upload which we want to have filled with
375       * zeroes. This happens with gen < 8, a size <= 2, and dvec3 or dvec4
376       * vertex shader input variables.
377       */
378
379      return 0;
380   case ISL_FORMAT_R32G32_FLOAT:
381      return 2;
382   case ISL_FORMAT_R32G32B32A32_FLOAT:
383      return 4;
384   default:
385      unreachable("not reached");
386   }
387}
388
/* Bits [48:32] of a softpinned BO's GTT address; 0 for relocated BOs,
 * whose final address is unknown (the kernel keeps them under 4 GiB).
 */
static UNUSED uint16_t
pinned_bo_high_bits(struct brw_bo *bo)
{
   return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0;
}
394
/* The VF cache designers apparently cut corners, and made the cache key's
 * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits
 * of the address.  If you happen to have two vertex buffers which get placed
 * exactly 4 GiB apart and use them in back-to-back draw calls, you can get
 * collisions.  (These collisions can happen within a single batch.)
 *
 * In the soft-pin world, we'd like to assign addresses up front, and never
 * move buffers.  So, we need to do a VF cache invalidate if the buffer for
 * a particular VB slot has different [48:32] address bits than the last one.
 *
 * In the relocation world, we have no idea what the addresses will be, so
 * we can't apply this workaround.  Instead, we tell the kernel to move it
 * to the low 4GB regardless.
 *
 * This HW issue is gone on Gen11+.
 */
static void
vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw)
{
#if GEN_GEN >= 8 && GEN_GEN < 11
   bool need_invalidate = false;

   /* Regular vertex buffer slots. */
   for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
      uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo);

      if (high_bits != brw->vb.last_bo_high_bits[i]) {
         need_invalidate = true;
         brw->vb.last_bo_high_bits[i] = high_bits;
      }
   }

   /* The draw-parameters buffer occupies slot nr_buffers... */
   if (brw->draw.draw_params_bo) {
      uint16_t high_bits = pinned_bo_high_bits(brw->draw.draw_params_bo);

      if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers] != high_bits) {
         need_invalidate = true;
         brw->vb.last_bo_high_bits[brw->vb.nr_buffers] = high_bits;
      }
   }

   /* ...and the derived-draw-parameters buffer slot nr_buffers + 1. */
   if (brw->draw.derived_draw_params_bo) {
      uint16_t high_bits = pinned_bo_high_bits(brw->draw.derived_draw_params_bo);

      if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] != high_bits) {
         need_invalidate = true;
         brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] = high_bits;
      }
   }

   if (need_invalidate) {
      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL);
   }
#endif
}
449
/* Same 32-bit VF cache aliasing workaround as above, but for the single
 * index buffer slot.  See the comment on
 * vf_invalidate_for_vb_48bit_transitions().
 */
static void
vf_invalidate_for_ib_48bit_transition(struct brw_context *brw)
{
#if GEN_GEN >= 8
   uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo);

   if (high_bits != brw->ib.last_bo_high_bits) {
      brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE);
      brw->ib.last_bo_high_bits = high_bits;
   }
#endif
}
462
463static void
464genX(emit_vertices)(struct brw_context *brw)
465{
466   const struct gen_device_info *devinfo = &brw->screen->devinfo;
467   uint32_t *dw;
468
469   brw_prepare_vertices(brw);
470   brw_prepare_shader_draw_parameters(brw);
471
472#if GEN_GEN < 6
473   brw_emit_query_begin(brw);
474#endif
475
476   const struct brw_vs_prog_data *vs_prog_data =
477      brw_vs_prog_data(brw->vs.base.prog_data);
478
479#if GEN_GEN >= 8
480   struct gl_context *ctx = &brw->ctx;
481   const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL ||
482                                ctx->Polygon.BackMode != GL_FILL);
483
484   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) {
485      unsigned vue = brw->vb.nr_enabled;
486
487      /* The element for the edge flags must always be last, so we have to
488       * insert the SGVS before it in that case.
489       */
490      if (uses_edge_flag) {
491         assert(vue > 0);
492         vue--;
493      }
494
495      WARN_ONCE(vue >= 33,
496                "Trying to insert VID/IID past 33rd vertex element, "
497                "need to reorder the vertex attrbutes.");
498
499      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) {
500         if (vs_prog_data->uses_vertexid) {
501            vfs.VertexIDEnable = true;
502            vfs.VertexIDComponentNumber = 2;
503            vfs.VertexIDElementOffset = vue;
504         }
505
506         if (vs_prog_data->uses_instanceid) {
507            vfs.InstanceIDEnable = true;
508            vfs.InstanceIDComponentNumber = 3;
509            vfs.InstanceIDElementOffset = vue;
510         }
511      }
512
513      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
514         vfi.InstancingEnable = true;
515         vfi.VertexElementIndex = vue;
516      }
517   } else {
518      brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs);
519   }
520#endif
521
522   const bool uses_draw_params =
523      vs_prog_data->uses_firstvertex ||
524      vs_prog_data->uses_baseinstance;
525
526   const bool uses_derived_draw_params =
527      vs_prog_data->uses_drawid ||
528      vs_prog_data->uses_is_indexed_draw;
529
530   const bool needs_sgvs_element = (uses_draw_params ||
531                                    vs_prog_data->uses_instanceid ||
532                                    vs_prog_data->uses_vertexid);
533
534   unsigned nr_elements =
535      brw->vb.nr_enabled + needs_sgvs_element + uses_derived_draw_params;
536
537#if GEN_GEN < 8
538   /* If any of the formats of vb.enabled needs more that one upload, we need
539    * to add it to nr_elements
540    */
541   for (unsigned i = 0; i < brw->vb.nr_enabled; i++) {
542      struct brw_vertex_element *input = brw->vb.enabled[i];
543      const struct gl_array_attributes *glattrib = input->glattrib;
544      uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
545
546      if (uploads_needed(format, input->is_dual_slot) > 1)
547         nr_elements++;
548   }
549#endif
550
551   /* If the VS doesn't read any inputs (calculating vertex position from
552    * a state variable for some reason, for example), emit a single pad
553    * VERTEX_ELEMENT struct and bail.
554    *
555    * The stale VB state stays in place, but they don't do anything unless
556    * a VE loads from them.
557    */
558   if (nr_elements == 0) {
559      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
560                           1 + GENX(VERTEX_ELEMENT_STATE_length));
561      struct GENX(VERTEX_ELEMENT_STATE) elem = {
562         .Valid = true,
563         .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT,
564         .Component0Control = VFCOMP_STORE_0,
565         .Component1Control = VFCOMP_STORE_0,
566         .Component2Control = VFCOMP_STORE_0,
567         .Component3Control = VFCOMP_STORE_1_FP,
568      };
569      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem);
570      return;
571   }
572
573   /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */
574   const unsigned nr_buffers = brw->vb.nr_buffers +
575      uses_draw_params + uses_derived_draw_params;
576
577   vf_invalidate_for_vb_48bit_transitions(brw);
578
579   if (nr_buffers) {
580      assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17));
581
582      dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS),
583                           1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers);
584
585      for (unsigned i = 0; i < brw->vb.nr_buffers; i++) {
586         const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
587         /* Prior to Haswell and Bay Trail we have to use 4-component formats
588          * to fake 3-component ones.  In particular, we do this for
589          * half-float and 8 and 16-bit integer formats.  This means that the
590          * vertex element may poke over the end of the buffer by 2 bytes.
591          */
592         const unsigned padding =
593            (GEN_GEN <= 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail) * 2;
594         const unsigned end = buffer->offset + buffer->size + padding;
595         dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo,
596                                             buffer->offset,
597                                             end,
598                                             buffer->stride,
599                                             buffer->step_rate);
600      }
601
602      if (uses_draw_params) {
603         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers,
604                                             brw->draw.draw_params_bo,
605                                             brw->draw.draw_params_offset,
606                                             brw->draw.draw_params_bo->size,
607                                             0 /* stride */,
608                                             0 /* step rate */);
609      }
610
611      if (uses_derived_draw_params) {
612         dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1,
613                                             brw->draw.derived_draw_params_bo,
614                                             brw->draw.derived_draw_params_offset,
615                                             brw->draw.derived_draw_params_bo->size,
616                                             0 /* stride */,
617                                             0 /* step rate */);
618      }
619   }
620
621   /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS,
622    * presumably for VertexID/InstanceID.
623    */
624#if GEN_GEN >= 6
625   assert(nr_elements <= 34);
626   const struct brw_vertex_element *gen6_edgeflag_input = NULL;
627#else
628   assert(nr_elements <= 18);
629#endif
630
631   dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS),
632                        1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements);
633   unsigned i;
634   for (i = 0; i < brw->vb.nr_enabled; i++) {
635      const struct brw_vertex_element *input = brw->vb.enabled[i];
636      const struct gl_array_attributes *glattrib = input->glattrib;
637      uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
638      uint32_t comp0 = VFCOMP_STORE_SRC;
639      uint32_t comp1 = VFCOMP_STORE_SRC;
640      uint32_t comp2 = VFCOMP_STORE_SRC;
641      uint32_t comp3 = VFCOMP_STORE_SRC;
642      const unsigned num_uploads = GEN_GEN < 8 ?
643         uploads_needed(format, input->is_dual_slot) : 1;
644
645#if GEN_GEN >= 8
646      /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE):
647       * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an
648       * element which has edge flag enabled."
649       */
650      assert(!(is_passthru_format(format) && uses_edge_flag));
651#endif
652
653      /* The gen4 driver expects edgeflag to come in as a float, and passes
654       * that float on to the tests in the clipper.  Mesa's current vertex
655       * attribute value for EdgeFlag is stored as a float, which works out.
656       * glEdgeFlagPointer, on the other hand, gives us an unnormalized
657       * integer ubyte.  Just rewrite that to convert to a float.
658       *
659       * Gen6+ passes edgeflag as sideband along with the vertex, instead
660       * of in the VUE.  We have to upload it sideband as the last vertex
661       * element according to the B-Spec.
662       */
663#if GEN_GEN >= 6
664      if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) {
665         gen6_edgeflag_input = input;
666         continue;
667      }
668#endif
669
670      for (unsigned c = 0; c < num_uploads; c++) {
671         const uint32_t upload_format = GEN_GEN >= 8 ? format :
672            downsize_format_if_needed(format, c);
673         /* If we need more that one upload, the offset stride would be 128
674          * bits (16 bytes), as for previous uploads we are using the full
675          * entry. */
676         const unsigned offset = input->offset + c * 16;
677
678         const struct gl_array_attributes *glattrib = input->glattrib;
679         const int size = (GEN_GEN < 8 && is_passthru_format(format)) ?
680            upload_format_size(upload_format) : glattrib->Format.Size;
681
682         switch (size) {
683            case 0: comp0 = VFCOMP_STORE_0;
684            case 1: comp1 = VFCOMP_STORE_0;
685            case 2: comp2 = VFCOMP_STORE_0;
686            case 3:
687               if (GEN_GEN >= 8 && glattrib->Format.Doubles) {
688                  comp3 = VFCOMP_STORE_0;
689               } else if (glattrib->Format.Integer) {
690                  comp3 = VFCOMP_STORE_1_INT;
691               } else {
692                  comp3 = VFCOMP_STORE_1_FP;
693               }
694
695               break;
696         }
697
698#if GEN_GEN >= 8
699         /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE):
700          *
701          *     "When SourceElementFormat is set to one of the *64*_PASSTHRU
702          *     formats, 64-bit components are stored in the URB without any
703          *     conversion. In this case, vertex elements must be written as 128
704          *     or 256 bits, with VFCOMP_STORE_0 being used to pad the output as
705          *     required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red
706          *     component into the URB, Component 1 must be specified as
707          *     VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in
708          *     order to output a 128-bit vertex element, or Components 1-3 must
709          *     be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex
710          *     element. Likewise, use of R64G64B64_PASSTHRU requires Component 3
711          *     to be specified as VFCOMP_STORE_0 in order to output a 256-bit
712          *     vertex element."
713          */
714         if (glattrib->Format.Doubles && !input->is_dual_slot) {
715            /* Store vertex elements which correspond to double and dvec2 vertex
716             * shader inputs as 128-bit vertex elements, instead of 256-bits.
717             */
718            comp2 = VFCOMP_NOSTORE;
719            comp3 = VFCOMP_NOSTORE;
720         }
721#endif
722
723         struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
724            .VertexBufferIndex = input->buffer,
725            .Valid = true,
726            .SourceElementFormat = upload_format,
727            .SourceElementOffset = offset,
728            .Component0Control = comp0,
729            .Component1Control = comp1,
730            .Component2Control = comp2,
731            .Component3Control = comp3,
732#if GEN_GEN < 5
733            .DestinationElementOffset = i * 4,
734#endif
735         };
736
737         GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
738         dw += GENX(VERTEX_ELEMENT_STATE_length);
739      }
740   }
741
742   if (needs_sgvs_element) {
743      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
744         .Valid = true,
745         .Component0Control = VFCOMP_STORE_0,
746         .Component1Control = VFCOMP_STORE_0,
747         .Component2Control = VFCOMP_STORE_0,
748         .Component3Control = VFCOMP_STORE_0,
749#if GEN_GEN < 5
750         .DestinationElementOffset = i * 4,
751#endif
752      };
753
754#if GEN_GEN >= 8
755      if (uses_draw_params) {
756         elem_state.VertexBufferIndex = brw->vb.nr_buffers;
757         elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
758         elem_state.Component0Control = VFCOMP_STORE_SRC;
759         elem_state.Component1Control = VFCOMP_STORE_SRC;
760      }
761#else
762      elem_state.VertexBufferIndex = brw->vb.nr_buffers;
763      elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT;
764      if (uses_draw_params) {
765         elem_state.Component0Control = VFCOMP_STORE_SRC;
766         elem_state.Component1Control = VFCOMP_STORE_SRC;
767      }
768
769      if (vs_prog_data->uses_vertexid)
770         elem_state.Component2Control = VFCOMP_STORE_VID;
771
772      if (vs_prog_data->uses_instanceid)
773         elem_state.Component3Control = VFCOMP_STORE_IID;
774#endif
775
776      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
777      dw += GENX(VERTEX_ELEMENT_STATE_length);
778   }
779
780   if (uses_derived_draw_params) {
781      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
782         .Valid = true,
783         .VertexBufferIndex = brw->vb.nr_buffers + 1,
784         .SourceElementFormat = ISL_FORMAT_R32G32_UINT,
785         .Component0Control = VFCOMP_STORE_SRC,
786         .Component1Control = VFCOMP_STORE_SRC,
787         .Component2Control = VFCOMP_STORE_0,
788         .Component3Control = VFCOMP_STORE_0,
789#if GEN_GEN < 5
790         .DestinationElementOffset = i * 4,
791#endif
792      };
793
794      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
795      dw += GENX(VERTEX_ELEMENT_STATE_length);
796   }
797
798#if GEN_GEN >= 6
799   if (gen6_edgeflag_input) {
800      const struct gl_array_attributes *glattrib = gen6_edgeflag_input->glattrib;
801      const uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format);
802
803      struct GENX(VERTEX_ELEMENT_STATE) elem_state = {
804         .Valid = true,
805         .VertexBufferIndex = gen6_edgeflag_input->buffer,
806         .EdgeFlagEnable = true,
807         .SourceElementFormat = format,
808         .SourceElementOffset = gen6_edgeflag_input->offset,
809         .Component0Control = VFCOMP_STORE_SRC,
810         .Component1Control = VFCOMP_STORE_0,
811         .Component2Control = VFCOMP_STORE_0,
812         .Component3Control = VFCOMP_STORE_0,
813      };
814
815      GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state);
816      dw += GENX(VERTEX_ELEMENT_STATE_length);
817   }
818#endif
819
820#if GEN_GEN >= 8
821   for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) {
822      const struct brw_vertex_element *input = brw->vb.enabled[i];
823      const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer];
824      unsigned element_index;
825
826      /* The edge flag element is reordered to be the last one in the code
827       * above so we need to compensate for that in the element indices used
828       * below.
829       */
830      if (input == gen6_edgeflag_input)
831         element_index = nr_elements - 1;
832      else
833         element_index = j++;
834
835      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
836         vfi.VertexElementIndex = element_index;
837         vfi.InstancingEnable = buffer->step_rate != 0;
838         vfi.InstanceDataStepRate = buffer->step_rate;
839      }
840   }
841
842   if (vs_prog_data->uses_drawid) {
843      const unsigned element = brw->vb.nr_enabled + needs_sgvs_element;
844
845      brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) {
846         vfi.VertexElementIndex = element;
847      }
848   }
849#endif
850}
851
/* State atom: re-emits vertex buffer/element state whenever the vertex
 * arrays, the vertex program, or its compiled data change, or after a new
 * batch / BLORP.
 */
static const struct brw_tracked_state genX(vertices) = {
   .dirty = {
      .mesa = _NEW_POLYGON,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_VERTEX_PROGRAM |
             BRW_NEW_VERTICES |
             BRW_NEW_VS_PROG_DATA,
   },
   .emit = genX(emit_vertices),
};
863
/**
 * Emit 3DSTATE_INDEX_BUFFER for the current index buffer, if any.
 */
static void
genX(emit_index_buffer)(struct brw_context *brw)
{
   const struct _mesa_index_buffer *index_buffer = brw->ib.ib;

   /* Non-indexed draw: nothing to emit. */
   if (index_buffer == NULL)
      return;

   vf_invalidate_for_ib_48bit_transition(brw);

   brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) {
#if GEN_GEN < 8 && !GEN_IS_HASWELL
      /* Pre-HSW, cut-index lives in this packet (HSW+ uses 3DSTATE_VF). */
      assert(brw->ib.enable_cut_index == brw->prim_restart.enable_cut_index);
      ib.CutIndexEnable = brw->ib.enable_cut_index;
#endif
      ib.IndexFormat = brw_get_index_type(index_buffer->index_size);

      /* The VF cache designers apparently cut corners, and made the cache
       * only consider the bottom 32 bits of memory addresses.  If you happen
       * to have two index buffers which get placed exactly 4 GiB apart and
       * use them in back-to-back draw calls, you can get collisions.  To work
       * around this problem, we restrict index buffers to the low 32 bits of
       * the address space.
       */
      ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0);
#if GEN_GEN >= 8
      ib.MOCS = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
      ib.BufferSize = brw->ib.size;
#else
      /* Pre-Gen8 takes an inclusive end address instead of a size. */
      ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1);
#endif
   }
}
897
/* Atom for genX(emit_index_buffer).  No GL state bits: the index buffer is
 * tracked purely through driver-internal BRW_NEW_INDEX_BUFFER.
 */
static const struct brw_tracked_state genX(index_buffer) = {
   .dirty = {
      .mesa = 0,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_INDEX_BUFFER,
   },
   .emit = genX(emit_index_buffer),
};
907
908#if GEN_IS_HASWELL || GEN_GEN >= 8
/**
 * Program the primitive-restart "cut index" via 3DSTATE_VF.
 *
 * Only compiled for Haswell and Gen8+ (see the surrounding #if), where the
 * cut-index enable moved out of 3DSTATE_INDEX_BUFFER into this packet.
 */
static void
genX(upload_cut_index)(struct brw_context *brw)
{
   const struct gl_context *ctx = &brw->ctx;

   brw_batch_emit(brw, GENX(3DSTATE_VF), vf) {
      /* _NEW_TRANSFORM: enable only for indexed draws with primitive restart
       * on; otherwise the packet is emitted with restart disabled.
       */
      if (ctx->Array._PrimitiveRestart && brw->ib.ib) {
         vf.IndexedDrawCutIndexEnable = true;
         vf.CutIndex = _mesa_primitive_restart_index(ctx, brw->ib.index_size);
      }
   }
}
921
/* Atom for genX(upload_cut_index).  Non-static: referenced from the atom
 * lists elsewhere in the driver.
 */
const struct brw_tracked_state genX(cut_index) = {
   .dirty = {
      .mesa  = _NEW_TRANSFORM,
      .brw   = BRW_NEW_INDEX_BUFFER,
   },
   .emit = genX(upload_cut_index),
};
929#endif
930
/* Enable vertex-fetch statistics gathering once per context/batch. */
static void
genX(upload_vf_statistics)(struct brw_context *brw)
{
   brw_batch_emit(brw, GENX(3DSTATE_VF_STATISTICS), vf) {
      vf.StatisticsEnable = true;
   }
}
938
/* Atom for genX(upload_vf_statistics).  Re-emitted on new context and after
 * BLORP operations, which may have clobbered the setting.
 */
const struct brw_tracked_state genX(vf_statistics) = {
   .dirty = {
      .mesa  = 0,
      .brw   = BRW_NEW_BLORP | BRW_NEW_CONTEXT,
   },
   .emit = genX(upload_vf_statistics),
};
946
947#if GEN_GEN >= 6
948/**
949 * Determine the appropriate attribute override value to store into the
950 * 3DSTATE_SF structure for a given fragment shader attribute.  The attribute
951 * override value contains two pieces of information: the location of the
952 * attribute in the VUE (relative to urb_entry_read_offset, see below), and a
953 * flag indicating whether to "swizzle" the attribute based on the direction
954 * the triangle is facing.
955 *
956 * If an attribute is "swizzled", then the given VUE location is used for
957 * front-facing triangles, and the VUE location that immediately follows is
958 * used for back-facing triangles.  We use this to implement the mapping from
959 * gl_FrontColor/gl_BackColor to gl_Color.
960 *
961 * urb_entry_read_offset is the offset into the VUE at which the SF unit is
962 * being instructed to begin reading attribute data.  It can be set to a
963 * nonzero value to prevent the SF unit from wasting time reading elements of
964 * the VUE that are not needed by the fragment shader.  It is measured in
965 * 256-bit increments.
966 */
967static void
968genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr,
969                        const struct brw_vue_map *vue_map,
970                        int urb_entry_read_offset, int fs_attr,
971                        bool two_side_color, uint32_t *max_source_attr)
972{
973   /* Find the VUE slot for this attribute. */
974   int slot = vue_map->varying_to_slot[fs_attr];
975
976   /* Viewport and Layer are stored in the VUE header.  We need to override
977    * them to zero if earlier stages didn't write them, as GL requires that
978    * they read back as zero when not explicitly set.
979    */
980   if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) {
981      attr->ComponentOverrideX = true;
982      attr->ComponentOverrideW = true;
983      attr->ConstantSource = CONST_0000;
984
985      if (!(vue_map->slots_valid & VARYING_BIT_LAYER))
986         attr->ComponentOverrideY = true;
987      if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT))
988         attr->ComponentOverrideZ = true;
989
990      return;
991   }
992
993   /* If there was only a back color written but not front, use back
994    * as the color instead of undefined
995    */
996   if (slot == -1 && fs_attr == VARYING_SLOT_COL0)
997      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0];
998   if (slot == -1 && fs_attr == VARYING_SLOT_COL1)
999      slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1];
1000
1001   if (slot == -1) {
1002      /* This attribute does not exist in the VUE--that means that the vertex
1003       * shader did not write to it.  This means that either:
1004       *
1005       * (a) This attribute is a texture coordinate, and it is going to be
1006       * replaced with point coordinates (as a consequence of a call to
1007       * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the
1008       * hardware will ignore whatever attribute override we supply.
1009       *
1010       * (b) This attribute is read by the fragment shader but not written by
1011       * the vertex shader, so its value is undefined.  Therefore the
1012       * attribute override we supply doesn't matter.
1013       *
1014       * (c) This attribute is gl_PrimitiveID, and it wasn't written by the
1015       * previous shader stage.
1016       *
1017       * Note that we don't have to worry about the cases where the attribute
1018       * is gl_PointCoord or is undergoing point sprite coordinate
1019       * replacement, because in those cases, this function isn't called.
1020       *
1021       * In case (c), we need to program the attribute overrides so that the
1022       * primitive ID will be stored in this slot.  In every other case, the
1023       * attribute override we supply doesn't matter.  So just go ahead and
1024       * program primitive ID in every case.
1025       */
1026      attr->ComponentOverrideW = true;
1027      attr->ComponentOverrideX = true;
1028      attr->ComponentOverrideY = true;
1029      attr->ComponentOverrideZ = true;
1030      attr->ConstantSource = PRIM_ID;
1031      return;
1032   }
1033
1034   /* Compute the location of the attribute relative to urb_entry_read_offset.
1035    * Each increment of urb_entry_read_offset represents a 256-bit value, so
1036    * it counts for two 128-bit VUE slots.
1037    */
1038   int source_attr = slot - 2 * urb_entry_read_offset;
1039   assert(source_attr >= 0 && source_attr < 32);
1040
1041   /* If we are doing two-sided color, and the VUE slot following this one
1042    * represents a back-facing color, then we need to instruct the SF unit to
1043    * do back-facing swizzling.
1044    */
1045   bool swizzling = two_side_color &&
1046      ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 &&
1047        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) ||
1048       (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 &&
1049        vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1));
1050
1051   /* Update max_source_attr.  If swizzling, the SF will read this slot + 1. */
1052   if (*max_source_attr < source_attr + swizzling)
1053      *max_source_attr = source_attr + swizzling;
1054
1055   attr->SourceAttribute = source_attr;
1056   if (swizzling)
1057      attr->SwizzleSelect = INPUTATTR_FACING;
1058}
1059
1060
1061static void
1062genX(calculate_attr_overrides)(const struct brw_context *brw,
1063                               struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides,
1064                               uint32_t *point_sprite_enables,
1065                               uint32_t *urb_entry_read_length,
1066                               uint32_t *urb_entry_read_offset)
1067{
1068   const struct gl_context *ctx = &brw->ctx;
1069
1070   /* _NEW_POINT */
1071   const struct gl_point_attrib *point = &ctx->Point;
1072
1073   /* BRW_NEW_FRAGMENT_PROGRAM */
1074   const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1075
1076   /* BRW_NEW_FS_PROG_DATA */
1077   const struct brw_wm_prog_data *wm_prog_data =
1078      brw_wm_prog_data(brw->wm.base.prog_data);
1079   uint32_t max_source_attr = 0;
1080
1081   *point_sprite_enables = 0;
1082
1083   int first_slot =
1084      brw_compute_first_urb_slot_required(fp->info.inputs_read,
1085                                          &brw->vue_map_geom_out);
1086
1087   /* Each URB offset packs two varying slots */
1088   assert(first_slot % 2 == 0);
1089   *urb_entry_read_offset = first_slot / 2;
1090
1091   /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE,
1092    * description of dw10 Point Sprite Texture Coordinate Enable:
1093    *
1094    * "This field must be programmed to zero when non-point primitives
1095    * are rendered."
1096    *
1097    * The SandyBridge PRM doesn't explicitly say that point sprite enables
1098    * must be programmed to zero when rendering non-point primitives, but
1099    * the IvyBridge PRM does, and if we don't, we get garbage.
1100    *
1101    * This is not required on Haswell, as the hardware ignores this state
1102    * when drawing non-points -- although we do still need to be careful to
1103    * correctly set the attr overrides.
1104    *
1105    * _NEW_POLYGON
1106    * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA
1107    */
1108   bool drawing_points = brw_is_drawing_points(brw);
1109
1110   for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1111      int input_index = wm_prog_data->urb_setup[attr];
1112
1113      if (input_index < 0)
1114         continue;
1115
1116      /* _NEW_POINT */
1117      bool point_sprite = false;
1118      if (drawing_points) {
1119         if (point->PointSprite &&
1120             (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) &&
1121             (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) {
1122            point_sprite = true;
1123         }
1124
1125         if (attr == VARYING_SLOT_PNTC)
1126            point_sprite = true;
1127
1128         if (point_sprite)
1129            *point_sprite_enables |= (1 << input_index);
1130      }
1131
1132      /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */
1133      struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 };
1134
1135      if (!point_sprite) {
1136         genX(get_attr_override)(&attribute,
1137                                 &brw->vue_map_geom_out,
1138                                 *urb_entry_read_offset, attr,
1139                                 _mesa_vertex_program_two_side_enabled(ctx),
1140                                 &max_source_attr);
1141      }
1142
1143      /* The hardware can only do the overrides on 16 overrides at a
1144       * time, and the other up to 16 have to be lined up so that the
1145       * input index = the output index.  We'll need to do some
1146       * tweaking to make sure that's the case.
1147       */
1148      if (input_index < 16)
1149         attr_overrides[input_index] = attribute;
1150      else
1151         assert(attribute.SourceAttribute == input_index);
1152   }
1153
1154   /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for
1155    * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length":
1156    *
1157    * "This field should be set to the minimum length required to read the
1158    *  maximum source attribute.  The maximum source attribute is indicated
1159    *  by the maximum value of the enabled Attribute # Source Attribute if
1160    *  Attribute Swizzle Enable is set, Number of Output Attributes-1 if
1161    *  enable is not set.
1162    *  read_length = ceiling((max_source_attr + 1) / 2)
1163    *
1164    *  [errata] Corruption/Hang possible if length programmed larger than
1165    *  recommended"
1166    *
1167    * Similar text exists for Ivy Bridge.
1168    */
1169   *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2);
1170}
1171#endif
1172
1173/* ---------------------------------------------------------------------- */
1174
/* Depth/stencil state lives in a different packet/structure depending on
 * generation: an inline 3DSTATE_WM_DEPTH_STENCIL packet on Gen8+, indirect
 * DEPTH_STENCIL_STATE on Gen6-7, and part of COLOR_CALC_STATE before that.
 * DEPTH_STENCIL_GENXML lets set_depth_stencil_bits() fill whichever applies.
 */
#if GEN_GEN >= 8
typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML;
#elif GEN_GEN >= 6
typedef struct GENX(DEPTH_STENCIL_STATE)      DEPTH_STENCIL_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE)         DEPTH_STENCIL_GENXML;
#endif
1182
1183static inline void
1184set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds)
1185{
1186   struct gl_context *ctx = &brw->ctx;
1187
1188   /* _NEW_BUFFERS */
1189   struct intel_renderbuffer *depth_irb =
1190      intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH);
1191
1192   /* _NEW_DEPTH */
1193   struct gl_depthbuffer_attrib *depth = &ctx->Depth;
1194
1195   /* _NEW_STENCIL */
1196   struct gl_stencil_attrib *stencil = &ctx->Stencil;
1197   const int b = stencil->_BackFace;
1198
1199   if (depth->Test && depth_irb) {
1200      ds->DepthTestEnable = true;
1201      ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw);
1202      ds->DepthTestFunction = intel_translate_compare_func(depth->Func);
1203   }
1204
1205   if (brw->stencil_enabled) {
1206      ds->StencilTestEnable = true;
1207      ds->StencilWriteMask = stencil->WriteMask[0] & 0xff;
1208      ds->StencilTestMask = stencil->ValueMask[0] & 0xff;
1209
1210      ds->StencilTestFunction =
1211         intel_translate_compare_func(stencil->Function[0]);
1212      ds->StencilFailOp =
1213         intel_translate_stencil_op(stencil->FailFunc[0]);
1214      ds->StencilPassDepthPassOp =
1215         intel_translate_stencil_op(stencil->ZPassFunc[0]);
1216      ds->StencilPassDepthFailOp =
1217         intel_translate_stencil_op(stencil->ZFailFunc[0]);
1218
1219      ds->StencilBufferWriteEnable = brw->stencil_write_enabled;
1220
1221      if (brw->stencil_two_sided) {
1222         ds->DoubleSidedStencilEnable = true;
1223         ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff;
1224         ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff;
1225
1226         ds->BackfaceStencilTestFunction =
1227            intel_translate_compare_func(stencil->Function[b]);
1228         ds->BackfaceStencilFailOp =
1229            intel_translate_stencil_op(stencil->FailFunc[b]);
1230         ds->BackfaceStencilPassDepthPassOp =
1231            intel_translate_stencil_op(stencil->ZPassFunc[b]);
1232         ds->BackfaceStencilPassDepthFailOp =
1233            intel_translate_stencil_op(stencil->ZFailFunc[b]);
1234      }
1235
1236#if GEN_GEN <= 5 || GEN_GEN >= 9
1237      ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
1238      ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b);
1239#endif
1240   }
1241}
1242
1243#if GEN_GEN >= 6
/**
 * Upload depth/stencil state.  On Gen8+ it is an inline packet; on Gen6-7 it
 * is indirect state plus a pointer packet whose form differs between Gen6
 * and Gen7.
 */
static void
genX(upload_depth_stencil_state)(struct brw_context *brw)
{
#if GEN_GEN >= 8
   brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) {
      set_depth_stencil_bits(brw, &wmds);
   }
#else
   /* Emit indirect DEPTH_STENCIL_STATE (64-byte aligned). */
   uint32_t ds_offset;
   brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) {
      set_depth_stencil_bits(brw, &ds);
   }

   /* Now upload a pointer to the indirect state */
#if GEN_GEN == 6
   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
      ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
      ptr.DEPTH_STENCIL_STATEChange = true;
   }
#else
   brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) {
      ptr.PointertoDEPTH_STENCIL_STATE = ds_offset;
   }
#endif
#endif
}
1270
/* Atom for genX(upload_depth_stencil_state).  Gen8+ emits inline state, so
 * only a new context forces a re-emit; older gens emit batch-relative
 * indirect state and so must also track batch/state-base-address changes.
 */
static const struct brw_tracked_state genX(depth_stencil_state) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_DEPTH |
              _NEW_STENCIL,
      .brw  = BRW_NEW_BLORP |
              (GEN_GEN >= 8 ? BRW_NEW_CONTEXT
                            : BRW_NEW_BATCH |
                              BRW_NEW_STATE_BASE_ADDRESS),
   },
   .emit = genX(upload_depth_stencil_state),
};
1283#endif
1284
1285/* ---------------------------------------------------------------------- */
1286
1287#if GEN_GEN <= 5
1288
/* Gen4-5: upload fixed-function CLIP unit state (indirect CLIP_STATE with
 * the clip thread's kernel pointer, URB layout, and clip test controls).
 */
static void
genX(upload_clip_state)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
   brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) {
      /* BRW_NEW_PROGRAM_CACHE | BRW_NEW_CLIP_PROG_DATA */
      clip.KernelStartPointer = KSP(brw, brw->clip.prog_offset);
      clip.GRFRegisterCount =
         DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1;
      clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
      clip.SingleProgramFlow = true;
      clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length;
      clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length;

      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
      clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2;
      clip.DispatchGRFStartRegisterForURBData = 1;
      clip.VertexURBEntryReadOffset = 0;

      /* BRW_NEW_URB_FENCE */
      clip.NumberofURBEntries = brw->urb.nr_clip_entries;
      clip.URBEntryAllocationSize = brw->urb.vsize - 1;

      if (brw->urb.nr_clip_entries >= 10) {
         /* Half of the URB entries go to each thread, and it has to be an
          * even number.
          */
         assert(brw->urb.nr_clip_entries % 2 == 0);

         /* Although up to 16 concurrent Clip threads are allowed on Ironlake,
          * only 2 threads can output VUEs at a time.
          */
         clip.MaximumNumberofThreads = (GEN_GEN == 5 ? 16 : 2) - 1;
      } else {
         assert(brw->urb.nr_clip_entries >= 5);
         clip.MaximumNumberofThreads = 1 - 1;
      }

      clip.VertexPositionSpace = VPOS_NDCSPACE;
      clip.UserClipFlagsMustClipEnable = true;
      clip.GuardbandClipTestEnable = true;

      clip.ClipperViewportStatePointer =
         ro_bo(brw->batch.state.bo, brw->clip.vp_offset);

      clip.ScreenSpaceViewportXMin = -1;
      clip.ScreenSpaceViewportXMax = 1;
      clip.ScreenSpaceViewportYMin = -1;
      clip.ScreenSpaceViewportYMax = 1;

      clip.ViewportXYClipTestEnable = true;
      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
                                       ctx->Transform.DepthClampFar);

      /* _NEW_TRANSFORM */
      if (GEN_GEN == 5 || GEN_IS_G4X) {
         clip.UserClipDistanceClipTestEnableBitmask =
            ctx->Transform.ClipPlanesEnabled;
      } else {
         /* Up to 6 actual clip flags, plus the 7th for the negative RHW
          * workaround.
          */
         clip.UserClipDistanceClipTestEnableBitmask =
            (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40;
      }

      if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
         clip.APIMode = APIMODE_D3D;
      else
         clip.APIMode = APIMODE_OGL;

      /* NOTE(review): GuardbandClipTestEnable was already set above; this
       * second assignment is redundant (but harmless).
       */
      clip.GuardbandClipTestEnable = true;

      clip.ClipMode = brw->clip.prog_data->clip_mode;

#if GEN_IS_G4X
      clip.NegativeWClipTestEnable = true;
#endif
   }
}
1370
/* Atom for the Gen4-5 clip state above.  Non-static: referenced from the
 * generation-specific atom lists.
 */
const struct brw_tracked_state genX(clip_state) = {
   .dirty = {
      .mesa  = _NEW_TRANSFORM |
               _NEW_VIEWPORT,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_CLIP_PROG_DATA |
               BRW_NEW_PUSH_CONSTANT_ALLOCATION |
               BRW_NEW_PROGRAM_CACHE |
               BRW_NEW_URB_FENCE,
   },
   .emit = genX(upload_clip_state),
};
1384
1385#else
1386
/* Gen6+: emit the 3DSTATE_CLIP packet configuring the clip stage (clip
 * modes, provoking vertex, user clip distances, viewport index limits).
 */
static void
genX(upload_clip_state)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   /* _NEW_BUFFERS */
   struct gl_framebuffer *fb = ctx->DrawBuffer;

   /* BRW_NEW_FS_PROG_DATA */
   struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(brw->wm.base.prog_data);

   brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) {
      /* BRW_NEW_META_IN_PROGRESS: don't count meta operations in stats. */
      clip.StatisticsEnable = !brw->meta_in_progress;

      if (wm_prog_data->barycentric_interp_modes &
          BRW_BARYCENTRIC_NONPERSPECTIVE_BITS)
         clip.NonPerspectiveBarycentricEnable = true;

#if GEN_GEN >= 7
      clip.EarlyCullEnable = true;
#endif

#if GEN_GEN == 7
      /* _NEW_POLYGON | _NEW_BUFFERS: winding flips with FBO orientation. */
      clip.FrontWinding = brw->polygon_front_bit != fb->FlipY;

      if (ctx->Polygon.CullFlag) {
         switch (ctx->Polygon.CullFaceMode) {
         case GL_FRONT:
            clip.CullMode = CULLMODE_FRONT;
            break;
         case GL_BACK:
            clip.CullMode = CULLMODE_BACK;
            break;
         case GL_FRONT_AND_BACK:
            clip.CullMode = CULLMODE_BOTH;
            break;
         default:
            unreachable("Should not get here: invalid CullFlag");
         }
      } else {
         clip.CullMode = CULLMODE_NONE;
      }
#endif

#if GEN_GEN < 8
      clip.UserClipDistanceCullTestEnableBitmask =
         brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask;

      clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear &&
                                       ctx->Transform.DepthClampFar);
#endif

      /* _NEW_LIGHT */
      if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) {
         clip.TriangleStripListProvokingVertexSelect = 0;
         clip.TriangleFanProvokingVertexSelect = 1;
         clip.LineStripListProvokingVertexSelect = 0;
      } else {
         clip.TriangleStripListProvokingVertexSelect = 2;
         clip.TriangleFanProvokingVertexSelect = 2;
         clip.LineStripListProvokingVertexSelect = 1;
      }

      /* _NEW_TRANSFORM */
      clip.UserClipDistanceClipTestEnableBitmask =
         ctx->Transform.ClipPlanesEnabled;

#if GEN_GEN >= 8
      clip.ForceUserClipDistanceClipTestEnableBitmask = true;
#endif

      if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE)
         clip.APIMode = APIMODE_D3D;
      else
         clip.APIMode = APIMODE_OGL;

      clip.GuardbandClipTestEnable = true;

      /* BRW_NEW_VIEWPORT_COUNT */
      const unsigned viewport_count = brw->clip.viewport_count;

      /* BRW_NEW_RASTERIZER_DISCARD: implemented by rejecting everything. */
      if (ctx->RasterDiscard) {
         clip.ClipMode = CLIPMODE_REJECT_ALL;
#if GEN_GEN == 6
         perf_debug("Rasterizer discard is currently implemented via the "
                    "clipper; having the GS not write primitives would "
                    "likely be faster.\n");
#endif
      } else {
         clip.ClipMode = CLIPMODE_NORMAL;
      }

      clip.ClipEnable = true;

      /* _NEW_POLYGON,
       * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE
       */
      if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw))
         clip.ViewportXYClipTestEnable = true;

      clip.MinimumPointWidth = 0.125;
      clip.MaximumPointWidth = 255.875;
      clip.MaximumVPIndex = viewport_count - 1;
      if (_mesa_geometric_layers(fb) == 0)
         clip.ForceZeroRTAIndexEnable = true;
   }
}
1495
/* Atom for the Gen6+ 3DSTATE_CLIP packet above. */
static const struct brw_tracked_state genX(clip_state) = {
   .dirty = {
      .mesa  = _NEW_BUFFERS |
               _NEW_LIGHT |
               _NEW_POLYGON |
               _NEW_TRANSFORM,
      .brw   = BRW_NEW_BLORP |
               BRW_NEW_CONTEXT |
               BRW_NEW_FS_PROG_DATA |
               BRW_NEW_GS_PROG_DATA |
               BRW_NEW_VS_PROG_DATA |
               BRW_NEW_META_IN_PROGRESS |
               BRW_NEW_PRIMITIVE |
               BRW_NEW_RASTERIZER_DISCARD |
               BRW_NEW_TES_PROG_DATA |
               BRW_NEW_VIEWPORT_COUNT,
   },
   .emit = genX(upload_clip_state),
};
1515#endif
1516
1517/* ---------------------------------------------------------------------- */
1518
/**
 * Upload strips-and-fans (SF) state: Gen4-5 indirect SF_STATE, Gen6+
 * inline 3DSTATE_SF.
 *
 * NOTE: the #if GEN_GEN < 6 / #else below deliberately spans the opening of
 * the state-emit block, so the shared tail programs fields common to both
 * the SF_STATE structure and the 3DSTATE_SF packet.  Take care when editing
 * the brace structure.
 */
static void
genX(upload_sf)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   float point_size;

#if GEN_GEN <= 7
   /* _NEW_BUFFERS */
   bool flip_y = ctx->DrawBuffer->FlipY;
   UNUSED const bool multisampled_fbo =
      _mesa_geometric_samples(ctx->DrawBuffer) > 1;
#endif

#if GEN_GEN < 6
   const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data;

   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;

   brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) {
      sf.KernelStartPointer = KSP(brw, brw->sf.prog_offset);
      sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
      sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1;
      sf.DispatchGRFStartRegisterForURBData = 3;
      sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET;
      sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length;
      sf.NumberofURBEntries = brw->urb.nr_sf_entries;
      sf.URBEntryAllocationSize = brw->urb.sfsize - 1;

      /* STATE_PREFETCH command description describes this state as being
       * something loaded through the GPE (L2 ISC), so it's INSTRUCTION
       * domain.
       */
      sf.SetupViewportStateOffset =
         ro_bo(brw->batch.state.bo, brw->sf.vp_offset);

      sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT;

      /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */
      /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */

      sf.MaximumNumberofThreads =
         MIN2(GEN_GEN == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1;

      sf.SpritePointEnable = ctx->Point.PointSprite;

      sf.DestinationOriginHorizontalBias = 0.5;
      sf.DestinationOriginVerticalBias = 0.5;
#else
   brw_batch_emit(brw, GENX(3DSTATE_SF), sf) {
      sf.StatisticsEnable = true;
#endif
      /* Fields from here on are shared by both generations' structures. */
      sf.ViewportTransformEnable = true;

#if GEN_GEN == 7
      /* _NEW_BUFFERS */
      sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw);
#endif

#if GEN_GEN <= 7
      /* _NEW_POLYGON */
      sf.FrontWinding = brw->polygon_front_bit != flip_y;
#if GEN_GEN >= 6
      sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill;
      sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine;
      sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint;

      switch (ctx->Polygon.FrontMode) {
         case GL_FILL:
            sf.FrontFaceFillMode = FILL_MODE_SOLID;
            break;
         case GL_LINE:
            sf.FrontFaceFillMode = FILL_MODE_WIREFRAME;
            break;
         case GL_POINT:
            sf.FrontFaceFillMode = FILL_MODE_POINT;
            break;
         default:
            unreachable("not reached");
      }

      switch (ctx->Polygon.BackMode) {
         case GL_FILL:
            sf.BackFaceFillMode = FILL_MODE_SOLID;
            break;
         case GL_LINE:
            sf.BackFaceFillMode = FILL_MODE_WIREFRAME;
            break;
         case GL_POINT:
            sf.BackFaceFillMode = FILL_MODE_POINT;
            break;
         default:
            unreachable("not reached");
      }

      if (multisampled_fbo && ctx->Multisample.Enabled)
         sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;

      sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;
      sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
      sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
#endif

      sf.ScissorRectangleEnable = true;

      if (ctx->Polygon.CullFlag) {
         switch (ctx->Polygon.CullFaceMode) {
            case GL_FRONT:
               sf.CullMode = CULLMODE_FRONT;
               break;
            case GL_BACK:
               sf.CullMode = CULLMODE_BACK;
               break;
            case GL_FRONT_AND_BACK:
               sf.CullMode = CULLMODE_BOTH;
               break;
            default:
               unreachable("not reached");
         }
      } else {
         sf.CullMode = CULLMODE_NONE;
      }

#if GEN_IS_HASWELL
      sf.LineStippleEnable = ctx->Line.StippleFlag;
#endif

#endif

      /* _NEW_LINE */
#if GEN_GEN == 8
      const struct gen_device_info *devinfo = &brw->screen->devinfo;

      if (devinfo->is_cherryview)
         sf.CHVLineWidth = brw_get_line_width(brw);
      else
         sf.LineWidth = brw_get_line_width(brw);
#else
      sf.LineWidth = brw_get_line_width(brw);
#endif

      if (ctx->Line.SmoothFlag) {
         sf.LineEndCapAntialiasingRegionWidth = _10pixels;
#if GEN_GEN <= 7
         sf.AntiAliasingEnable = true;
#endif
      }

      /* _NEW_POINT - Clamp to ARB_point_parameters user limits */
      point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize);
      /* Clamp to the hardware limits */
      sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f);

      /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */
      if (use_state_point_size(brw))
         sf.PointWidthSource = State;

#if GEN_GEN >= 8
      /* _NEW_POINT | _NEW_MULTISAMPLE */
      if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) &&
          !ctx->Point.PointSprite)
         sf.SmoothPointEnable = true;
#endif

#if GEN_GEN == 10
      /* _NEW_BUFFERS
       * Smooth Point Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
       */
      const bool multisampled_fbo =
         _mesa_geometric_samples(ctx->DrawBuffer) > 1;
      if (multisampled_fbo)
         sf.SmoothPointEnable = false;
#endif

#if GEN_IS_G4X || GEN_GEN >= 5
      sf.AALineDistanceMode = AALINEDISTANCE_TRUE;
#endif

      /* _NEW_LIGHT */
      if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) {
         sf.TriangleStripListProvokingVertexSelect = 2;
         sf.TriangleFanProvokingVertexSelect = 2;
         sf.LineStripListProvokingVertexSelect = 1;
      } else {
         sf.TriangleFanProvokingVertexSelect = 1;
      }

#if GEN_GEN == 6
      /* BRW_NEW_FS_PROG_DATA */
      const struct brw_wm_prog_data *wm_prog_data =
         brw_wm_prog_data(brw->wm.base.prog_data);

      sf.AttributeSwizzleEnable = true;
      sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;

      /*
       * Window coordinates in an FBO are inverted, which means point
       * sprite origin must be inverted, too.
       */
      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) {
         sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
      } else {
         sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
      }

      /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM |
       * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA
       */
      uint32_t urb_entry_read_length;
      uint32_t urb_entry_read_offset;
      uint32_t point_sprite_enables;
      genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables,
                                     &urb_entry_read_length,
                                     &urb_entry_read_offset);
      sf.VertexURBEntryReadLength = urb_entry_read_length;
      sf.VertexURBEntryReadOffset = urb_entry_read_offset;
      sf.PointSpriteTextureCoordinateEnable = point_sprite_enables;
      sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
#endif
   }
}
1739
/* Atom for genX(upload_sf).  The dirty bits mirror the state read in that
 * function: provoking vertex (_NEW_LIGHT), line/point state, multisample on
 * Gen6+, attribute overrides and FS inputs on Gen6, and the Gen4-5 SF
 * unit/viewport/URB inputs.
 */
static const struct brw_tracked_state genX(sf_state) = {
   .dirty = {
      .mesa  = _NEW_LIGHT |
               _NEW_LINE |
               _NEW_POINT |
               _NEW_PROGRAM |
               (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) |
               (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0) |
               (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
      .brw   = BRW_NEW_BLORP |
               BRW_NEW_VUE_MAP_GEOM_OUT |
               (GEN_GEN <= 5 ? BRW_NEW_BATCH |
                               BRW_NEW_PROGRAM_CACHE |
                               BRW_NEW_SF_PROG_DATA |
                               BRW_NEW_SF_VP |
                               BRW_NEW_URB_FENCE
                             : 0) |
               (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) |
               (GEN_GEN >= 6 && GEN_GEN <= 7 ?
                               BRW_NEW_GS_PROG_DATA |
                               BRW_NEW_PRIMITIVE |
                               BRW_NEW_TES_PROG_DATA
                             : 0) |
               (GEN_GEN == 6 ? BRW_NEW_FS_PROG_DATA |
                               BRW_NEW_FRAGMENT_PROGRAM
                             : 0),
   },
   .emit = genX(upload_sf),
};
1769
1770/* ---------------------------------------------------------------------- */
1771
1772static bool
1773brw_color_buffer_write_enabled(struct brw_context *brw)
1774{
1775   struct gl_context *ctx = &brw->ctx;
1776   /* BRW_NEW_FRAGMENT_PROGRAM */
1777   const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
1778   unsigned i;
1779
1780   /* _NEW_BUFFERS */
1781   for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
1782      struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
1783      uint64_t outputs_written = fp->info.outputs_written;
1784
1785      /* _NEW_COLOR */
1786      if (rb && (outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR) ||
1787                 outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA0 + i)) &&
1788          GET_COLORMASK(ctx->Color.ColorMask, i)) {
1789         return true;
1790      }
1791   }
1792
1793   return false;
1794}
1795
/**
 * Upload the fragment ("windower"/WM) fixed-function state.
 *
 * Emits 3DSTATE_WM directly on Gen6+, or an indirect WM_STATE structure
 * (flagging BRW_NEW_GEN4_UNIT_STATE) on Gen4-5.  On Gen6 it also emits
 * 3DSTATE_CONSTANT_PS first, which the SNB PRM requires to be followed by
 * WM state.  Kernel start pointers, dispatch widths and scratch sizes all
 * come from the compiled FS program data (BRW_NEW_FS_PROG_DATA).
 */
static void
genX(upload_wm)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   /* BRW_NEW_FS_PROG_DATA */
   const struct brw_wm_prog_data *wm_prog_data =
      brw_wm_prog_data(brw->wm.base.prog_data);

   UNUSED bool writes_depth =
      wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF;
   UNUSED struct brw_stage_state *stage_state = &brw->wm.base;
   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;

#if GEN_GEN == 6
   /* We can't fold this into gen6_upload_wm_push_constants(), because
    * according to the SNB PRM, vol 2 part 1 section 7.2.2
    * (3DSTATE_CONSTANT_PS [DevSNB]):
    *
    *     "[DevSNB]: This packet must be followed by WM_STATE."
    */
   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) {
      if (wm_prog_data->base.nr_params != 0) {
         wmcp.Buffer0Valid = true;
         /* Pointer to the WM constant buffer.  Covered by the set of
          * state flags from gen6_upload_wm_push_constants.
          */
         wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
         wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
      }
   }
#endif

/* Gen6+: direct 3DSTATE_WM packet; Gen4-5: indirect WM_STATE structure. */
#if GEN_GEN >= 6
   brw_batch_emit(brw, GENX(3DSTATE_WM), wm) {
#else
   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
   brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) {
#endif

#if GEN_GEN <= 6
      /* Enable whichever SIMD8/16/32 kernels the compiler produced. */
      wm._8PixelDispatchEnable = wm_prog_data->dispatch_8;
      wm._16PixelDispatchEnable = wm_prog_data->dispatch_16;
      wm._32PixelDispatchEnable = wm_prog_data->dispatch_32;
#endif

#if GEN_GEN == 4
      /* On gen4, we only have one shader kernel */
      if (brw_wm_state_has_ksp(wm, 0)) {
         assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0);
         wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset);
         wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
         wm.DispatchGRFStartRegisterForConstantSetupData0 =
            brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
      }
#elif GEN_GEN == 5
      /* On gen5, we have multiple shader kernels but only one GRF start
       * register for all kernels
       */
      wm.KernelStartPointer0 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
      wm.KernelStartPointer1 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
      wm.KernelStartPointer2 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);

      wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0);
      wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1);
      wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2);

      wm.DispatchGRFStartRegisterForConstantSetupData0 =
         wm_prog_data->base.dispatch_grf_start_reg;

      /* Dispatch GRF Start should be the same for all shaders on gen5 */
      if (brw_wm_state_has_ksp(wm, 1)) {
         assert(wm_prog_data->base.dispatch_grf_start_reg ==
                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1));
      }
      if (brw_wm_state_has_ksp(wm, 2)) {
         assert(wm_prog_data->base.dispatch_grf_start_reg ==
                brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2));
      }
#elif GEN_GEN == 6
      /* On gen6, we have multiple shader kernels and we no longer specify a
       * register count for each one.
       */
      wm.KernelStartPointer0 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0);
      wm.KernelStartPointer1 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1);
      wm.KernelStartPointer2 = stage_state->prog_offset +
                               brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2);

      wm.DispatchGRFStartRegisterForConstantSetupData0 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0);
      wm.DispatchGRFStartRegisterForConstantSetupData1 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1);
      wm.DispatchGRFStartRegisterForConstantSetupData2 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2);
#endif

#if GEN_GEN <= 5
      wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length;
      /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */
      wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2;
      wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2;
      wm.SetupURBEntryReadOffset = 0;
      wm.EarlyDepthTestEnable = true;
#endif

#if GEN_GEN >= 6
      wm.LineAntialiasingRegionWidth = _10pixels;
      wm.LineEndCapAntialiasingRegionWidth = _05pixels;

      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;
      wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes;
#else
      if (stage_state->sampler_count)
         wm.SamplerStatePointer =
            ro_bo(brw->batch.state.bo, stage_state->sampler_offset);

      wm.LineAntialiasingRegionWidth = _05pixels;
      wm.LineEndCapAntialiasingRegionWidth = _10pixels;

      /* _NEW_POLYGON */
      if (ctx->Polygon.OffsetFill) {
         wm.GlobalDepthOffsetEnable = true;
         /* Something weird going on with legacy_global_depth_bias,
          * offset_constant, scaling and MRD.  This value passes glean
          * but gives some odd results elsewhere (eg. the
          * quad-offset-units test).
          */
         wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2;

         /* This is the only value that passes glean:
         */
         wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor;
      }

      wm.DepthCoefficientURBReadOffset = 1;
#endif

      /* BRW_NEW_STATS_WM */
      wm.StatisticsEnable = GEN_GEN >= 6 || brw->stats_wm;

#if GEN_GEN < 7
      if (wm_prog_data->base.use_alt_mode)
         wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate;

      /* WA_1606682166 */
      /* NOTE(review): GEN_GEN == 11 is unreachable inside this GEN_GEN < 7
       * block; presumably kept so the expression mirrors the Gen11
       * sampler-count workaround used elsewhere — confirm before
       * simplifying.
       */
      wm.SamplerCount = (GEN_GEN == 5 || GEN_GEN == 11) ?
         0 : DIV_ROUND_UP(stage_state->sampler_count, 4);

      wm.BindingTableEntryCount =
         wm_prog_data->base.binding_table.size_bytes / 4;
      wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1;

#if GEN_GEN == 6
      wm.DualSourceBlendEnable =
         wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) &&
         ctx->Color.Blend[0]._UsesDualSrc;
      wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
      wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;

      /* From the SNB PRM, volume 2 part 1, page 281:
       * "If the PS kernel does not need the Position XY Offsets
       * to compute a Position XY value, then this field should be
       * programmed to POSOFFSET_NONE."
       *
       * "SW Recommendation: If the PS kernel needs the Position Offsets
       * to compute a Position XY value, this field should match Position
       * ZW Interpolation Mode to ensure a consistent position.xyzw
       * computation."
       * We only require XY sample offsets. So, this recommendation doesn't
       * look useful at the moment. We might need this in future.
       */
      if (wm_prog_data->uses_pos_offset)
         wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
      else
         wm.PositionXYOffsetSelect = POSOFFSET_NONE;
#endif

      if (wm_prog_data->base.total_scratch) {
         wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
         wm.PerThreadScratchSpace =
            ffs(stage_state->per_thread_scratch) - 11;
      }

      wm.PixelShaderComputedDepth = writes_depth;
#endif

      /* _NEW_LINE */
      wm.LineStippleEnable = ctx->Line.StippleFlag;

      /* _NEW_POLYGON */
      wm.PolygonStippleEnable = ctx->Polygon.StippleFlag;

#if GEN_GEN < 8

#if GEN_GEN >= 6
      wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;

      /* _NEW_BUFFERS */
      const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;

      if (multisampled_fbo) {
         /* _NEW_MULTISAMPLE */
         if (ctx->Multisample.Enabled)
            wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
         else
            wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;

         if (wm_prog_data->persample_dispatch)
            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
         else
            wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
      } else {
         wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
         wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
      }
#endif
      wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
      if (wm_prog_data->uses_kill ||
          _mesa_is_alpha_test_enabled(ctx) ||
          _mesa_is_alpha_to_coverage_enabled(ctx) ||
          (GEN_GEN >= 6 && wm_prog_data->uses_omask)) {
         wm.PixelShaderKillsPixel = true;
      }

      /* _NEW_BUFFERS | _NEW_COLOR */
      if (brw_color_buffer_write_enabled(brw) || writes_depth ||
          wm.PixelShaderKillsPixel ||
          (GEN_GEN >= 6 && wm_prog_data->has_side_effects)) {
         wm.ThreadDispatchEnable = true;
      }

#if GEN_GEN >= 7
      wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
      wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
#endif

      /* The "UAV access enable" bits are unnecessary on HSW because they only
       * seem to have an effect on the HW-assisted coherency mechanism which we
       * don't need, and the rasterization-related UAV_ONLY flag and the
       * DISPATCH_ENABLE bit can be set independently from it.
       * C.f. gen8_upload_ps_extra().
       *
       * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS |
       * _NEW_COLOR
       */
#if GEN_IS_HASWELL
      if (!(brw_color_buffer_write_enabled(brw) || writes_depth) &&
          wm_prog_data->has_side_effects)
         wm.PSUAVonly = ON;
#endif
#endif

#if GEN_GEN >= 7
      /* BRW_NEW_FS_PROG_DATA */
      if (wm_prog_data->early_fragment_tests)
         wm.EarlyDepthStencilControl = EDSC_PREPS;
      else if (wm_prog_data->has_side_effects)
         wm.EarlyDepthStencilControl = EDSC_PSEXEC;
#endif
   }

#if GEN_GEN <= 5
   /* Gen4-5 only: re-emit the global depth offset clamp when it changes. */
   if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) {
      brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) {
         clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp;
      }

      brw->wm.offset_clamp = ctx->Polygon.OffsetClamp;
   }
#endif
}
2072
/* Atom for genX(upload_wm).  The dirty bits follow the state read there:
 * line/polygon stipple, color writes and multisample (pre-Gen8), Gen6 push
 * constants, and the compiled FS program data.
 */
static const struct brw_tracked_state genX(wm_state) = {
   .dirty = {
      .mesa  = _NEW_LINE |
               _NEW_POLYGON |
               (GEN_GEN < 8 ? _NEW_BUFFERS |
                              _NEW_COLOR :
                              0) |
               (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0) |
               (GEN_GEN < 6 ? _NEW_POLYGONSTIPPLE : 0) |
               (GEN_GEN < 8 && GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0),
      .brw   = BRW_NEW_BLORP |
               BRW_NEW_FS_PROG_DATA |
               (GEN_GEN < 6 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
                              BRW_NEW_FRAGMENT_PROGRAM |
                              BRW_NEW_PROGRAM_CACHE |
                              BRW_NEW_SAMPLER_STATE_TABLE |
                              BRW_NEW_STATS_WM
                            : 0) |
               (GEN_GEN < 7 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT),
   },
   .emit = genX(upload_wm),
};
2095
2096/* ---------------------------------------------------------------------- */
2097
2098/* We restrict scratch buffers to the bottom 32 bits of the address space
2099 * by using rw_32_bo().
2100 *
2101 * General State Base Address is a bit broken.  If the address + size as
2102 * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat
2103 * all accesses to the buffer as being out of bounds and returns zero.
2104 */
2105
/* Initialize the thread-dispatch fields shared by the per-stage pipeline
 * packets (used below by genX(upload_vs_state) and friends): kernel start
 * pointer, sampler and binding-table counts (with the Gen11 A0/B0
 * WABTPPrefetchDisable and WA_1606682166 workarounds), scratch space, and
 * URB dispatch/read parameters.  Relies on `stage_state`,
 * `stage_prog_data` and `vue_prog_data` being in scope at the call site.
 */
#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \
   pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset);           \
   /* WA_1606682166 */                                                    \
   pkt.SamplerCount       =                                               \
      GEN_GEN == 11 ?                                                     \
      0 :                                                                 \
      DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);          \
   /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to     \
    * disable prefetching of binding tables in A0 and B0 steppings.       \
    * TODO: Revisit this WA on C0 stepping.                               \
    */                                                                    \
   pkt.BindingTableEntryCount =                                           \
      GEN_GEN == 11 ?                                                     \
      0 :                                                                 \
      stage_prog_data->binding_table.size_bytes / 4;                      \
   pkt.FloatingPointMode  = stage_prog_data->use_alt_mode;                \
                                                                          \
   if (stage_prog_data->total_scratch) {                                  \
      pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \
      pkt.PerThreadScratchSpace =                                         \
         ffs(stage_state->per_thread_scratch) - 11;                       \
   }                                                                      \
                                                                          \
   pkt.DispatchGRFStartRegisterForURBData =                               \
      stage_prog_data->dispatch_grf_start_reg;                            \
   pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length;       \
   pkt.prefix##URBEntryReadOffset = 0;                                    \
                                                                          \
   pkt.StatisticsEnable = true;                                           \
   pkt.Enable           = true;
2136
/**
 * Upload the vertex shader (VS) fixed-function state.
 *
 * Emits 3DSTATE_VS on Gen6+, or an indirect VS_STATE structure (flagging
 * BRW_NEW_GEN4_UNIT_STATE) on Gen4-5.  Gen6 additionally emits
 * 3DSTATE_CONSTANT_VS beforehand and a PIPE_CONTROL flush afterwards; IVB
 * gets a workaround flush before the packet.  See the inline comments for
 * the hardware-workaround rationale.
 */
static void
genX(upload_vs_state)(struct brw_context *brw)
{
   UNUSED struct gl_context *ctx = &brw->ctx;
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct brw_stage_state *stage_state = &brw->vs.base;

   /* BRW_NEW_VS_PROG_DATA */
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(brw->vs.base.prog_data);
   const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base;

   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
          vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
   assert(GEN_GEN < 11 ||
          vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);

#if GEN_GEN == 6
   /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State,
    * 3DSTATE_VS, Dword 5.0 "VS Function Enable":
    *
    *   [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS
    *   command that causes the VS Function Enable to toggle. Pipeline
    *   flush can be executed by sending a PIPE_CONTROL command with CS
    *   stall bit set and a post sync operation.
    *
    * We've already done such a flush at the start of state upload, so we
    * don't need to do another one here.
    */
   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) {
      if (stage_state->push_const_size != 0) {
         cvs.Buffer0Valid = true;
         cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
         cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
      }
   }
#endif

   if (GEN_GEN == 7 && devinfo->is_ivybridge)
      gen7_emit_vs_workaround_flush(brw);

/* Gen6+: direct 3DSTATE_VS packet; Gen4-5: indirect VS_STATE structure. */
#if GEN_GEN >= 6
   brw_batch_emit(brw, GENX(3DSTATE_VS), vs) {
#else
   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
   brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) {
#endif
      INIT_THREAD_DISPATCH_FIELDS(vs, Vertex);

      vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;

#if GEN_GEN < 6
      vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1;
      vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length;
      vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2;

      vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GEN_GEN == 5 ? 2 : 0);
      vs.URBEntryAllocationSize = brw->urb.vsize - 1;

      vs.MaximumNumberofThreads =
         CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1;

      vs.StatisticsEnable = false;
      vs.SamplerStatePointer =
         ro_bo(brw->batch.state.bo, stage_state->sampler_offset);
#endif

#if GEN_GEN == 5
      /* Force single program flow on Ironlake.  We cannot reliably get
       * all applications working without it.  See:
       * https://bugs.freedesktop.org/show_bug.cgi?id=29172
       *
       * The most notable and reliably failing application is the Humus
       * demo "CelShading"
       */
      vs.SingleProgramFlow = true;
      vs.SamplerCount = 0; /* hardware requirement */
#endif

#if GEN_GEN >= 8
      vs.SIMD8DispatchEnable =
         vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;

      vs.UserClipDistanceCullTestEnableBitmask =
         vue_prog_data->cull_distance_mask;
#endif
   }

#if GEN_GEN == 6
   /* Based on my reading of the simulator, the VS constants don't get
    * pulled into the VS FF unit until an appropriate pipeline flush
    * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds
    * references to them into a little FIFO.  The flushes are common,
    * but don't reliably happen between this and a 3DPRIMITIVE, causing
    * the primitive to use the wrong constants.  Then the FIFO
    * containing the constant setup gets added to again on the next
    * constants change, and eventually when a flush does happen the
    * unit is overwhelmed by constant changes and dies.
    *
    * To avoid this, send a PIPE_CONTROL down the line that will
    * update the unit immediately loading the constants.  The flush
    * type bits here were those set by the STATE_BASE_ADDRESS whose
    * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the
    * bug reports that led to this workaround, and may be more than
    * what is strictly required to avoid the issue.
    */
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_DEPTH_STALL |
                               PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                               PIPE_CONTROL_STATE_CACHE_INVALIDATE);
#endif
}
2249
/* Atom for genX(upload_vs_state): re-emitted on new batches, context
 * changes, VS program data changes, plus the Gen6 push-constant inputs and
 * the Gen4-5 CURBE/URB/sampler inputs.
 */
static const struct brw_tracked_state genX(vs_state) = {
   .dirty = {
      .mesa  = (GEN_GEN == 6 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0),
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_CONTEXT |
               BRW_NEW_VS_PROG_DATA |
               (GEN_GEN == 6 ? BRW_NEW_VERTEX_PROGRAM : 0) |
               (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
                               BRW_NEW_PROGRAM_CACHE |
                               BRW_NEW_SAMPLER_STATE_TABLE |
                               BRW_NEW_URB_FENCE
                             : 0),
   },
   .emit = genX(upload_vs_state),
};
2266
2267/* ---------------------------------------------------------------------- */
2268
/**
 * Upload the CC_VIEWPORT table (per-viewport depth clamp range).
 *
 * Packs one CC_VIEWPORT entry per active viewport into batch state space,
 * then points the hardware at the table: via
 * 3DSTATE_VIEWPORT_STATE_POINTERS_CC on Gen7+,
 * 3DSTATE_VIEWPORT_STATE_POINTERS on Gen6, or by stashing the offset in
 * brw->cc.vp_offset and flagging BRW_NEW_CC_VP on Gen4-5.
 */
static void
genX(upload_cc_viewport)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   /* BRW_NEW_VIEWPORT_COUNT */
   const unsigned viewport_count = brw->clip.viewport_count;

   struct GENX(CC_VIEWPORT) ccv;
   uint32_t cc_vp_offset;
   uint32_t *cc_map =
      brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * viewport_count,
                      32, &cc_vp_offset);

   for (unsigned i = 0; i < viewport_count; i++) {
      /* _NEW_VIEWPORT | _NEW_TRANSFORM */
      const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i];
      if (ctx->Transform.DepthClampNear && ctx->Transform.DepthClampFar) {
         ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
         ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
      } else if (ctx->Transform.DepthClampNear) {
         /* NOTE(review): clamping only the near end programs MaximumDepth
          * to 0.0 rather than 1.0 — presumably deliberate for separate
          * near/far clamp semantics, but confirm against the depth-clamp
          * extension behavior before changing.
          */
         ccv.MinimumDepth = MIN2(vp->Near, vp->Far);
         ccv.MaximumDepth = 0.0;
      } else if (ctx->Transform.DepthClampFar) {
         ccv.MinimumDepth = 0.0;
         ccv.MaximumDepth = MAX2(vp->Near, vp->Far);
      } else {
         /* No depth clamping: full [0, 1] range. */
         ccv.MinimumDepth = 0.0;
         ccv.MaximumDepth = 1.0;
      }
      GENX(CC_VIEWPORT_pack)(NULL, cc_map, &ccv);
      cc_map += GENX(CC_VIEWPORT_length);
   }

#if GEN_GEN >= 7
   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) {
      ptr.CCViewportPointer = cc_vp_offset;
   }
#elif GEN_GEN == 6
   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
      vp.CCViewportStateChange = 1;
      vp.PointertoCC_VIEWPORT = cc_vp_offset;
   }
#else
   brw->cc.vp_offset = cc_vp_offset;
   ctx->NewDriverState |= BRW_NEW_CC_VP;
#endif
}
2317
/* Atom for genX(upload_cc_viewport): depth range depends on transform and
 * viewport state, plus the batch and active viewport count.
 */
const struct brw_tracked_state genX(cc_vp) = {
   .dirty = {
      .mesa = _NEW_TRANSFORM |
              _NEW_VIEWPORT,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_VIEWPORT_COUNT,
   },
   .emit = genX(upload_cc_viewport)
};
2328
2329/* ---------------------------------------------------------------------- */
2330
2331static void
2332set_scissor_bits(const struct gl_context *ctx, int i,
2333                 bool flip_y, unsigned fb_width, unsigned fb_height,
2334                 struct GENX(SCISSOR_RECT) *sc)
2335{
2336   int bbox[4];
2337
2338   bbox[0] = MAX2(ctx->ViewportArray[i].X, 0);
2339   bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width);
2340   bbox[2] = CLAMP(ctx->ViewportArray[i].Y, 0, fb_height);
2341   bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height);
2342   _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
2343
2344   if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
2345      /* If the scissor was out of bounds and got clamped to 0 width/height
2346       * at the bounds, the subtraction of 1 from maximums could produce a
2347       * negative number and thus not clip anything.  Instead, just provide
2348       * a min > max scissor inside the bounds, which produces the expected
2349       * no rendering.
2350       */
2351      sc->ScissorRectangleXMin = 1;
2352      sc->ScissorRectangleXMax = 0;
2353      sc->ScissorRectangleYMin = 1;
2354      sc->ScissorRectangleYMax = 0;
2355   } else if (!flip_y) {
2356      /* texmemory: Y=0=bottom */
2357      sc->ScissorRectangleXMin = bbox[0];
2358      sc->ScissorRectangleXMax = bbox[1] - 1;
2359      sc->ScissorRectangleYMin = bbox[2];
2360      sc->ScissorRectangleYMax = bbox[3] - 1;
2361   } else {
2362      /* memory: Y=0=top */
2363      sc->ScissorRectangleXMin = bbox[0];
2364      sc->ScissorRectangleXMax = bbox[1] - 1;
2365      sc->ScissorRectangleYMin = fb_height - bbox[3];
2366      sc->ScissorRectangleYMax = fb_height - bbox[2] - 1;
2367   }
2368}
2369
2370#if GEN_GEN >= 6
/**
 * Upload SCISSOR_RECT entries for every active viewport and emit
 * 3DSTATE_SCISSOR_STATE_POINTERS.  (Gen6+ only; guarded by the enclosing
 * #if GEN_GEN >= 6.)
 */
static void
genX(upload_scissor_state)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   const bool flip_y = ctx->DrawBuffer->FlipY;
   struct GENX(SCISSOR_RECT) scissor;
   uint32_t scissor_state_offset;
   const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer);
   const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
   uint32_t *scissor_map;

   /* BRW_NEW_VIEWPORT_COUNT */
   const unsigned viewport_count = brw->clip.viewport_count;

   scissor_map = brw_state_batch(
      brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count,
      32, &scissor_state_offset);

   /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */

   /* The scissor only needs to handle the intersection of drawable and
    * scissor rect.  Clipping to the boundaries of static shared buffers
    * for front/back/depth is covered by looping over cliprects in brw_draw.c.
    *
    * Note that the hardware's coordinates are inclusive, while Mesa's min is
    * inclusive but max is exclusive.
    */
   for (unsigned i = 0; i < viewport_count; i++) {
      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor);
      GENX(SCISSOR_RECT_pack)(
         NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor);
   }

   brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) {
      ptr.ScissorRectPointer = scissor_state_offset;
   }
}
2408
/* Atom for genX(upload_scissor_state): scissor rects depend on buffer
 * size/orientation, scissor and viewport state, plus the batch and the
 * active viewport count.
 */
static const struct brw_tracked_state genX(scissor_state) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_SCISSOR |
              _NEW_VIEWPORT,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_VIEWPORT_COUNT,
   },
   .emit = genX(upload_scissor_state),
};
2420#endif
2421
2422/* ---------------------------------------------------------------------- */
2423
2424static void
2425brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height,
2426                             float m00, float m11, float m30, float m31,
2427                             float *xmin, float *xmax,
2428                             float *ymin, float *ymax)
2429{
2430   /* According to the "Vertex X,Y Clamping and Quantization" section of the
2431    * Strips and Fans documentation:
2432    *
2433    * "The vertex X and Y screen-space coordinates are also /clamped/ to the
2434    *  fixed-point "guardband" range supported by the rasterization hardware"
2435    *
2436    * and
2437    *
2438    * "In almost all circumstances, if an object’s vertices are actually
2439    *  modified by this clamping (i.e., had X or Y coordinates outside of
2440    *  the guardband extent the rendered object will not match the intended
2441    *  result.  Therefore software should take steps to ensure that this does
2442    *  not happen - e.g., by clipping objects such that they do not exceed
2443    *  these limits after the Drawing Rectangle is applied."
2444    *
2445    * I believe the fundamental restriction is that the rasterizer (in
2446    * the SF/WM stages) have a limit on the number of pixels that can be
2447    * rasterized.  We need to ensure any coordinates beyond the rasterizer
2448    * limit are handled by the clipper.  So effectively that limit becomes
2449    * the clipper's guardband size.
2450    *
2451    * It goes on to say:
2452    *
2453    * "In addition, in order to be correctly rendered, objects must have a
2454    *  screenspace bounding box not exceeding 8K in the X or Y direction.
2455    *  This additional restriction must also be comprehended by software,
2456    *  i.e., enforced by use of clipping."
2457    *
2458    * This makes no sense.  Gen7+ hardware supports 16K render targets,
2459    * and you definitely need to be able to draw polygons that fill the
2460    * surface.  Our assumption is that the rasterizer was limited to 8K
2461    * on Sandybridge, which only supports 8K surfaces, and it was actually
2462    * increased to 16K on Ivybridge and later.
2463    *
2464    * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge.
2465    */
2466   const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f;
2467
2468   /* Workaround: prevent gpu hangs on SandyBridge
2469    * by disabling guardband clipping for odd dimensions.
2470    */
2471   if (GEN_GEN == 6 && (fb_width & 1 || fb_height & 1)) {
2472      *xmin = -1.0f;
2473      *xmax =  1.0f;
2474      *ymin = -1.0f;
2475      *ymax =  1.0f;
2476      return;
2477   }
2478
2479   if (m00 != 0 && m11 != 0) {
2480      /* First, we compute the screen-space render area */
2481      const float ss_ra_xmin = MIN3(        0, m30 + m00, m30 - m00);
2482      const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00);
2483      const float ss_ra_ymin = MIN3(        0, m31 + m11, m31 - m11);
2484      const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11);
2485
2486      /* We want the guardband to be centered on that */
2487      const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size;
2488      const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size;
2489      const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size;
2490      const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size;
2491
2492      /* Now we need it in native device coordinates */
2493      const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00;
2494      const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00;
2495      const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11;
2496      const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11;
2497
2498      /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be
2499       * flipped upside-down.  X should be fine though.
2500       */
2501      assert(ndc_gb_xmin <= ndc_gb_xmax);
2502      *xmin = ndc_gb_xmin;
2503      *xmax = ndc_gb_xmax;
2504      *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax);
2505      *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax);
2506   } else {
2507      /* The viewport scales to 0, so nothing will be rendered. */
2508      *xmin = 0.0f;
2509      *xmax = 0.0f;
2510      *ymin = 0.0f;
2511      *ymax = 0.0f;
2512   }
2513}
2514
2515static void
2516genX(upload_sf_clip_viewport)(struct brw_context *brw)
2517{
2518   struct gl_context *ctx = &brw->ctx;
2519   float y_scale, y_bias;
2520
2521   /* BRW_NEW_VIEWPORT_COUNT */
2522   const unsigned viewport_count = brw->clip.viewport_count;
2523
2524   /* _NEW_BUFFERS */
2525   const bool flip_y = ctx->DrawBuffer->FlipY;
2526   const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer);
2527   const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
2528
2529#if GEN_GEN >= 7
2530#define clv sfv
2531   struct GENX(SF_CLIP_VIEWPORT) sfv;
2532   uint32_t sf_clip_vp_offset;
2533   uint32_t *sf_clip_map =
2534      brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count,
2535                      64, &sf_clip_vp_offset);
2536#else
2537   struct GENX(SF_VIEWPORT) sfv;
2538   struct GENX(CLIP_VIEWPORT) clv;
2539   uint32_t sf_vp_offset, clip_vp_offset;
2540   uint32_t *sf_map =
2541      brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count,
2542                      32, &sf_vp_offset);
2543   uint32_t *clip_map =
2544      brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count,
2545                      32, &clip_vp_offset);
2546#endif
2547
2548   /* _NEW_BUFFERS */
2549   if (flip_y) {
2550      y_scale = -1.0;
2551      y_bias = (float)fb_height;
2552   } else {
2553      y_scale = 1.0;
2554      y_bias = 0;
2555   }
2556
2557   for (unsigned i = 0; i < brw->clip.viewport_count; i++) {
2558      /* _NEW_VIEWPORT: Guardband Clipping */
2559      float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax;
2560      _mesa_get_viewport_xform(ctx, i, scale, translate);
2561
2562      sfv.ViewportMatrixElementm00 = scale[0];
2563      sfv.ViewportMatrixElementm11 = scale[1] * y_scale,
2564      sfv.ViewportMatrixElementm22 = scale[2],
2565      sfv.ViewportMatrixElementm30 = translate[0],
2566      sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias,
2567      sfv.ViewportMatrixElementm32 = translate[2],
2568      brw_calculate_guardband_size(fb_width, fb_height,
2569                                   sfv.ViewportMatrixElementm00,
2570                                   sfv.ViewportMatrixElementm11,
2571                                   sfv.ViewportMatrixElementm30,
2572                                   sfv.ViewportMatrixElementm31,
2573                                   &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax);
2574
2575
2576      clv.XMinClipGuardband = gb_xmin;
2577      clv.XMaxClipGuardband = gb_xmax;
2578      clv.YMinClipGuardband = gb_ymin;
2579      clv.YMaxClipGuardband = gb_ymax;
2580
2581#if GEN_GEN < 6
2582      set_scissor_bits(ctx, i, flip_y, fb_width, fb_height,
2583                       &sfv.ScissorRectangle);
2584#elif GEN_GEN >= 8
2585      /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport
2586       * The hardware will take the intersection of the drawing rectangle,
2587       * scissor rectangle, and the viewport extents.  However, emitting
2588       * 3DSTATE_DRAWING_RECTANGLE is expensive since it requires a full
2589       * pipeline stall so we're better off just being a little more clever
2590       * with our viewport so we can emit it once at context creation time.
2591       */
2592      const float viewport_Xmin = MAX2(ctx->ViewportArray[i].X, 0);
2593      const float viewport_Ymin = MAX2(ctx->ViewportArray[i].Y, 0);
2594      const float viewport_Xmax =
2595         MIN2(ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width, fb_width);
2596      const float viewport_Ymax =
2597         MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height);
2598
2599      if (flip_y) {
2600         sfv.XMinViewPort = viewport_Xmin;
2601         sfv.XMaxViewPort = viewport_Xmax - 1;
2602         sfv.YMinViewPort = fb_height - viewport_Ymax;
2603         sfv.YMaxViewPort = fb_height - viewport_Ymin - 1;
2604      } else {
2605         sfv.XMinViewPort = viewport_Xmin;
2606         sfv.XMaxViewPort = viewport_Xmax - 1;
2607         sfv.YMinViewPort = viewport_Ymin;
2608         sfv.YMaxViewPort = viewport_Ymax - 1;
2609      }
2610#endif
2611
2612#if GEN_GEN >= 7
2613      GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv);
2614      sf_clip_map += GENX(SF_CLIP_VIEWPORT_length);
2615#else
2616      GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv);
2617      GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv);
2618      sf_map += GENX(SF_VIEWPORT_length);
2619      clip_map += GENX(CLIP_VIEWPORT_length);
2620#endif
2621   }
2622
2623#if GEN_GEN >= 7
2624   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) {
2625      ptr.SFClipViewportPointer = sf_clip_vp_offset;
2626   }
2627#elif GEN_GEN == 6
2628   brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) {
2629      vp.SFViewportStateChange = 1;
2630      vp.CLIPViewportStateChange = 1;
2631      vp.PointertoCLIP_VIEWPORT = clip_vp_offset;
2632      vp.PointertoSF_VIEWPORT = sf_vp_offset;
2633   }
2634#else
2635   brw->sf.vp_offset = sf_vp_offset;
2636   brw->clip.vp_offset = clip_vp_offset;
2637   brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP;
2638#endif
2639}
2640
/* Atom for genX(upload_sf_clip_viewport).  _NEW_SCISSOR is only relevant
 * on Gen4/5, where the scissor rectangle is embedded in SF_VIEWPORT.
 */
static const struct brw_tracked_state genX(sf_clip_viewport) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_VIEWPORT |
              (GEN_GEN <= 5 ? _NEW_SCISSOR : 0),
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_VIEWPORT_COUNT,
   },
   .emit = genX(upload_sf_clip_viewport),
};
2652
2653/* ---------------------------------------------------------------------- */
2654
/* Upload geometry shader stage state.
 *
 * Emits 3DSTATE_GS on Gen6+, or the classic GS_STATE structure on
 * Gen4/5.  "active" means a GL geometry program is bound (only possible
 * on Gen6+); when no GL GS is active, the fixed-function GS program
 * (brw->ff_gs) may still be enabled — e.g. for Gen6 transform feedback —
 * and is programmed here as well.
 */
static void
genX(upload_gs_state)(struct brw_context *brw)
{
   UNUSED struct gl_context *ctx = &brw->ctx;
   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
   const struct brw_stage_state *stage_state = &brw->gs.base;
   const struct gl_program *gs_prog = brw->programs[MESA_SHADER_GEOMETRY];
   /* BRW_NEW_GEOMETRY_PROGRAM */
   bool active = GEN_GEN >= 6 && gs_prog;

   /* BRW_NEW_GS_PROG_DATA */
   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
   UNUSED const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(stage_prog_data);
#if GEN_GEN >= 7
   const struct brw_gs_prog_data *gs_prog_data =
      brw_gs_prog_data(stage_prog_data);
#endif

   /* Gen6: point the GS at its push-constant buffer (if any). */
#if GEN_GEN == 6
   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) {
      if (active && stage_state->push_const_size != 0) {
         cgs.Buffer0Valid = true;
         cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset;
         cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1;
      }
   }
#endif

#if GEN_GEN == 7 && !GEN_IS_HASWELL
   /**
    * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
    * Geometry > Geometry Shader > State:
    *
    *     "Note: Because of corruption in IVB:GT2, software needs to flush the
    *     whole fixed function pipeline when the GS enable changes value in
    *     the 3DSTATE_GS."
    *
    * The hardware architects have clarified that in this context "flush the
    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
    * Stall" bit set.
    */
   if (devinfo->gt == 2 && brw->gs.enabled != active)
      gen7_emit_cs_stall_flush(brw);
#endif

   /* Both branches open a packing scope named "gs" so the field
    * assignments below are shared across generations.
    */
#if GEN_GEN >= 6
   brw_batch_emit(brw, GENX(3DSTATE_GS), gs) {
#else
   ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
   brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) {
#endif

#if GEN_GEN >= 6
      if (active) {
         INIT_THREAD_DISPATCH_FIELDS(gs, Vertex);

#if GEN_GEN >= 7
         gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
         gs.OutputTopology = gs_prog_data->output_topology;
         gs.ControlDataHeaderSize =
            gs_prog_data->control_data_header_size_hwords;

         gs.InstanceControl = gs_prog_data->invocations - 1;
         gs.DispatchMode = vue_prog_data->dispatch_mode;

         gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;

         gs.ControlDataFormat = gs_prog_data->control_data_format;
#endif

         /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between
          * Ivy Bridge and Haswell.
          *
          * On Ivy Bridge, setting this bit causes the vertices of a triangle
          * strip to be delivered to the geometry shader in an order that does
          * not strictly follow the OpenGL spec, but preserves triangle
          * orientation.  For example, if the vertices are (1, 2, 3, 4, 5), then
          * the geometry shader sees triangles:
          *
          * (1, 2, 3), (2, 4, 3), (3, 4, 5)
          *
          * (Clearing the bit is even worse, because it fails to preserve
          * orientation).
          *
          * Triangle strips with adjacency always ordered in a way that preserves
          * triangle orientation but does not strictly follow the OpenGL spec,
          * regardless of the setting of this bit.
          *
          * On Haswell, both triangle strips and triangle strips with adjacency
          * are always ordered in a way that preserves triangle orientation.
          * Setting this bit causes the ordering to strictly follow the OpenGL
          * spec.
          *
          * So in either case we want to set the bit.  Unfortunately on Ivy
          * Bridge this will get the order close to correct but not perfect.
          */
         gs.ReorderMode = TRAILING;
         gs.MaximumNumberofThreads =
            GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1)
                         : (devinfo->max_gs_threads - 1);

#if GEN_GEN < 7
         gs.SOStatisticsEnable = true;
         if (gs_prog->info.has_transform_feedback_varyings)
            gs.SVBIPayloadEnable = _mesa_is_xfb_active_and_unpaused(ctx);

         /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it
          * was previously done for gen6.
          *
          * TODO: test with both disabled to see if the HW is behaving
          * as expected, like in gen7.
          */
         gs.SingleProgramFlow = true;
         gs.VectorMaskEnable = true;
#endif

#if GEN_GEN >= 8
         gs.ExpectedVertexCount = gs_prog_data->vertices_in;

         if (gs_prog_data->static_vertex_count != -1) {
            gs.StaticOutput = true;
            gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count;
         }
         gs.IncludeVertexHandles = vue_prog_data->include_vue_handles;

         gs.UserClipDistanceCullTestEnableBitmask =
            vue_prog_data->cull_distance_mask;

         /* URB output starts after the 1-hword VUE header. */
         const int urb_entry_write_offset = 1;
         const uint32_t urb_entry_output_length =
            DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) -
            urb_entry_write_offset;

         gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset;
         gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1);
#endif
      }
#endif

#if GEN_GEN <= 6
      if (!active && brw->ff_gs.prog_active) {
         /* In gen6, transform feedback for the VS stage is done with an
          * ad-hoc GS program. This function provides the needed 3DSTATE_GS
          * for this.
          */
         gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset);
         gs.SingleProgramFlow = true;
         gs.DispatchGRFStartRegisterForURBData = GEN_GEN == 6 ? 2 : 1;
         gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length;

#if GEN_GEN <= 5
         gs.GRFRegisterCount =
            DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1;
         /* BRW_NEW_URB_FENCE */
         gs.NumberofURBEntries = brw->urb.nr_gs_entries;
         gs.URBEntryAllocationSize = brw->urb.vsize - 1;
         gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 1 : 0;
         gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate;
#else
         gs.Enable = true;
         gs.VectorMaskEnable = true;
         gs.SVBIPayloadEnable = true;
         gs.SVBIPostIncrementEnable = true;
         gs.SVBIPostIncrementValue =
            brw->ff_gs.prog_data->svbi_postincrement_value;
         gs.SOStatisticsEnable = true;
         gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
#endif
      }
#endif
      /* No GS of any kind: program the pass-through defaults. */
      if (!active && !brw->ff_gs.prog_active) {
#if GEN_GEN < 8
         gs.DispatchGRFStartRegisterForURBData = 1;
#if GEN_GEN >= 7
         gs.IncludeVertexHandles = true;
#endif
#endif
      }

#if GEN_GEN >= 6
      gs.StatisticsEnable = true;
#endif
#if GEN_GEN == 5 || GEN_GEN == 6
      gs.RenderingEnabled = true;
#endif
#if GEN_GEN <= 5
      gs.MaximumVPIndex = brw->clip.viewport_count - 1;
#endif
   }

   /* Remember the enable state for the IVB:GT2 flush workaround above. */
#if GEN_GEN == 6
   brw->gs.enabled = active;
#endif
}
2850
/* Atom for genX(upload_gs_state).  Gen4/5 depend on fixed-function GS
 * bookkeeping (URB fence, program cache, viewport count); Gen6+ track
 * the GL geometry program and its compiled prog data; Gen4-6 also use
 * the fixed-function GS program data.
 */
static const struct brw_tracked_state genX(gs_state) = {
   .dirty = {
      .mesa  = (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0),
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION |
                               BRW_NEW_PROGRAM_CACHE |
                               BRW_NEW_URB_FENCE |
                               BRW_NEW_VIEWPORT_COUNT
                             : 0) |
               (GEN_GEN >= 6 ? BRW_NEW_CONTEXT |
                               BRW_NEW_GEOMETRY_PROGRAM |
                               BRW_NEW_GS_PROG_DATA
                             : 0) |
               (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0),
   },
   .emit = genX(upload_gs_state),
};
2869
2870/* ---------------------------------------------------------------------- */
2871
2872UNUSED static GLenum
2873fix_dual_blend_alpha_to_one(GLenum function)
2874{
2875   switch (function) {
2876   case GL_SRC1_ALPHA:
2877      return GL_ONE;
2878
2879   case GL_ONE_MINUS_SRC1_ALPHA:
2880      return GL_ZERO;
2881   }
2882
2883   return function;
2884}
2885
/* Shorthand for translating GL blend enums into hardware encodings. */
#define blend_factor(x) brw_translate_blend_factor(x)
#define blend_eqn(x) brw_translate_blend_equation(x)
2888
2889/**
2890 * Modify blend function to force destination alpha to 1.0
2891 *
2892 * If \c function specifies a blend function that uses destination alpha,
2893 * replace it with a function that hard-wires destination alpha to 1.0.  This
2894 * is used when rendering to xRGB targets.
2895 */
2896static GLenum
2897brw_fix_xRGB_alpha(GLenum function)
2898{
2899   switch (function) {
2900   case GL_DST_ALPHA:
2901      return GL_ONE;
2902
2903   case GL_ONE_MINUS_DST_ALPHA:
2904   case GL_SRC_ALPHA_SATURATE:
2905      return GL_ZERO;
2906   }
2907
2908   return function;
2909}
2910
/* Per-RT blend state lives in BLEND_STATE_ENTRY on Gen6+, but in
 * COLOR_CALC_STATE on Gen4/5.  This alias lets set_blend_entry_bits()
 * operate on either representation.
 */
#if GEN_GEN >= 6
typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML;
#else
typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML;
#endif
2916
/* Fill in the logic-op and blend fields of one hardware blend entry
 * (BLEND_STATE_ENTRY on Gen6+, COLOR_CALC_STATE on Gen4/5) for color
 * buffer \p i, from current GL state.
 *
 * Returns true if RGB and alpha ended up with different blend factors
 * or equations, i.e. the caller needs IndependentAlphaBlendEnable.
 */
UNUSED static bool
set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i,
                     bool alpha_to_one)
{
   struct gl_context *ctx = &brw->ctx;

   /* _NEW_BUFFERS */
   const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];

   bool independent_alpha_blend = false;

   /* Used for implementing the following bit of GL_EXT_texture_integer:
    * "Per-fragment operations that require floating-point color
    *  components, including multisample alpha operations, alpha test,
    *  blending, and dithering, have no effect when the corresponding
    *  colors are written to an integer color buffer."
    */
   const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i);

   /* Gen4/5 have a single blend enable covering all color buffers. */
   const unsigned blend_enabled = GEN_GEN >= 6 ?
      ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled;

   /* _NEW_COLOR */
   if (ctx->Color.ColorLogicOpEnabled) {
      GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format)
         : GL_UNSIGNED_NORMALIZED;
      WARN_ONCE(ctx->Color.LogicOp != GL_COPY &&
                rb_type != GL_UNSIGNED_NORMALIZED &&
                rb_type != GL_FLOAT, "Ignoring %s logic op on %s "
                "renderbuffer\n",
                _mesa_enum_to_string(ctx->Color.LogicOp),
                _mesa_enum_to_string(rb_type));
      /* Before Gen8, only enable the logic op on UNORM targets. */
      if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) {
         entry->LogicOpEnable = true;
         entry->LogicOpFunction = ctx->Color._LogicOp;
      }
   } else if (blend_enabled && !ctx->Color._AdvancedBlendMode
              && (GEN_GEN <= 5 || !integer)) {
      GLenum eqRGB = ctx->Color.Blend[i].EquationRGB;
      GLenum eqA = ctx->Color.Blend[i].EquationA;
      GLenum srcRGB = ctx->Color.Blend[i].SrcRGB;
      GLenum dstRGB = ctx->Color.Blend[i].DstRGB;
      GLenum srcA = ctx->Color.Blend[i].SrcA;
      GLenum dstA = ctx->Color.Blend[i].DstA;

      /* MIN/MAX equations ignore the factors entirely. */
      if (eqRGB == GL_MIN || eqRGB == GL_MAX)
         srcRGB = dstRGB = GL_ONE;

      if (eqA == GL_MIN || eqA == GL_MAX)
         srcA = dstA = GL_ONE;

      /* Due to hardware limitations, the destination may have information
       * in an alpha channel even when the format specifies no alpha
       * channel. In order to avoid getting any incorrect blending due to
       * that alpha channel, coerce the blend factors to values that will
       * not read the alpha channel, but will instead use the correct
       * implicit value for alpha.
       */
      if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat,
                                               GL_TEXTURE_ALPHA_TYPE)) {
         srcRGB = brw_fix_xRGB_alpha(srcRGB);
         srcA = brw_fix_xRGB_alpha(srcA);
         dstRGB = brw_fix_xRGB_alpha(dstRGB);
         dstA = brw_fix_xRGB_alpha(dstA);
      }

      /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable):
       * "If Dual Source Blending is enabled, this bit must be disabled."
       *
       * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO,
       * and leave it enabled anyway.
       */
      if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) {
         srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
         srcA = fix_dual_blend_alpha_to_one(srcA);
         dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
         dstA = fix_dual_blend_alpha_to_one(dstA);
      }

      /* BRW_NEW_FS_PROG_DATA */
      const struct brw_wm_prog_data *wm_prog_data =
         brw_wm_prog_data(brw->wm.base.prog_data);

      /* The Dual Source Blending documentation says:
       *
       * "If SRC1 is included in a src/dst blend factor and
       * a DualSource RT Write message is not used, results
       * are UNDEFINED. (This reflects the same restriction in DX APIs,
       * where undefined results are produced if “o1” is not written
       * by a PS – there are no default values defined).
       * If SRC1 is not included in a src/dst blend factor,
       * dual source blending must be disabled."
       *
       * There is no way to gracefully fix this undefined situation
       * so we just disable the blending to prevent possible issues.
       */
      /* NOTE(review): this checks Blend[0]._UsesDualSrc while the code
       * above uses Blend[i] — presumably equivalent because dual-source
       * blending restricts rendering to a single color buffer; confirm
       * this is intentional.
       */
      entry->ColorBufferBlendEnable =
         !ctx->Color.Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;

      entry->DestinationBlendFactor = blend_factor(dstRGB);
      entry->SourceBlendFactor = blend_factor(srcRGB);
      entry->DestinationAlphaBlendFactor = blend_factor(dstA);
      entry->SourceAlphaBlendFactor = blend_factor(srcA);
      entry->ColorBlendFunction = blend_eqn(eqRGB);
      entry->AlphaBlendFunction = blend_eqn(eqA);

      if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB)
         independent_alpha_blend = true;
   }

   return independent_alpha_blend;
}
3029
3030#if GEN_GEN >= 6
/* Upload BLEND_STATE for all color draw buffers and point the hardware
 * at it (Gen6+; this whole function is compiled out on Gen4/5).
 *
 * Layout: on Gen8+, a 1-dword BLEND_STATE header is followed by one
 * 2-dword BLEND_STATE_ENTRY per draw buffer; before Gen8 there is no
 * shared header and each entry holds the "global" bits as well.
 */
static void
genX(upload_blend_state)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   int size;

   /* We need at least one BLEND_STATE written, because we might do
    * thread dispatch even if _NumColorDrawBuffers is 0 (for example
    * for computed depth or alpha test), which will do an FB write
    * with render target 0, which will reference BLEND_STATE[0] for
    * alpha test enable.
    */
   int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers;
   if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled)
      nr_draw_buffers = 1;

   size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers;
#if GEN_GEN >= 8
   size += GENX(BLEND_STATE_length) * 4;
#endif

   uint32_t *blend_map;
   blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset);

   /* On Gen8+ the global bits live in the single BLEND_STATE header;
    * before Gen8 they are replicated into each entry, so "blend" is
    * simply aliased to the per-RT "entry" via the #define below.
    */
#if GEN_GEN >= 8
   struct GENX(BLEND_STATE) blend = { 0 };
   {
#else
   for (int i = 0; i < nr_draw_buffers; i++) {
      struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#define blend entry
#endif
      /* OpenGL specification 3.3 (page 196), section 4.1.3 says:
       * "If drawbuffer zero is not NONE and the buffer it references has an
       * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
       * operations are skipped."
       */
      if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) {
         /* _NEW_MULTISAMPLE */
         if (_mesa_is_multisample_enabled(ctx)) {
            if (ctx->Multisample.SampleAlphaToCoverage) {
               blend.AlphaToCoverageEnable = true;
               blend.AlphaToCoverageDitherEnable = GEN_GEN >= 7;
            }
            if (ctx->Multisample.SampleAlphaToOne)
               blend.AlphaToOneEnable = true;
         }

         /* _NEW_COLOR */
         if (ctx->Color.AlphaEnabled) {
            blend.AlphaTestEnable = true;
            blend.AlphaTestFunction =
               intel_translate_compare_func(ctx->Color.AlphaFunc);
         }

         if (ctx->Color.DitherFlag) {
            blend.ColorDitherEnable = true;
         }
      }

#if GEN_GEN >= 8
      for (int i = 0; i < nr_draw_buffers; i++) {
         struct GENX(BLEND_STATE_ENTRY) entry = { 0 };
#else
      {
#endif
         blend.IndependentAlphaBlendEnable =
            set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) ||
            blend.IndependentAlphaBlendEnable;

         /* See section 8.1.6 "Pre-Blend Color Clamping" of the
          * SandyBridge PRM Volume 2 Part 1 for HW requirements.
          *
          * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR
          * clamping in the fragment shader.  For its clamping of
          * blending, the spec says:
          *
          *     "RESOLVED: For fixed-point color buffers, the inputs and
          *      the result of the blending equation are clamped.  For
          *      floating-point color buffers, no clamping occurs."
          *
          * So, generally, we want clamping to the render target's range.
          * And, good news, the hardware tables for both pre- and
          * post-blend color clamping are either ignored, or any are
          * allowed, or clamping is required but RT range clamping is a
          * valid option.
          */
         entry.PreBlendColorClampEnable = true;
         entry.PostBlendColorClampEnable = true;
         entry.ColorClampRange = COLORCLAMP_RTFORMAT;

         /* _NEW_COLOR: per-channel write masks */
         entry.WriteDisableRed   = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 0);
         entry.WriteDisableGreen = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 1);
         entry.WriteDisableBlue  = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 2);
         entry.WriteDisableAlpha = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 3);

         /* Gen8+: entries start after the 1-dword BLEND_STATE header. */
#if GEN_GEN >= 8
         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry);
#else
         GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry);
#endif
      }
   }

#if GEN_GEN >= 8
   GENX(BLEND_STATE_pack)(NULL, blend_map, &blend);
#endif

#if GEN_GEN < 7
   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
      ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset;
      ptr.BLEND_STATEChange = true;
   }
#else
   brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) {
      ptr.BlendStatePointer = brw->cc.blend_state_offset;
#if GEN_GEN >= 8
      ptr.BlendStatePointerValid = true;
#endif
   }
#endif
}
3153
/* Atom for genX(upload_blend_state).  BRW_NEW_FS_PROG_DATA is needed
 * because set_blend_entry_bits() consults wm_prog_data->dual_src_blend.
 */
static const struct brw_tracked_state genX(blend_state) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_COLOR |
              _NEW_MULTISAMPLE,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_FS_PROG_DATA |
             BRW_NEW_STATE_BASE_ADDRESS,
   },
   .emit = genX(upload_blend_state),
};
3166#endif
3167
3168/* ---------------------------------------------------------------------- */
3169
3170#if GEN_GEN >= 7
/* _3DCommandSubOpcode values for the per-stage 3DSTATE_CONSTANT_*
 * packets (Gen7+).  The compute entry is 0 and unused here — the
 * upload loop below only goes up to the fragment stage.
 */
UNUSED static const uint32_t push_constant_opcodes[] = {
   [MESA_SHADER_VERTEX]                      = 21,
   [MESA_SHADER_TESS_CTRL]                   = 25, /* HS */
   [MESA_SHADER_TESS_EVAL]                   = 26, /* DS */
   [MESA_SHADER_GEOMETRY]                    = 22,
   [MESA_SHADER_FRAGMENT]                    = 23,
   [MESA_SHADER_COMPUTE]                     = 0,
};
3179
/* Emit a 3DSTATE_CONSTANT_* packet for each 3D stage whose push
 * constants are dirty, pointing the hardware at the constant buffers
 * that were uploaded earlier (Gen7+).  The VS..FS packets share one
 * layout, so 3DSTATE_CONSTANT_VS is emitted with the sub-opcode
 * patched per stage from push_constant_opcodes[].
 */
static void
genX(upload_push_constant_packets)(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct gl_context *ctx = &brw->ctx;

   /* Gen7 ORs the L3 MOCS into the constant buffer offset; Gen8+ does not. */
   UNUSED uint32_t mocs = GEN_GEN < 8 ? GEN7_MOCS_L3 : 0;

   struct brw_stage_state *stage_states[] = {
      &brw->vs.base,
      &brw->tcs.base,
      &brw->tes.base,
      &brw->gs.base,
      &brw->wm.base,
   };

   /* IVB (not HSW/BYT) needs a workaround flush before touching VS constants. */
   if (GEN_GEN == 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail &&
       stage_states[MESA_SHADER_VERTEX]->push_constants_dirty)
      gen7_emit_vs_workaround_flush(brw);

   for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) {
      struct brw_stage_state *stage_state = stage_states[stage];
      UNUSED struct gl_program *prog = ctx->_Shader->CurrentProgram[stage];

      if (!stage_state->push_constants_dirty)
         continue;

      brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) {
         pkt._3DCommandSubOpcode = push_constant_opcodes[stage];
         if (stage_state->prog_data) {
#if GEN_GEN >= 8 || GEN_IS_HASWELL
            /* The Skylake PRM contains the following restriction:
             *
             *    "The driver must ensure The following case does not occur
             *     without a flush to the 3D engine: 3DSTATE_CONSTANT_* with
             *     buffer 3 read length equal to zero committed followed by a
             *     3DSTATE_CONSTANT_* with buffer 0 read length not equal to
             *     zero committed."
             *
             * To avoid this, we program the buffers in the highest slots.
             * This way, slot 0 is only used if slot 3 is also used.
             */
            int n = 3;

            for (int i = 3; i >= 0; i--) {
               const struct brw_ubo_range *range =
                  &stage_state->prog_data->ubo_ranges[i];

               if (range->length == 0)
                  continue;

               const struct gl_uniform_block *block =
                  prog->sh.UniformBlocks[range->block];
               const struct gl_buffer_binding *binding =
                  &ctx->UniformBufferBindings[block->Binding];

               /* An unbound UBO leaves the data undefined; warn the app
                * once and skip the slot rather than dereferencing it.
                */
               if (binding->BufferObject == ctx->Shared->NullBufferObj) {
                  static unsigned msg_id = 0;
                  _mesa_gl_debugf(ctx, &msg_id, MESA_DEBUG_SOURCE_API,
                                  MESA_DEBUG_TYPE_UNDEFINED,
                                  MESA_DEBUG_SEVERITY_HIGH,
                                  "UBO %d unbound, %s shader uniform data "
                                  "will be undefined.",
                                  range->block,
                                  _mesa_shader_stage_to_string(stage));
                  continue;
               }

               assert(binding->Offset % 32 == 0);

               struct brw_bo *bo = intel_bufferobj_buffer(brw,
                  intel_buffer_object(binding->BufferObject),
                  binding->Offset, range->length * 32, false);

               pkt.ConstantBody.ReadLength[n] = range->length;
               pkt.ConstantBody.Buffer[n] =
                  ro_bo(bo, range->start * 32 + binding->Offset);
               n--;
            }

            if (stage_state->push_const_size > 0) {
               assert(n >= 0);
               pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size;
               pkt.ConstantBody.Buffer[n] =
                  ro_bo(stage_state->push_const_bo,
                        stage_state->push_const_offset);
            }
#else
            pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size;
            pkt.ConstantBody.Buffer[0].offset =
               stage_state->push_const_offset | mocs;
#endif
         }
      }

      stage_state->push_constants_dirty = false;
      /* NOTE(review): presumably a Gen9+ constant/surface-state interaction
       * requires re-emitting binding tables here — confirm against the
       * surrounding driver code.
       */
      brw->ctx.NewDriverState |= GEN_GEN >= 9 ? BRW_NEW_SURFACES : 0;
   }
}
3279
/* Atom that re-emits the 3DSTATE_CONSTANT_* packets.  It runs on every draw
 * call (BRW_NEW_DRAW_CALL); the emit function itself is expected to skip
 * stages whose push constants are not dirty (it clears
 * stage_state->push_constants_dirty after emitting).
 */
const struct brw_tracked_state genX(push_constant_packets) = {
   .dirty = {
      .mesa  = 0,
      .brw   = BRW_NEW_DRAW_CALL,
   },
   .emit = genX(upload_push_constant_packets),
};
3287#endif
3288
3289#if GEN_GEN >= 6
3290static void
3291genX(upload_vs_push_constants)(struct brw_context *brw)
3292{
3293   struct brw_stage_state *stage_state = &brw->vs.base;
3294
3295   /* BRW_NEW_VERTEX_PROGRAM */
3296   const struct gl_program *vp = brw->programs[MESA_SHADER_VERTEX];
3297   /* BRW_NEW_VS_PROG_DATA */
3298   const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data;
3299
3300   gen6_upload_push_constants(brw, vp, prog_data, stage_state);
3301}
3302
/* Atom for VS push constants: re-upload whenever program constants or
 * transform state change, or when the batch/BLORP/vertex program/VS prog
 * data are replaced.
 */
static const struct brw_tracked_state genX(vs_push_constants) = {
   .dirty = {
      .mesa  = _NEW_PROGRAM_CONSTANTS |
               _NEW_TRANSFORM,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_VERTEX_PROGRAM |
               BRW_NEW_VS_PROG_DATA,
   },
   .emit = genX(upload_vs_push_constants),
};
3314
3315static void
3316genX(upload_gs_push_constants)(struct brw_context *brw)
3317{
3318   struct brw_stage_state *stage_state = &brw->gs.base;
3319
3320   /* BRW_NEW_GEOMETRY_PROGRAM */
3321   const struct gl_program *gp = brw->programs[MESA_SHADER_GEOMETRY];
3322
3323   /* BRW_NEW_GS_PROG_DATA */
3324   struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data;
3325
3326   gen6_upload_push_constants(brw, gp, prog_data, stage_state);
3327}
3328
/* Atom for GS push constants; mirrors genX(vs_push_constants) but keyed on
 * the geometry program and its prog data.
 */
static const struct brw_tracked_state genX(gs_push_constants) = {
   .dirty = {
      .mesa  = _NEW_PROGRAM_CONSTANTS |
               _NEW_TRANSFORM,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_GEOMETRY_PROGRAM |
               BRW_NEW_GS_PROG_DATA,
   },
   .emit = genX(upload_gs_push_constants),
};
3340
3341static void
3342genX(upload_wm_push_constants)(struct brw_context *brw)
3343{
3344   struct brw_stage_state *stage_state = &brw->wm.base;
3345   /* BRW_NEW_FRAGMENT_PROGRAM */
3346   const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3347   /* BRW_NEW_FS_PROG_DATA */
3348   const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data;
3349
3350   gen6_upload_push_constants(brw, fp, prog_data, stage_state);
3351}
3352
/* Atom for FS push constants.  Note: unlike the VS/GS atoms this one does
 * not depend on _NEW_TRANSFORM.
 */
static const struct brw_tracked_state genX(wm_push_constants) = {
   .dirty = {
      .mesa  = _NEW_PROGRAM_CONSTANTS,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_FRAGMENT_PROGRAM |
               BRW_NEW_FS_PROG_DATA,
   },
   .emit = genX(upload_wm_push_constants),
};
3363#endif
3364
3365/* ---------------------------------------------------------------------- */
3366
3367#if GEN_GEN >= 6
3368static unsigned
3369genX(determine_sample_mask)(struct brw_context *brw)
3370{
3371   struct gl_context *ctx = &brw->ctx;
3372   float coverage = 1.0f;
3373   float coverage_invert = false;
3374   unsigned sample_mask = ~0u;
3375
3376   /* BRW_NEW_NUM_SAMPLES */
3377   unsigned num_samples = brw->num_samples;
3378
3379   if (_mesa_is_multisample_enabled(ctx)) {
3380      if (ctx->Multisample.SampleCoverage) {
3381         coverage = ctx->Multisample.SampleCoverageValue;
3382         coverage_invert = ctx->Multisample.SampleCoverageInvert;
3383      }
3384      if (ctx->Multisample.SampleMask) {
3385         sample_mask = ctx->Multisample.SampleMaskValue;
3386      }
3387   }
3388
3389   if (num_samples > 1) {
3390      int coverage_int = (int) (num_samples * coverage + 0.5f);
3391      uint32_t coverage_bits = (1 << coverage_int) - 1;
3392      if (coverage_invert)
3393         coverage_bits ^= (1 << num_samples) - 1;
3394      return coverage_bits & sample_mask;
3395   } else {
3396      return 1;
3397   }
3398}
3399
/**
 * Emit 3DSTATE_MULTISAMPLE for the given sample count.
 *
 * The packet encodes log2(num_samples); on Gen6 and Gen7 it also carries
 * the standard sample positions (Gen6 hardware only has the 4x table here,
 * while Gen7 selects a table per sample count).  Gen8+ does not program
 * positions in this packet at all, per the #if below.
 */
static void
genX(emit_3dstate_multisample2)(struct brw_context *brw,
                                unsigned num_samples)
{
   /* NOTE(review): assumes num_samples is a power of two, so ffs() yields
    * log2(num_samples) + 1 — the caller asserts 1..16.
    */
   unsigned log2_samples = ffs(num_samples) - 1;

   brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) {
      multi.PixelLocation = CENTER;
      multi.NumberofMultisamples = log2_samples;
#if GEN_GEN == 6
      GEN_SAMPLE_POS_4X(multi.Sample);
#elif GEN_GEN == 7
      switch (num_samples) {
      case 1:
         GEN_SAMPLE_POS_1X(multi.Sample);
         break;
      case 2:
         GEN_SAMPLE_POS_2X(multi.Sample);
         break;
      case 4:
         GEN_SAMPLE_POS_4X(multi.Sample);
         break;
      case 8:
         GEN_SAMPLE_POS_8X(multi.Sample);
         break;
      default:
         break;
      }
#endif
   }
}
3431
/**
 * Upload multisample state: 3DSTATE_MULTISAMPLE followed by
 * 3DSTATE_SAMPLE_MASK computed from current GL coverage/mask state.
 */
static void
genX(upload_multisample_state)(struct brw_context *brw)
{
   assert(brw->num_samples > 0 && brw->num_samples <= 16);

   genX(emit_3dstate_multisample2)(brw, brw->num_samples);

   brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) {
      sm.SampleMask = genX(determine_sample_mask)(brw);
   }
}
3443
/* Atom for 3DSTATE_MULTISAMPLE / 3DSTATE_SAMPLE_MASK.  Gen10 additionally
 * depends on _NEW_BUFFERS.
 */
static const struct brw_tracked_state genX(multisample_state) = {
   .dirty = {
      .mesa = _NEW_MULTISAMPLE |
              (GEN_GEN == 10 ? _NEW_BUFFERS : 0),
      .brw = BRW_NEW_BLORP |
             BRW_NEW_CONTEXT |
             BRW_NEW_NUM_SAMPLES,
   },
   .emit = genX(upload_multisample_state)
};
3454#endif
3455
3456/* ---------------------------------------------------------------------- */
3457
/**
 * Upload COLOR_CALC_STATE.
 *
 * On Gen4-5 this structure also carries blend, depth/stencil, alpha test,
 * and dither state; on Gen6+ it holds only the blend constant color, the
 * stencil reference values (pre-Gen9), and the alpha reference value.
 * Gen6+ additionally emits 3DSTATE_CC_STATE_POINTERS to point at it.
 */
static void
genX(upload_color_calc_state)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) {
#if GEN_GEN <= 5
      cc.IndependentAlphaBlendEnable =
         set_blend_entry_bits(brw, &cc, 0, false);
      set_depth_stencil_bits(brw, &cc);

      /* Alpha test lives here on Gen4-5; only used with a single color
       * draw buffer.
       */
      if (ctx->Color.AlphaEnabled &&
          ctx->DrawBuffer->_NumColorDrawBuffers <= 1) {
         cc.AlphaTestEnable = true;
         cc.AlphaTestFunction =
            intel_translate_compare_func(ctx->Color.AlphaFunc);
      }

      cc.ColorDitherEnable = ctx->Color.DitherFlag;

      cc.StatisticsEnable = brw->stats_wm;

      cc.CCViewportStatePointer =
         ro_bo(brw->batch.state.bo, brw->cc.vp_offset);
#else
      /* _NEW_COLOR */
      cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
      cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
      cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
      cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];

#if GEN_GEN < 9
      /* _NEW_STENCIL: stencil reference values moved out of this structure
       * on Gen9 (they are set elsewhere there).
       */
      cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0);
      cc.BackfaceStencilReferenceValue =
         _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace);
#endif

#endif

      /* _NEW_COLOR */
      UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8,
                               ctx->Color.AlphaRef);
   }

#if GEN_GEN >= 6
   brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) {
      ptr.ColorCalcStatePointer = brw->cc.state_offset;
#if GEN_GEN != 7
      ptr.ColorCalcStatePointerValid = true;
#endif
   }
#else
   /* Gen4-5: the CC unit state is re-packed by the GEN4 unit state atom. */
   brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE;
#endif
}
3514
/* Atom for COLOR_CALC_STATE.  Gen4-5 needs extra Mesa/BRW bits because the
 * structure carries blend/depth-stencil/viewport-pointer state there.
 */
static const struct brw_tracked_state genX(color_calc_state) = {
   .dirty = {
      .mesa = _NEW_COLOR |
              _NEW_STENCIL |
              (GEN_GEN <= 5 ? _NEW_BUFFERS |
                              _NEW_DEPTH
                            : 0),
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             (GEN_GEN <= 5 ? BRW_NEW_CC_VP |
                             BRW_NEW_STATS_WM
                           : BRW_NEW_CC_STATE |
                             BRW_NEW_STATE_BASE_ADDRESS),
   },
   .emit = genX(upload_color_calc_state),
};
3531
3532
3533/* ---------------------------------------------------------------------- */
3534
3535#if GEN_GEN >= 7
3536static void
3537genX(upload_sbe)(struct brw_context *brw)
3538{
3539   struct gl_context *ctx = &brw->ctx;
3540   /* BRW_NEW_FRAGMENT_PROGRAM */
3541   UNUSED const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT];
3542   /* BRW_NEW_FS_PROG_DATA */
3543   const struct brw_wm_prog_data *wm_prog_data =
3544      brw_wm_prog_data(brw->wm.base.prog_data);
3545#if GEN_GEN >= 8
3546   struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } };
3547#else
3548#define attr_overrides sbe.Attribute
3549#endif
3550   uint32_t urb_entry_read_length;
3551   uint32_t urb_entry_read_offset;
3552   uint32_t point_sprite_enables;
3553
3554   brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) {
3555      sbe.AttributeSwizzleEnable = true;
3556      sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs;
3557
3558      /* _NEW_BUFFERS */
3559      bool flip_y = ctx->DrawBuffer->FlipY;
3560
3561      /* _NEW_POINT
3562       *
3563       * Window coordinates in an FBO are inverted, which means point
3564       * sprite origin must be inverted.
3565       */
3566      if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y)
3567         sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT;
3568      else
3569         sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT;
3570
3571      /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM,
3572       * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM |
3573       * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA |
3574       * BRW_NEW_VUE_MAP_GEOM_OUT
3575       */
3576      genX(calculate_attr_overrides)(brw,
3577                                     attr_overrides,
3578                                     &point_sprite_enables,
3579                                     &urb_entry_read_length,
3580                                     &urb_entry_read_offset);
3581
3582      /* Typically, the URB entry read length and offset should be programmed
3583       * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active
3584       * stage which produces geometry.  However, we don't know the proper
3585       * value until we call calculate_attr_overrides().
3586       *
3587       * To fit with our existing code, we override the inherited values and
3588       * specify it here directly, as we did on previous generations.
3589       */
3590      sbe.VertexURBEntryReadLength = urb_entry_read_length;
3591      sbe.VertexURBEntryReadOffset = urb_entry_read_offset;
3592      sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables;
3593      sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs;
3594
3595#if GEN_GEN >= 8
3596      sbe.ForceVertexURBEntryReadLength = true;
3597      sbe.ForceVertexURBEntryReadOffset = true;
3598#endif
3599
3600#if GEN_GEN >= 9
3601      /* prepare the active component dwords */
3602      for (int i = 0; i < 32; i++)
3603         sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW;
3604#endif
3605   }
3606
3607#if GEN_GEN >= 8
3608   brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) {
3609      for (int i = 0; i < 16; i++)
3610         sbes.Attribute[i] = attr_overrides[i];
3611   }
3612#endif
3613
3614#undef attr_overrides
3615}
3616
/* Atom for 3DSTATE_SBE; BRW_NEW_PRIMITIVE only matters on Gen7. */
static const struct brw_tracked_state genX(sbe_state) = {
   .dirty = {
      .mesa  = _NEW_BUFFERS |
               _NEW_LIGHT |
               _NEW_POINT |
               _NEW_POLYGON |
               _NEW_PROGRAM,
      .brw   = BRW_NEW_BLORP |
               BRW_NEW_CONTEXT |
               BRW_NEW_FRAGMENT_PROGRAM |
               BRW_NEW_FS_PROG_DATA |
               BRW_NEW_GS_PROG_DATA |
               BRW_NEW_TES_PROG_DATA |
               BRW_NEW_VUE_MAP_GEOM_OUT |
               (GEN_GEN == 7 ? BRW_NEW_PRIMITIVE
                             : 0),
   },
   .emit = genX(upload_sbe),
};
3636#endif
3637
3638/* ---------------------------------------------------------------------- */
3639
3640#if GEN_GEN >= 7
3641/**
3642 * Outputs the 3DSTATE_SO_DECL_LIST command.
3643 *
3644 * The data output is a series of 64-bit entries containing a SO_DECL per
3645 * stream.  We only have one stream of rendering coming out of the GS unit, so
3646 * we only emit stream 0 (low 16 bits) SO_DECLs.
3647 */
/**
 * Outputs the 3DSTATE_SO_DECL_LIST command.
 *
 * The data output is a series of 64-bit entries, each containing one SO_DECL
 * per vertex stream (four 16-bit SO_DECLs packed side by side).  This
 * function builds the SO_DECL lists for all MAX_VERTEX_STREAMS streams from
 * the linked transform feedback info and packs them into one command.
 */
static void
genX(upload_3dstate_so_decl_list)(struct brw_context *brw,
                                  const struct brw_vue_map *vue_map)
{
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   struct gl_transform_feedback_object *xfb_obj =
      ctx->TransformFeedback.CurrentObject;
   const struct gl_transform_feedback_info *linked_xfb_info =
      xfb_obj->program->sh.LinkedTransformFeedback;
   /* Per-stream SO_DECL lists, buffer usage masks, running buffer offsets
    * (in dwords), and entry counts.
    */
   struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128];
   int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0};
   int max_decls = 0;
   STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS);

   memset(so_decl, 0, sizeof(so_decl));

   /* Construct the list of SO_DECLs to be emitted.  The formatting of the
    * command feels strange -- each dword pair contains a SO_DECL per stream.
    */
   for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) {
      const struct gl_transform_feedback_output *output =
         &linked_xfb_info->Outputs[i];
      const int buffer = output->OutputBuffer;
      const int varying = output->OutputRegister;
      const unsigned stream_id = output->StreamId;
      assert(stream_id < MAX_VERTEX_STREAMS);

      buffer_mask[stream_id] |= 1 << buffer;

      assert(vue_map->varying_to_slot[varying] >= 0);

      /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[]
       * array.  Instead, it simply increments DstOffset for the following
       * input by the number of components that should be skipped.
       *
       * Our hardware is unusual in that it requires us to program SO_DECLs
       * for fake "hole" components, rather than simply taking the offset
       * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
       * program as many size = 4 holes as we can, then a final hole to
       * accommodate the final 1, 2, or 3 remaining.
       */
      int skip_components = output->DstOffset - next_offset[buffer];

      while (skip_components > 0) {
         so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
            .HoleFlag = 1,
            .OutputBufferSlot = output->OutputBuffer,
            .ComponentMask = (1 << MIN2(skip_components, 4)) - 1,
         };
         skip_components -= 4;
      }

      next_offset[buffer] = output->DstOffset + output->NumComponents;

      so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) {
         .OutputBufferSlot = output->OutputBuffer,
         .RegisterIndex = vue_map->varying_to_slot[varying],
         .ComponentMask =
            ((1 << output->NumComponents) - 1) << output->ComponentOffset,
      };

      if (decls[stream_id] > max_decls)
         max_decls = decls[stream_id];
   }

   /* 3 header dwords plus one dword pair per SO_DECL_ENTRY. */
   uint32_t *dw;
   dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls,
                        .StreamtoBufferSelects0 = buffer_mask[0],
                        .StreamtoBufferSelects1 = buffer_mask[1],
                        .StreamtoBufferSelects2 = buffer_mask[2],
                        .StreamtoBufferSelects3 = buffer_mask[3],
                        .NumEntries0 = decls[0],
                        .NumEntries1 = decls[1],
                        .NumEntries2 = decls[2],
                        .NumEntries3 = decls[3]);

   /* Pack the four per-stream SO_DECLs for each entry into the batch. */
   for (int i = 0; i < max_decls; i++) {
      GENX(SO_DECL_ENTRY_pack)(
         brw, dw + 2 + i * 2,
         &(struct GENX(SO_DECL_ENTRY)) {
            .Stream0Decl = so_decl[0][i],
            .Stream1Decl = so_decl[1][i],
            .Stream2Decl = so_decl[2][i],
            .Stream3Decl = so_decl[3][i],
         });
   }
}
3738
/**
 * Emit up to four 3DSTATE_SO_BUFFER packets for the currently bound
 * transform feedback buffers.
 *
 * Unbound or zero-size slots get an empty packet (just the buffer index).
 * On Gen8+ the packet also tracks the stream offset via offset_bo: writing
 * 0xFFFFFFFF tells the hardware to load the saved offset from that BO,
 * while 0 resets it (used when brw_obj->zero_offsets is set).
 */
static void
genX(upload_3dstate_so_buffers)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   struct gl_transform_feedback_object *xfb_obj =
      ctx->TransformFeedback.CurrentObject;
#if GEN_GEN < 8
   const struct gl_transform_feedback_info *linked_xfb_info =
      xfb_obj->program->sh.LinkedTransformFeedback;
#else
   struct brw_transform_feedback_object *brw_obj =
      (struct brw_transform_feedback_object *) xfb_obj;
   uint32_t mocs_wb = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
#endif

   /* Set up the up to 4 output buffers.  These are the ranges defined in the
    * gl_transform_feedback_object.
    */
   for (int i = 0; i < 4; i++) {
      struct intel_buffer_object *bufferobj =
         intel_buffer_object(xfb_obj->Buffers[i]);
      uint32_t start = xfb_obj->Offset[i];
      uint32_t end = ALIGN(start + xfb_obj->Size[i], 4);
      uint32_t const size = end - start;

      if (!bufferobj || !size) {
         /* Unbound slot: emit a disabled SO_BUFFER for this index. */
         brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
            sob.SOBufferIndex = i;
         }
         continue;
      }

      assert(start % 4 == 0);
      struct brw_bo *bo =
         intel_bufferobj_buffer(brw, bufferobj, start, size, true);
      assert(end <= bo->size);

      brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) {
         sob.SOBufferIndex = i;

         sob.SurfaceBaseAddress = rw_bo(bo, start);
#if GEN_GEN < 8
         sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4;
         sob.SurfaceEndAddress = rw_bo(bo, end);
#else
         sob.SOBufferEnable = true;
         sob.StreamOffsetWriteEnable = true;
         sob.StreamOutputBufferOffsetAddressEnable = true;
         sob.MOCS = mocs_wb;

         /* SurfaceSize is in dwords, minus one. */
         sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1;
         sob.StreamOutputBufferOffsetAddress =
            rw_bo(brw_obj->offset_bo, i * sizeof(uint32_t));

         if (brw_obj->zero_offsets) {
            /* Zero out the offset and write that to offset_bo */
            sob.StreamOffset = 0;
         } else {
            /* Use offset_bo as the "Stream Offset." */
            sob.StreamOffset = 0xFFFFFFFF;
         }
#endif
      }
   }

#if GEN_GEN >= 8
   /* The offsets have now been (re)established in offset_bo. */
   brw_obj->zero_offsets = false;
#endif
}
3809
3810static bool
3811query_active(struct gl_query_object *q)
3812{
3813   return q && q->Active;
3814}
3815
/**
 * Emit 3DSTATE_STREAMOUT, enabling or disabling the SO function.
 *
 * When active, also programs rendering disable for rasterizer discard
 * (unless a GL_PRIMITIVES_GENERATED query needs the clipper), provoking
 * vertex reorder mode, per-buffer enables/pitches, and the per-stream
 * vertex URB read ranges.  When inactive, an all-zero packet turns
 * streamout off.
 */
static void
genX(upload_3dstate_streamout)(struct brw_context *brw, bool active,
                               const struct brw_vue_map *vue_map)
{
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   struct gl_transform_feedback_object *xfb_obj =
      ctx->TransformFeedback.CurrentObject;

   brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) {
      if (active) {
         int urb_entry_read_offset = 0;
         /* Read the whole VUE entry: two slots per 256-bit URB row. */
         int urb_entry_read_length = (vue_map->num_slots + 1) / 2 -
            urb_entry_read_offset;

         sos.SOFunctionEnable = true;
         sos.SOStatisticsEnable = true;

         /* BRW_NEW_RASTERIZER_DISCARD */
         if (ctx->RasterDiscard) {
            if (!query_active(ctx->Query.PrimitivesGenerated[0])) {
               sos.RenderingDisable = true;
            } else {
               perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED "
                          "query active relies on the clipper.\n");
            }
         }

         /* _NEW_LIGHT */
         if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION)
            sos.ReorderMode = TRAILING;

#if GEN_GEN < 8
         sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL;
         sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL;
         sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL;
         sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL;
#else
         const struct gl_transform_feedback_info *linked_xfb_info =
            xfb_obj->program->sh.LinkedTransformFeedback;
         /* Set buffer pitches; 0 means unbound. */
         if (xfb_obj->Buffers[0])
            sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4;
         if (xfb_obj->Buffers[1])
            sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4;
         if (xfb_obj->Buffers[2])
            sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4;
         if (xfb_obj->Buffers[3])
            sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4;
#endif

         /* We always read the whole vertex.  This could be reduced at some
          * point by reading less and offsetting the register index in the
          * SO_DECLs.
          */
         sos.Stream0VertexReadOffset = urb_entry_read_offset;
         sos.Stream0VertexReadLength = urb_entry_read_length - 1;
         sos.Stream1VertexReadOffset = urb_entry_read_offset;
         sos.Stream1VertexReadLength = urb_entry_read_length - 1;
         sos.Stream2VertexReadOffset = urb_entry_read_offset;
         sos.Stream2VertexReadLength = urb_entry_read_length - 1;
         sos.Stream3VertexReadOffset = urb_entry_read_offset;
         sos.Stream3VertexReadLength = urb_entry_read_length - 1;
      }
   }
}
3882
3883static void
3884genX(upload_sol)(struct brw_context *brw)
3885{
3886   struct gl_context *ctx = &brw->ctx;
3887   /* BRW_NEW_TRANSFORM_FEEDBACK */
3888   bool active = _mesa_is_xfb_active_and_unpaused(ctx);
3889
3890   if (active) {
3891      genX(upload_3dstate_so_buffers)(brw);
3892
3893      /* BRW_NEW_VUE_MAP_GEOM_OUT */
3894      genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
3895   }
3896
3897   /* Finally, set up the SOL stage.  This command must always follow updates to
3898    * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or
3899    * MMIO register updates (current performed by the kernel at each batch
3900    * emit).
3901    */
3902   genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
3903}
3904
/* Atom for streamout state (3DSTATE_SO_BUFFER / SO_DECL_LIST / STREAMOUT). */
static const struct brw_tracked_state genX(sol_state) = {
   .dirty = {
      .mesa  = _NEW_LIGHT,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_RASTERIZER_DISCARD |
               BRW_NEW_VUE_MAP_GEOM_OUT |
               BRW_NEW_TRANSFORM_FEEDBACK,
   },
   .emit = genX(upload_sol),
};
3916#endif
3917
3918/* ---------------------------------------------------------------------- */
3919
3920#if GEN_GEN >= 7
3921static void
3922genX(upload_ps)(struct brw_context *brw)
3923{
3924   UNUSED const struct gl_context *ctx = &brw->ctx;
3925   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;
3926
3927   /* BRW_NEW_FS_PROG_DATA */
3928   const struct brw_wm_prog_data *prog_data =
3929      brw_wm_prog_data(brw->wm.base.prog_data);
3930   const struct brw_stage_state *stage_state = &brw->wm.base;
3931
3932#if GEN_GEN < 8
3933#endif
3934
3935   brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
3936      /* Initialize the execution mask with VMask.  Otherwise, derivatives are
3937       * incorrect for subspans where some of the pixels are unlit.  We believe
3938       * the bit just didn't take effect in previous generations.
3939       */
3940      ps.VectorMaskEnable = GEN_GEN >= 8;
3941
3942      /* WA_1606682166:
3943       * "Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
3944       * Disable the Sampler state prefetch functionality in the SARB by
3945       * programming 0xB000[30] to '1'."
3946       */
3947      ps.SamplerCount = GEN_GEN == 11 ?
3948         0 : DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);
3949
3950      /* BRW_NEW_FS_PROG_DATA */
3951      /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to disable
3952       * prefetching of binding tables in A0 and B0 steppings.
3953       * TODO: Revisit this workaround on C0 stepping.
3954       */
3955      ps.BindingTableEntryCount = GEN_GEN == 11 ?
3956                                  0 :
3957                                  prog_data->base.binding_table.size_bytes / 4;
3958
3959      if (prog_data->base.use_alt_mode)
3960         ps.FloatingPointMode = Alternate;
3961
3962      /* Haswell requires the sample mask to be set in this packet as well as
3963       * in 3DSTATE_SAMPLE_MASK; the values should match.
3964       */
3965
3966      /* _NEW_BUFFERS, _NEW_MULTISAMPLE */
3967#if GEN_IS_HASWELL
3968      ps.SampleMask = genX(determine_sample_mask(brw));
3969#endif
3970
3971      /* 3DSTATE_PS expects the number of threads per PSD, which is always 64
3972       * for pre Gen11 and 128 for gen11+; On gen11+ If a programmed value is
3973       * k, it implies 2(k+1) threads. It implicitly scales for different GT
3974       * levels (which have some # of PSDs).
3975       *
3976       * In Gen8 the format is U8-2 whereas in Gen9+ it is U9-1.
3977       */
3978#if GEN_GEN >= 9
3979      ps.MaximumNumberofThreadsPerPSD = 64 - 1;
3980#elif GEN_GEN >= 8
3981      ps.MaximumNumberofThreadsPerPSD = 64 - 2;
3982#else
3983      ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
3984#endif
3985
3986      if (prog_data->base.nr_params > 0 ||
3987          prog_data->base.ubo_ranges[0].length > 0)
3988         ps.PushConstantEnable = true;
3989
3990#if GEN_GEN < 8
3991      /* From the IVB PRM, volume 2 part 1, page 287:
3992       * "This bit is inserted in the PS payload header and made available to
3993       * the DataPort (either via the message header or via header bypass) to
3994       * indicate that oMask data (one or two phases) is included in Render
3995       * Target Write messages. If present, the oMask data is used to mask off
3996       * samples."
3997       */
3998      ps.oMaskPresenttoRenderTarget = prog_data->uses_omask;
3999
4000      /* The hardware wedges if you have this bit set but don't turn on any
4001       * dual source blend factors.
4002       *
4003       * BRW_NEW_FS_PROG_DATA | _NEW_COLOR
4004       */
4005      ps.DualSourceBlendEnable = prog_data->dual_src_blend &&
4006                                 (ctx->Color.BlendEnabled & 1) &&
4007                                 ctx->Color.Blend[0]._UsesDualSrc;
4008
4009      /* BRW_NEW_FS_PROG_DATA */
4010      ps.AttributeEnable = (prog_data->num_varying_inputs != 0);
4011#endif
4012
4013      /* From the documentation for this packet:
4014       * "If the PS kernel does not need the Position XY Offsets to
4015       *  compute a Position Value, then this field should be programmed
4016       *  to POSOFFSET_NONE."
4017       *
4018       * "SW Recommendation: If the PS kernel needs the Position Offsets
4019       *  to compute a Position XY value, this field should match Position
4020       *  ZW Interpolation Mode to ensure a consistent position.xyzw
4021       *  computation."
4022       *
4023       * We only require XY sample offsets. So, this recommendation doesn't
4024       * look useful at the moment. We might need this in future.
4025       */
4026      if (prog_data->uses_pos_offset)
4027         ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE;
4028      else
4029         ps.PositionXYOffsetSelect = POSOFFSET_NONE;
4030
4031      ps._8PixelDispatchEnable = prog_data->dispatch_8;
4032      ps._16PixelDispatchEnable = prog_data->dispatch_16;
4033      ps._32PixelDispatchEnable = prog_data->dispatch_32;
4034
4035      /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable:
4036       *
4037       *    "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32
4038       *    Dispatch must not be enabled for PER_PIXEL dispatch mode."
4039       *
4040       * Since 16x MSAA is first introduced on SKL, we don't need to apply
4041       * the workaround on any older hardware.
4042       *
4043       * BRW_NEW_NUM_SAMPLES
4044       */
4045      if (GEN_GEN >= 9 && !prog_data->persample_dispatch &&
4046          brw->num_samples == 16) {
4047         assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable);
4048         ps._32PixelDispatchEnable = false;
4049      }
4050
4051      ps.DispatchGRFStartRegisterForConstantSetupData0 =
4052         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0);
4053      ps.DispatchGRFStartRegisterForConstantSetupData1 =
4054         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1);
4055      ps.DispatchGRFStartRegisterForConstantSetupData2 =
4056         brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2);
4057
4058      ps.KernelStartPointer0 = stage_state->prog_offset +
4059                               brw_wm_prog_data_prog_offset(prog_data, ps, 0);
4060      ps.KernelStartPointer1 = stage_state->prog_offset +
4061                               brw_wm_prog_data_prog_offset(prog_data, ps, 1);
4062      ps.KernelStartPointer2 = stage_state->prog_offset +
4063                               brw_wm_prog_data_prog_offset(prog_data, ps, 2);
4064
4065      if (prog_data->base.total_scratch) {
4066         ps.ScratchSpaceBasePointer =
4067            rw_32_bo(stage_state->scratch_bo,
4068                     ffs(stage_state->per_thread_scratch) - 11);
4069      }
4070   }
4071}
4072
/* Atom for 3DSTATE_PS.  Pre-Gen8 also depends on _NEW_BUFFERS/_NEW_COLOR
 * (dual-source blend, sample mask); Gen9+ needs BRW_NEW_NUM_SAMPLES for
 * the SIMD32 / 16x MSAA workaround.
 */
static const struct brw_tracked_state genX(ps_state) = {
   .dirty = {
      .mesa  = _NEW_MULTISAMPLE |
               (GEN_GEN < 8 ? _NEW_BUFFERS |
                              _NEW_COLOR
                            : 0),
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_FS_PROG_DATA |
               (GEN_GEN >= 9 ? BRW_NEW_NUM_SAMPLES : 0),
   },
   .emit = genX(upload_ps),
};
4086#endif
4087
4088/* ---------------------------------------------------------------------- */
4089
4090#if GEN_GEN >= 7
/**
 * Emit 3DSTATE_HS for the tessellation control (hull) shader, or an empty
 * packet to disable the stage when no TCS program data exists.
 */
static void
genX(upload_hs_state)(struct brw_context *brw)
{
   const struct gen_device_info *devinfo = &brw->screen->devinfo;
   struct brw_stage_state *stage_state = &brw->tcs.base;
   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
   /* NOTE(review): stage_state/stage_prog_data/vue_prog_data appear unused
    * here but are presumably referenced by name inside the
    * INIT_THREAD_DISPATCH_FIELDS macro below — confirm before removing.
    */
   const struct brw_vue_prog_data *vue_prog_data =
      brw_vue_prog_data(stage_prog_data);

   /* BRW_NEW_TES_PROG_DATA */
   struct brw_tcs_prog_data *tcs_prog_data =
      brw_tcs_prog_data(stage_prog_data);

   if (!tcs_prog_data) {
      /* Tessellation disabled: emit an all-zero 3DSTATE_HS. */
      brw_batch_emit(brw, GENX(3DSTATE_HS), hs);
   } else {
      brw_batch_emit(brw, GENX(3DSTATE_HS), hs) {
         INIT_THREAD_DISPATCH_FIELDS(hs, Vertex);

         hs.InstanceCount = tcs_prog_data->instances - 1;
         hs.IncludeVertexHandles = true;

         hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
      }
   }
}
4117
4118static const struct brw_tracked_state genX(hs_state) = {
4119   .dirty = {
4120      .mesa  = 0,
4121      .brw   = BRW_NEW_BATCH |
4122               BRW_NEW_BLORP |
4123               BRW_NEW_TCS_PROG_DATA |
4124               BRW_NEW_TESS_PROGRAMS,
4125   },
4126   .emit = genX(upload_hs_state),
4127};
4128
4129static void
4130genX(upload_ds_state)(struct brw_context *brw)
4131{
4132   const struct gen_device_info *devinfo = &brw->screen->devinfo;
4133   const struct brw_stage_state *stage_state = &brw->tes.base;
4134   struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data;
4135
4136   /* BRW_NEW_TES_PROG_DATA */
4137   const struct brw_tes_prog_data *tes_prog_data =
4138      brw_tes_prog_data(stage_prog_data);
4139   const struct brw_vue_prog_data *vue_prog_data =
4140      brw_vue_prog_data(stage_prog_data);
4141
4142   if (!tes_prog_data) {
4143      brw_batch_emit(brw, GENX(3DSTATE_DS), ds);
4144   } else {
4145      assert(GEN_GEN < 11 ||
4146             vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8);
4147
4148      brw_batch_emit(brw, GENX(3DSTATE_DS), ds) {
4149         INIT_THREAD_DISPATCH_FIELDS(ds, Patch);
4150
4151        ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;
4152        ds.ComputeWCoordinateEnable =
4153           tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;
4154
4155#if GEN_GEN >= 8
4156        if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8)
4157           ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
4158        ds.UserClipDistanceCullTestEnableBitmask =
4159            vue_prog_data->cull_distance_mask;
4160#endif
4161      }
4162   }
4163}
4164
4165static const struct brw_tracked_state genX(ds_state) = {
4166   .dirty = {
4167      .mesa  = 0,
4168      .brw   = BRW_NEW_BATCH |
4169               BRW_NEW_BLORP |
4170               BRW_NEW_TESS_PROGRAMS |
4171               BRW_NEW_TES_PROG_DATA,
4172   },
4173   .emit = genX(upload_ds_state),
4174};
4175
4176/* ---------------------------------------------------------------------- */
4177
4178static void
4179upload_te_state(struct brw_context *brw)
4180{
4181   /* BRW_NEW_TESS_PROGRAMS */
4182   bool active = brw->programs[MESA_SHADER_TESS_EVAL];
4183
4184   /* BRW_NEW_TES_PROG_DATA */
4185   const struct brw_tes_prog_data *tes_prog_data =
4186      brw_tes_prog_data(brw->tes.base.prog_data);
4187
4188   if (active) {
4189      brw_batch_emit(brw, GENX(3DSTATE_TE), te) {
4190         te.Partitioning = tes_prog_data->partitioning;
4191         te.OutputTopology = tes_prog_data->output_topology;
4192         te.TEDomain = tes_prog_data->domain;
4193         te.TEEnable = true;
4194         te.MaximumTessellationFactorOdd = 63.0;
4195         te.MaximumTessellationFactorNotOdd = 64.0;
4196      }
4197   } else {
4198      brw_batch_emit(brw, GENX(3DSTATE_TE), te);
4199   }
4200}
4201
4202static const struct brw_tracked_state genX(te_state) = {
4203   .dirty = {
4204      .mesa  = 0,
4205      .brw   = BRW_NEW_BLORP |
4206               BRW_NEW_CONTEXT |
4207               BRW_NEW_TES_PROG_DATA |
4208               BRW_NEW_TESS_PROGRAMS,
4209   },
4210   .emit = upload_te_state,
4211};
4212
4213/* ---------------------------------------------------------------------- */
4214
4215static void
4216genX(upload_tes_push_constants)(struct brw_context *brw)
4217{
4218   struct brw_stage_state *stage_state = &brw->tes.base;
4219   /* BRW_NEW_TESS_PROGRAMS */
4220   const struct gl_program *tep = brw->programs[MESA_SHADER_TESS_EVAL];
4221
4222   /* BRW_NEW_TES_PROG_DATA */
4223   const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data;
4224   gen6_upload_push_constants(brw, tep, prog_data, stage_state);
4225}
4226
4227static const struct brw_tracked_state genX(tes_push_constants) = {
4228   .dirty = {
4229      .mesa  = _NEW_PROGRAM_CONSTANTS,
4230      .brw   = BRW_NEW_BATCH |
4231               BRW_NEW_BLORP |
4232               BRW_NEW_TESS_PROGRAMS |
4233               BRW_NEW_TES_PROG_DATA,
4234   },
4235   .emit = genX(upload_tes_push_constants),
4236};
4237
4238static void
4239genX(upload_tcs_push_constants)(struct brw_context *brw)
4240{
4241   struct brw_stage_state *stage_state = &brw->tcs.base;
4242   /* BRW_NEW_TESS_PROGRAMS */
4243   const struct gl_program *tcp = brw->programs[MESA_SHADER_TESS_CTRL];
4244
4245   /* BRW_NEW_TCS_PROG_DATA */
4246   const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data;
4247
4248   gen6_upload_push_constants(brw, tcp, prog_data, stage_state);
4249}
4250
4251static const struct brw_tracked_state genX(tcs_push_constants) = {
4252   .dirty = {
4253      .mesa  = _NEW_PROGRAM_CONSTANTS,
4254      .brw   = BRW_NEW_BATCH |
4255               BRW_NEW_BLORP |
4256               BRW_NEW_DEFAULT_TESS_LEVELS |
4257               BRW_NEW_TESS_PROGRAMS |
4258               BRW_NEW_TCS_PROG_DATA,
4259   },
4260   .emit = genX(upload_tcs_push_constants),
4261};
4262
4263#endif
4264
4265/* ---------------------------------------------------------------------- */
4266
4267#if GEN_GEN >= 7
4268static void
4269genX(upload_cs_push_constants)(struct brw_context *brw)
4270{
4271   struct brw_stage_state *stage_state = &brw->cs.base;
4272
4273   /* BRW_NEW_COMPUTE_PROGRAM */
4274   const struct gl_program *cp = brw->programs[MESA_SHADER_COMPUTE];
4275
4276   if (cp) {
4277      /* BRW_NEW_CS_PROG_DATA */
4278      struct brw_cs_prog_data *cs_prog_data =
4279         brw_cs_prog_data(brw->cs.base.prog_data);
4280
4281      _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4282      brw_upload_cs_push_constants(brw, cp, cs_prog_data, stage_state);
4283   }
4284}
4285
4286const struct brw_tracked_state genX(cs_push_constants) = {
4287   .dirty = {
4288      .mesa = _NEW_PROGRAM_CONSTANTS,
4289      .brw = BRW_NEW_BATCH |
4290             BRW_NEW_BLORP |
4291             BRW_NEW_COMPUTE_PROGRAM |
4292             BRW_NEW_CS_PROG_DATA,
4293   },
4294   .emit = genX(upload_cs_push_constants),
4295};
4296
4297/**
4298 * Creates a new CS constant buffer reflecting the current CS program's
4299 * constants, if needed by the CS program.
4300 */
4301static void
4302genX(upload_cs_pull_constants)(struct brw_context *brw)
4303{
4304   struct brw_stage_state *stage_state = &brw->cs.base;
4305
4306   /* BRW_NEW_COMPUTE_PROGRAM */
4307   struct brw_program *cp =
4308      (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE];
4309
4310   /* BRW_NEW_CS_PROG_DATA */
4311   const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data;
4312
4313   _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE);
4314   /* _NEW_PROGRAM_CONSTANTS */
4315   brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program,
4316                             stage_state, prog_data);
4317}
4318
4319const struct brw_tracked_state genX(cs_pull_constants) = {
4320   .dirty = {
4321      .mesa = _NEW_PROGRAM_CONSTANTS,
4322      .brw = BRW_NEW_BATCH |
4323             BRW_NEW_BLORP |
4324             BRW_NEW_COMPUTE_PROGRAM |
4325             BRW_NEW_CS_PROG_DATA,
4326   },
4327   .emit = genX(upload_cs_pull_constants),
4328};
4329
/**
 * Emit the state needed to dispatch compute work: MEDIA_VFE_STATE,
 * push constants (MEDIA_CURBE_LOAD), the binding table, and the
 * INTERFACE_DESCRIPTOR_DATA referenced by
 * MEDIA_INTERFACE_DESCRIPTOR_LOAD.
 */
static void
genX(upload_cs_state)(struct brw_context *brw)
{
   if (!brw->cs.base.prog_data)
      return;

   /* Carve descriptor space out of the batch's state area (64B aligned). */
   uint32_t offset;
   uint32_t *desc = (uint32_t*) brw_state_batch(
      brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64,
      &offset);

   struct brw_stage_state *stage_state = &brw->cs.base;
   struct brw_stage_prog_data *prog_data = stage_state->prog_data;
   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
   const struct gen_device_info *devinfo = &brw->screen->devinfo;

   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      /* Writable RAW surface the shader uses to record timing data. */
      brw_emit_buffer_surface_state(
         brw, &stage_state->surf_offset[
                 prog_data->binding_table.shader_time_start],
         brw->shader_time.bo, 0, ISL_FORMAT_RAW,
         brw->shader_time.bo->size, 1,
         RELOC_WRITE);
   }

   /* Binding table lives in batch state space; filled in near the end. */
   uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes,
                                    32, &stage_state->bind_bo_offset);

   /* The MEDIA_VFE_STATE documentation for Gen8+ says:
    *
    * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless
    *  the only bits that are changed are scoreboard related: Scoreboard
    *  Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For
    *  these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient."
    *
    * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL",
    * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL.
    */
   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL);

   brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) {
      if (prog_data->total_scratch) {
         uint32_t per_thread_scratch_value;

         /* Per-gen encodings of the per-thread scratch size field. */
         if (GEN_GEN >= 8) {
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 11;
         } else if (GEN_IS_HASWELL) {
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 12;
         } else {
            /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1;
         }
         vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0);
         vfe.PerThreadScratchSpace = per_thread_scratch_value;
      }

      /* If brw->screen->subslice_total is greater than one, then
       * devinfo->max_cs_threads stores number of threads per sub-slice;
       * thus we need to multiply by that number by subslices to get
       * the actual maximum number of threads; the -1 is because the HW
       * has a bias of 1 (would not make sense to say the maximum number
       * of threads is 0).
       */
      const uint32_t subslices = MAX2(brw->screen->subslice_total, 1);
      vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1;
      vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0;
#if GEN_GEN < 11
      vfe.ResetGatewayTimer =
         Resettingrelativetimerandlatchingtheglobaltimestamp;
#endif
#if GEN_GEN < 9
      vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol;
#endif
#if GEN_GEN == 7
      vfe.GPGPUMode = 1;
#endif

      /* We are uploading duplicated copies of push constant uniforms for each
       * thread. Although the local id data needs to vary per thread, it won't
       * change for other uniform data. Unfortunately this duplication is
       * required for gen7. As of Haswell, this duplication can be avoided,
       * but this older mechanism with duplicated data continues to work.
       *
       * FINISHME: As of Haswell, we could make use of the
       * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length"
       * field to only store one copy of uniform data.
       *
       * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage"
       * which is described in the GPGPU_WALKER command and in the Broadwell
       * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of
       * Operations => GPGPU Mode => Indirect Payload Storage.
       *
       * Note: The constant data is built in brw_upload_cs_push_constants
       * below.
       */
      vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 2 : 0;

      /* CURBE space: per-thread regs times thread count, plus the shared
       * cross-thread regs, aligned to an even number of registers.
       */
      const uint32_t vfe_curbe_allocation =
         ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
               cs_prog_data->push.cross_thread.regs, 2);
      vfe.CURBEAllocationSize = vfe_curbe_allocation;
   }

   /* Upload the push constants themselves, if the program uses any. */
   if (cs_prog_data->push.total.size > 0) {
      brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) {
         curbe.CURBETotalDataLength =
            ALIGN(cs_prog_data->push.total.size, 64);
         curbe.CURBEDataStartAddress = stage_state->push_const_offset;
      }
   }

   /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */
   memcpy(bind, stage_state->surf_offset,
          prog_data->binding_table.size_bytes);
   const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = {
      .KernelStartPointer = brw->cs.base.prog_offset,
      .SamplerStatePointer = stage_state->sampler_offset,
      .SamplerCount = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4),
      .BindingTablePointer = stage_state->bind_bo_offset,
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
      .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
      .SharedLocalMemorySize = encode_slm_size(GEN_GEN,
                                               prog_data->total_shared),
      .BarrierEnable = cs_prog_data->uses_barrier,
#if GEN_GEN >= 8 || GEN_IS_HASWELL
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
#endif
   };

   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd);

   brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) {
      load.InterfaceDescriptorTotalLength =
         GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t);
      load.InterfaceDescriptorDataStartAddress = offset;
   }
}
4476
4477static const struct brw_tracked_state genX(cs_state) = {
4478   .dirty = {
4479      .mesa = _NEW_PROGRAM_CONSTANTS,
4480      .brw = BRW_NEW_BATCH |
4481             BRW_NEW_BLORP |
4482             BRW_NEW_CS_PROG_DATA |
4483             BRW_NEW_SAMPLER_STATE_TABLE |
4484             BRW_NEW_SURFACES,
4485   },
4486   .emit = genX(upload_cs_state)
4487};
4488
/* MMIO registers read by GPGPU_WALKER for indirect dispatch dimensions. */
#define GPGPU_DISPATCHDIMX 0x2500
#define GPGPU_DISPATCHDIMY 0x2504
#define GPGPU_DISPATCHDIMZ 0x2508

/* MI_PREDICATE source operand registers (64-bit each). */
#define MI_PREDICATE_SRC0  0x2400
#define MI_PREDICATE_SRC1  0x2408
4495
/**
 * Set up registers for an indirect compute dispatch.
 *
 * Loads the X/Y/Z group counts from the indirect buffer into the
 * GPGPU_DISPATCHDIM* registers.  On Gen7 and earlier it additionally
 * builds an MI_PREDICATE result that is true only when all three
 * dimensions are non-zero, so the subsequent GPGPU_WALKER (emitted with
 * PredicateEnable on those gens) is skipped for degenerate dispatches.
 */
static void
prepare_indirect_gpgpu_walker(struct brw_context *brw)
{
   GLintptr indirect_offset = brw->compute.num_work_groups_offset;
   struct brw_bo *bo = brw->compute.num_work_groups_bo;

   emit_lrm(brw, GPGPU_DISPATCHDIMX, ro_bo(bo, indirect_offset + 0));
   emit_lrm(brw, GPGPU_DISPATCHDIMY, ro_bo(bo, indirect_offset + 4));
   emit_lrm(brw, GPGPU_DISPATCHDIMZ, ro_bo(bo, indirect_offset + 8));

#if GEN_GEN <= 7
   /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */
   emit_lri(brw, MI_PREDICATE_SRC0 + 4, 0);
   emit_lri(brw, MI_PREDICATE_SRC1    , 0);
   emit_lri(brw, MI_PREDICATE_SRC1 + 4, 0);

   /* Load compute_dispatch_indirect_x_size into SRC0 */
   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 0));

   /* predicate = (compute_dispatch_indirect_x_size == 0); */
   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_SET;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* Load compute_dispatch_indirect_y_size into SRC0 */
   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 4));

   /* predicate |= (compute_dispatch_indirect_y_size == 0); */
   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* Load compute_dispatch_indirect_z_size into SRC0 */
   emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 8));

   /* predicate |= (compute_dispatch_indirect_z_size == 0); */
   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOAD;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_SRCS_EQUAL;
   }

   /* predicate = !predicate; */
#define COMPARE_FALSE                           1
   brw_batch_emit(brw, GENX(MI_PREDICATE), mip) {
      mip.LoadOperation    = LOAD_LOADINV;
      mip.CombineOperation = COMBINE_OR;
      mip.CompareOperation = COMPARE_FALSE;
   }
#endif
}
4551
/**
 * Emit GPGPU_WALKER to dispatch the current compute grid, followed by a
 * MEDIA_STATE_FLUSH.  Handles both direct (num_work_groups) and indirect
 * (num_work_groups_bo) dispatch.
 */
static void
genX(emit_gpgpu_walker)(struct brw_context *brw)
{
   const struct brw_cs_prog_data *prog_data =
      brw_cs_prog_data(brw->cs.base.prog_data);

   const GLuint *num_groups = brw->compute.num_work_groups;

   bool indirect = brw->compute.num_work_groups_bo != NULL;
   if (indirect)
      prepare_indirect_gpgpu_walker(brw);

   const unsigned simd_size = prog_data->simd_size;
   unsigned group_size = prog_data->local_size[0] *
      prog_data->local_size[1] * prog_data->local_size[2];

   /* Start with all simd_size lanes enabled; if the workgroup size isn't a
    * multiple of the SIMD width, shrink the mask so the last thread of each
    * group only executes the remaining invocations.
    */
   uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
   const unsigned right_non_aligned = group_size & (simd_size - 1);
   if (right_non_aligned != 0)
      right_mask >>= (simd_size - right_non_aligned);

   brw_batch_emit(brw, GENX(GPGPU_WALKER), ggw) {
      ggw.IndirectParameterEnable      = indirect;
      /* The predicate from prepare_indirect_gpgpu_walker() only exists on
       * Gen7 and earlier.
       */
      ggw.PredicateEnable              = GEN_GEN <= 7 && indirect;
      /* simd_size / 16 maps 8 -> 0, 16 -> 1, 32 -> 2. */
      ggw.SIMDSize                     = prog_data->simd_size / 16;
      ggw.ThreadDepthCounterMaximum    = 0;
      ggw.ThreadHeightCounterMaximum   = 0;
      ggw.ThreadWidthCounterMaximum    = prog_data->threads - 1;
      ggw.ThreadGroupIDXDimension      = num_groups[0];
      ggw.ThreadGroupIDYDimension      = num_groups[1];
      ggw.ThreadGroupIDZDimension      = num_groups[2];
      ggw.RightExecutionMask           = right_mask;
      ggw.BottomExecutionMask          = 0xffffffff;
   }

   brw_batch_emit(brw, GENX(MEDIA_STATE_FLUSH), msf);
}
4589
4590#endif
4591
4592/* ---------------------------------------------------------------------- */
4593
4594#if GEN_GEN >= 8
4595static void
4596genX(upload_raster)(struct brw_context *brw)
4597{
4598   const struct gl_context *ctx = &brw->ctx;
4599
4600   /* _NEW_BUFFERS */
4601   const bool flip_y = ctx->DrawBuffer->FlipY;
4602
4603   /* _NEW_POLYGON */
4604   const struct gl_polygon_attrib *polygon = &ctx->Polygon;
4605
4606   /* _NEW_POINT */
4607   const struct gl_point_attrib *point = &ctx->Point;
4608
4609   brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) {
4610      if (brw->polygon_front_bit != flip_y)
4611         raster.FrontWinding = CounterClockwise;
4612
4613      if (polygon->CullFlag) {
4614         switch (polygon->CullFaceMode) {
4615         case GL_FRONT:
4616            raster.CullMode = CULLMODE_FRONT;
4617            break;
4618         case GL_BACK:
4619            raster.CullMode = CULLMODE_BACK;
4620            break;
4621         case GL_FRONT_AND_BACK:
4622            raster.CullMode = CULLMODE_BOTH;
4623            break;
4624         default:
4625            unreachable("not reached");
4626         }
4627      } else {
4628         raster.CullMode = CULLMODE_NONE;
4629      }
4630
4631      raster.SmoothPointEnable = point->SmoothFlag;
4632
4633      raster.DXMultisampleRasterizationEnable =
4634         _mesa_is_multisample_enabled(ctx);
4635
4636      raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill;
4637      raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine;
4638      raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint;
4639
4640      switch (polygon->FrontMode) {
4641      case GL_FILL:
4642         raster.FrontFaceFillMode = FILL_MODE_SOLID;
4643         break;
4644      case GL_LINE:
4645         raster.FrontFaceFillMode = FILL_MODE_WIREFRAME;
4646         break;
4647      case GL_POINT:
4648         raster.FrontFaceFillMode = FILL_MODE_POINT;
4649         break;
4650      default:
4651         unreachable("not reached");
4652      }
4653
4654      switch (polygon->BackMode) {
4655      case GL_FILL:
4656         raster.BackFaceFillMode = FILL_MODE_SOLID;
4657         break;
4658      case GL_LINE:
4659         raster.BackFaceFillMode = FILL_MODE_WIREFRAME;
4660         break;
4661      case GL_POINT:
4662         raster.BackFaceFillMode = FILL_MODE_POINT;
4663         break;
4664      default:
4665         unreachable("not reached");
4666      }
4667
4668      /* _NEW_LINE */
4669      raster.AntialiasingEnable = ctx->Line.SmoothFlag;
4670
4671#if GEN_GEN == 10
4672      /* _NEW_BUFFERS
4673       * Antialiasing Enable bit MUST not be set when NUM_MULTISAMPLES > 1.
4674       */
4675      const bool multisampled_fbo =
4676         _mesa_geometric_samples(ctx->DrawBuffer) > 1;
4677      if (multisampled_fbo)
4678         raster.AntialiasingEnable = false;
4679#endif
4680
4681      /* _NEW_SCISSOR */
4682      raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags;
4683
4684      /* _NEW_TRANSFORM */
4685#if GEN_GEN < 9
4686      if (!(ctx->Transform.DepthClampNear &&
4687            ctx->Transform.DepthClampFar))
4688         raster.ViewportZClipTestEnable = true;
4689#endif
4690
4691#if GEN_GEN >= 9
4692      if (!ctx->Transform.DepthClampNear)
4693         raster.ViewportZNearClipTestEnable = true;
4694
4695      if (!ctx->Transform.DepthClampFar)
4696         raster.ViewportZFarClipTestEnable = true;
4697#endif
4698
4699      /* BRW_NEW_CONSERVATIVE_RASTERIZATION */
4700#if GEN_GEN >= 9
4701      raster.ConservativeRasterizationEnable =
4702         ctx->IntelConservativeRasterization;
4703#endif
4704
4705      raster.GlobalDepthOffsetClamp = polygon->OffsetClamp;
4706      raster.GlobalDepthOffsetScale = polygon->OffsetFactor;
4707
4708      raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2;
4709   }
4710}
4711
4712static const struct brw_tracked_state genX(raster_state) = {
4713   .dirty = {
4714      .mesa  = _NEW_BUFFERS |
4715               _NEW_LINE |
4716               _NEW_MULTISAMPLE |
4717               _NEW_POINT |
4718               _NEW_POLYGON |
4719               _NEW_SCISSOR |
4720               _NEW_TRANSFORM,
4721      .brw   = BRW_NEW_BLORP |
4722               BRW_NEW_CONTEXT |
4723               BRW_NEW_CONSERVATIVE_RASTERIZATION,
4724   },
4725   .emit = genX(upload_raster),
4726};
4727#endif
4728
4729/* ---------------------------------------------------------------------- */
4730
4731#if GEN_GEN >= 8
/**
 * Emit 3DSTATE_PS_EXTRA from the compiled fragment shader's prog_data
 * (Gen8+).
 */
static void
genX(upload_ps_extra)(struct brw_context *brw)
{
   UNUSED struct gl_context *ctx = &brw->ctx;

   const struct brw_wm_prog_data *prog_data =
      brw_wm_prog_data(brw->wm.base.prog_data);

   brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) {
      psx.PixelShaderValid = true;
      psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode;
      psx.PixelShaderKillsPixel = prog_data->uses_kill;
      psx.AttributeEnable = prog_data->num_varying_inputs != 0;
      psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth;
      psx.PixelShaderUsesSourceW = prog_data->uses_src_w;
      psx.PixelShaderIsPerSample = prog_data->persample_dispatch;

      /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */
      if (prog_data->uses_sample_mask) {
#if GEN_GEN >= 9
         /* Gen9+ picks the coverage-mask flavor: post-depth coverage takes
          * priority, then inner-conservative when conservative raster is on.
          */
         if (prog_data->post_depth_coverage)
            psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
         else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization)
            psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
         else
            psx.InputCoverageMaskState = ICMS_NORMAL;
#else
         psx.PixelShaderUsesInputCoverageMask = true;
#endif
      }

      psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
#if GEN_GEN >= 9
      psx.PixelShaderPullsBary = prog_data->pulls_bary;
      psx.PixelShaderComputesStencil = prog_data->computed_stencil;
#endif

      /* The stricter cross-primitive coherency guarantees that the hardware
       * gives us with the "Accesses UAV" bit set for at least one shader stage
       * and the "UAV coherency required" bit set on the 3DPRIMITIVE command
       * are redundant within the current image, atomic counter and SSBO GL
       * APIs, which all have very loose ordering and coherency requirements
       * and generally rely on the application to insert explicit barriers when
       * a shader invocation is expected to see the memory writes performed by
       * the invocations of some previous primitive.  Regardless of the value
       * of "UAV coherency required", the "Accesses UAV" bits will implicitly
       * cause an in most cases useless DC flush when the lowermost stage with
       * the bit set finishes execution.
       *
       * It would be nice to disable it, but in some cases we can't because on
       * Gen8+ it also has an influence on rasterization via the PS UAV-only
       * signal (which could be set independently from the coherency mechanism
       * in the 3DSTATE_WM command on Gen7), and because in some cases it will
       * determine whether the hardware skips execution of the fragment shader
       * or not via the ThreadDispatchEnable signal.  However if we know that
       * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
       * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
       * difference so we may just disable it here.
       *
       * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
       * take into account KillPixels when no depth or stencil writes are
       * enabled.  In order for occlusion queries to work correctly with no
       * attachments, we need to force-enable here.
       *
       * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS |
       * _NEW_COLOR
       */
      if ((prog_data->has_side_effects || prog_data->uses_kill) &&
          !brw_color_buffer_write_enabled(brw))
         psx.PixelShaderHasUAV = true;
   }
}
4804
4805const struct brw_tracked_state genX(ps_extra) = {
4806   .dirty = {
4807      .mesa  = _NEW_BUFFERS | _NEW_COLOR,
4808      .brw   = BRW_NEW_BLORP |
4809               BRW_NEW_CONTEXT |
4810               BRW_NEW_FRAGMENT_PROGRAM |
4811               BRW_NEW_FS_PROG_DATA |
4812               BRW_NEW_CONSERVATIVE_RASTERIZATION,
4813   },
4814   .emit = genX(upload_ps_extra),
4815};
4816#endif
4817
4818/* ---------------------------------------------------------------------- */
4819
4820#if GEN_GEN >= 8
/**
 * Emit 3DSTATE_PS_BLEND (Gen8+), the blend controls derived from color
 * buffer 0's GL blend state, with workarounds for min/max equations,
 * missing alpha channels, alpha-to-one with dual-source blending, and
 * dual-source factors without a dual-source shader.
 */
static void
genX(upload_ps_blend)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   /* _NEW_BUFFERS */
   struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
   const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1;

   /* _NEW_COLOR */
   struct gl_colorbuffer_attrib *color = &ctx->Color;

   brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) {
      /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */
      pb.HasWriteableRT = brw_color_buffer_write_enabled(brw);

      bool alpha_to_one = false;

      if (!buffer0_is_integer) {
         /* _NEW_MULTISAMPLE */

         if (_mesa_is_multisample_enabled(ctx)) {
            pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage;
            alpha_to_one = ctx->Multisample.SampleAlphaToOne;
         }

         pb.AlphaTestEnable = color->AlphaEnabled;
      }

      /* Used for implementing the following bit of GL_EXT_texture_integer:
       * "Per-fragment operations that require floating-point color
       *  components, including multisample alpha operations, alpha test,
       *  blending, and dithering, have no effect when the corresponding
       *  colors are written to an integer color buffer."
       *
       * The OpenGL specification 3.3 (page 196), section 4.1.3 says:
       * "If drawbuffer zero is not NONE and the buffer it references has an
       *  integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE
       *  operations are skipped."
       */
      if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) {
         GLenum eqRGB = color->Blend[0].EquationRGB;
         GLenum eqA = color->Blend[0].EquationA;
         GLenum srcRGB = color->Blend[0].SrcRGB;
         GLenum dstRGB = color->Blend[0].DstRGB;
         GLenum srcA = color->Blend[0].SrcA;
         GLenum dstA = color->Blend[0].DstA;

         /* GL's MIN/MAX equations ignore the blend factors, so force ONE. */
         if (eqRGB == GL_MIN || eqRGB == GL_MAX)
            srcRGB = dstRGB = GL_ONE;

         if (eqA == GL_MIN || eqA == GL_MAX)
            srcA = dstA = GL_ONE;

         /* Due to hardware limitations, the destination may have information
          * in an alpha channel even when the format specifies no alpha
          * channel. In order to avoid getting any incorrect blending due to
          * that alpha channel, coerce the blend factors to values that will
          * not read the alpha channel, but will instead use the correct
          * implicit value for alpha.
          */
         if (!_mesa_base_format_has_channel(rb->_BaseFormat,
                                            GL_TEXTURE_ALPHA_TYPE)) {
            srcRGB = brw_fix_xRGB_alpha(srcRGB);
            srcA = brw_fix_xRGB_alpha(srcA);
            dstRGB = brw_fix_xRGB_alpha(dstRGB);
            dstA = brw_fix_xRGB_alpha(dstA);
         }

         /* Alpha to One doesn't work with Dual Color Blending.  Override
          * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO.
          */
         if (alpha_to_one && color->Blend[0]._UsesDualSrc) {
            srcRGB = fix_dual_blend_alpha_to_one(srcRGB);
            srcA = fix_dual_blend_alpha_to_one(srcA);
            dstRGB = fix_dual_blend_alpha_to_one(dstRGB);
            dstA = fix_dual_blend_alpha_to_one(dstA);
         }

         /* BRW_NEW_FS_PROG_DATA */
         const struct brw_wm_prog_data *wm_prog_data =
            brw_wm_prog_data(brw->wm.base.prog_data);

         /* The Dual Source Blending documentation says:
          *
          * "If SRC1 is included in a src/dst blend factor and
          * a DualSource RT Write message is not used, results
          * are UNDEFINED. (This reflects the same restriction in DX APIs,
          * where undefined results are produced if “o1” is not written
          * by a PS – there are no default values defined).
          * If SRC1 is not included in a src/dst blend factor,
          * dual source blending must be disabled."
          *
          * There is no way to gracefully fix this undefined situation
          * so we just disable the blending to prevent possible issues.
          */
         pb.ColorBufferBlendEnable =
            !color->Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend;
         pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA);
         pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA);
         pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB);
         pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB);

         /* Separate alpha blending is needed whenever the alpha factors or
          * equation differ from the RGB ones.
          */
         pb.IndependentAlphaBlendEnable =
            srcA != srcRGB || dstA != dstRGB || eqA != eqRGB;
      }
   }
}
4929
/* Atom that re-emits the pixel-shader blend state via genX(upload_ps_blend).
 * Depends on BRW_NEW_FS_PROG_DATA because the emitted packet disables
 * blending when a dual-source blend factor is used without a dual-source
 * RT write message (see genX(upload_ps_blend) above in this file).
 */
static const struct brw_tracked_state genX(ps_blend) = {
   .dirty = {
      .mesa = _NEW_BUFFERS |
              _NEW_COLOR |
              _NEW_MULTISAMPLE,
      .brw = BRW_NEW_BLORP |
             BRW_NEW_CONTEXT |
             BRW_NEW_FRAGMENT_PROGRAM |
             BRW_NEW_FS_PROG_DATA,
   },
   .emit = genX(upload_ps_blend)
};
4942#endif
4943
4944/* ---------------------------------------------------------------------- */
4945
4946#if GEN_GEN >= 8
4947static void
4948genX(emit_vf_topology)(struct brw_context *brw)
4949{
4950   brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) {
4951      vftopo.PrimitiveTopologyType = brw->primitive;
4952   }
4953}
4954
/* Atom that re-emits 3DSTATE_VF_TOPOLOGY when the primitive type changes
 * (BRW_NEW_PRIMITIVE) or after BLORP has clobbered hardware state.
 */
static const struct brw_tracked_state genX(vf_topology) = {
   .dirty = {
      .mesa = 0,
      .brw = BRW_NEW_BLORP |
             BRW_NEW_PRIMITIVE,
   },
   .emit = genX(emit_vf_topology),
};
4963#endif
4964
4965/* ---------------------------------------------------------------------- */
4966
4967#if GEN_GEN >= 7
4968static void
4969genX(emit_mi_report_perf_count)(struct brw_context *brw,
4970                                struct brw_bo *bo,
4971                                uint32_t offset_in_bytes,
4972                                uint32_t report_id)
4973{
4974   brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) {
4975      mi_rpc.MemoryAddress = ggtt_bo(bo, offset_in_bytes);
4976      mi_rpc.ReportID = report_id;
4977   }
4978}
4979#endif
4980
4981/* ---------------------------------------------------------------------- */
4982
4983/**
4984 * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet.
4985 */
4986static void
4987genX(emit_sampler_state_pointers_xs)(MAYBE_UNUSED struct brw_context *brw,
4988                                     MAYBE_UNUSED struct brw_stage_state *stage_state)
4989{
4990#if GEN_GEN >= 7
4991   static const uint16_t packet_headers[] = {
4992      [MESA_SHADER_VERTEX] = 43,
4993      [MESA_SHADER_TESS_CTRL] = 44,
4994      [MESA_SHADER_TESS_EVAL] = 45,
4995      [MESA_SHADER_GEOMETRY] = 46,
4996      [MESA_SHADER_FRAGMENT] = 47,
4997   };
4998
4999   /* Ivybridge requires a workaround flush before VS packets. */
5000   if (GEN_GEN == 7 && !GEN_IS_HASWELL &&
5001       stage_state->stage == MESA_SHADER_VERTEX) {
5002      gen7_emit_vs_workaround_flush(brw);
5003   }
5004
5005   brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) {
5006      ptr._3DCommandSubOpcode = packet_headers[stage_state->stage];
5007      ptr.PointertoVSSamplerState = stage_state->sampler_offset;
5008   }
5009#endif
5010}
5011
5012UNUSED static bool
5013has_component(mesa_format format, int i)
5014{
5015   if (_mesa_is_format_color_format(format))
5016      return _mesa_format_has_color_component(format, i);
5017
5018   /* depth and stencil have only one component */
5019   return i == 0;
5020}
5021
5022/**
5023 * Upload SAMPLER_BORDER_COLOR_STATE.
5024 */
5025static void
5026genX(upload_default_color)(struct brw_context *brw,
5027                           const struct gl_sampler_object *sampler,
5028                           MAYBE_UNUSED mesa_format format, GLenum base_format,
5029                           bool is_integer_format, bool is_stencil_sampling,
5030                           uint32_t *sdc_offset)
5031{
5032   union gl_color_union color;
5033
5034   switch (base_format) {
5035   case GL_DEPTH_COMPONENT:
5036      /* GL specs that border color for depth textures is taken from the
5037       * R channel, while the hardware uses A.  Spam R into all the
5038       * channels for safety.
5039       */
5040      color.ui[0] = sampler->BorderColor.ui[0];
5041      color.ui[1] = sampler->BorderColor.ui[0];
5042      color.ui[2] = sampler->BorderColor.ui[0];
5043      color.ui[3] = sampler->BorderColor.ui[0];
5044      break;
5045   case GL_ALPHA:
5046      color.ui[0] = 0u;
5047      color.ui[1] = 0u;
5048      color.ui[2] = 0u;
5049      color.ui[3] = sampler->BorderColor.ui[3];
5050      break;
5051   case GL_INTENSITY:
5052      color.ui[0] = sampler->BorderColor.ui[0];
5053      color.ui[1] = sampler->BorderColor.ui[0];
5054      color.ui[2] = sampler->BorderColor.ui[0];
5055      color.ui[3] = sampler->BorderColor.ui[0];
5056      break;
5057   case GL_LUMINANCE:
5058      color.ui[0] = sampler->BorderColor.ui[0];
5059      color.ui[1] = sampler->BorderColor.ui[0];
5060      color.ui[2] = sampler->BorderColor.ui[0];
5061      color.ui[3] = float_as_int(1.0);
5062      break;
5063   case GL_LUMINANCE_ALPHA:
5064      color.ui[0] = sampler->BorderColor.ui[0];
5065      color.ui[1] = sampler->BorderColor.ui[0];
5066      color.ui[2] = sampler->BorderColor.ui[0];
5067      color.ui[3] = sampler->BorderColor.ui[3];
5068      break;
5069   default:
5070      color.ui[0] = sampler->BorderColor.ui[0];
5071      color.ui[1] = sampler->BorderColor.ui[1];
5072      color.ui[2] = sampler->BorderColor.ui[2];
5073      color.ui[3] = sampler->BorderColor.ui[3];
5074      break;
5075   }
5076
5077   /* In some cases we use an RGBA surface format for GL RGB textures,
5078    * where we've initialized the A channel to 1.0.  We also have to set
5079    * the border color alpha to 1.0 in that case.
5080    */
5081   if (base_format == GL_RGB)
5082      color.ui[3] = float_as_int(1.0);
5083
5084   int alignment = 32;
5085   if (GEN_GEN >= 8) {
5086      alignment = 64;
5087   } else if (GEN_IS_HASWELL && (is_integer_format || is_stencil_sampling)) {
5088      alignment = 512;
5089   }
5090
5091   uint32_t *sdc = brw_state_batch(
5092      brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t),
5093      alignment, sdc_offset);
5094
5095   struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 };
5096
5097#define ASSIGN(dst, src) \
5098   do {                  \
5099      dst = src;         \
5100   } while (0)
5101
5102#define ASSIGNu16(dst, src) \
5103   do {                     \
5104      dst = (uint16_t)src;  \
5105   } while (0)
5106
5107#define ASSIGNu8(dst, src) \
5108   do {                    \
5109      dst = (uint8_t)src;  \
5110   } while (0)
5111
5112#define BORDER_COLOR_ATTR(macro, _color_type, src)              \
5113   macro(state.BorderColor ## _color_type ## Red, src[0]);   \
5114   macro(state.BorderColor ## _color_type ## Green, src[1]);   \
5115   macro(state.BorderColor ## _color_type ## Blue, src[2]);   \
5116   macro(state.BorderColor ## _color_type ## Alpha, src[3]);
5117
5118#if GEN_GEN >= 8
5119   /* On Broadwell, the border color is represented as four 32-bit floats,
5120    * integers, or unsigned values, interpreted according to the surface
5121    * format.  This matches the sampler->BorderColor union exactly; just
5122    * memcpy the values.
5123    */
5124   BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui);
5125#elif GEN_IS_HASWELL
5126   if (is_integer_format || is_stencil_sampling) {
5127      bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling;
5128      const int bits_per_channel =
5129         _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS);
5130
5131      /* From the Haswell PRM, "Command Reference: Structures", Page 36:
5132       * "If any color channel is missing from the surface format,
5133       *  corresponding border color should be programmed as zero and if
5134       *  alpha channel is missing, corresponding Alpha border color should
5135       *  be programmed as 1."
5136       */
5137      unsigned c[4] = { 0, 0, 0, 1 };
5138      for (int i = 0; i < 4; i++) {
5139         if (has_component(format, i))
5140            c[i] = color.ui[i];
5141      }
5142
5143      switch (bits_per_channel) {
5144      case 8:
5145         /* Copy RGBA in order. */
5146         BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c);
5147         break;
5148      case 10:
5149         /* R10G10B10A2_UINT is treated like a 16-bit format. */
5150      case 16:
5151         BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c);
5152         break;
5153      case 32:
5154         if (base_format == GL_RG) {
5155            /* Careful inspection of the tables reveals that for RG32 formats,
5156             * the green channel needs to go where blue normally belongs.
5157             */
5158            state.BorderColor32bitRed = c[0];
5159            state.BorderColor32bitBlue = c[1];
5160            state.BorderColor32bitAlpha = 1;
5161         } else {
5162            /* Copy RGBA in order. */
5163            BORDER_COLOR_ATTR(ASSIGN, 32bit, c);
5164         }
5165         break;
5166      default:
5167         assert(!"Invalid number of bits per channel in integer format.");
5168         break;
5169      }
5170   } else {
5171      BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5172   }
5173#elif GEN_GEN == 5 || GEN_GEN == 6
5174   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f);
5175   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f);
5176   BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f);
5177
5178#define MESA_FLOAT_TO_HALF(dst, src) \
5179   dst = _mesa_float_to_half(src);
5180
5181   BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f);
5182
5183#undef MESA_FLOAT_TO_HALF
5184
5185   state.BorderColorSnorm8Red   = state.BorderColorSnorm16Red >> 8;
5186   state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8;
5187   state.BorderColorSnorm8Blue  = state.BorderColorSnorm16Blue >> 8;
5188   state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8;
5189
5190   BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5191#elif GEN_GEN == 4
5192   BORDER_COLOR_ATTR(ASSIGN, , color.f);
5193#else
5194   BORDER_COLOR_ATTR(ASSIGN, Float, color.f);
5195#endif
5196
5197#undef ASSIGN
5198#undef BORDER_COLOR_ATTR
5199
5200   GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state);
5201}
5202
5203static uint32_t
5204translate_wrap_mode(GLenum wrap, MAYBE_UNUSED bool using_nearest)
5205{
5206   switch (wrap) {
5207   case GL_REPEAT:
5208      return TCM_WRAP;
5209   case GL_CLAMP:
5210#if GEN_GEN >= 8
5211      /* GL_CLAMP is the weird mode where coordinates are clamped to
5212       * [0.0, 1.0], so linear filtering of coordinates outside of
5213       * [0.0, 1.0] give you half edge texel value and half border
5214       * color.
5215       *
5216       * Gen8+ supports this natively.
5217       */
5218      return TCM_HALF_BORDER;
5219#else
5220      /* On Gen4-7.5, we clamp the coordinates in the fragment shader
5221       * and set clamp_border here, which gets the result desired.
5222       * We just use clamp(_to_edge) for nearest, because for nearest
5223       * clamping to 1.0 gives border color instead of the desired
5224       * edge texels.
5225       */
5226      if (using_nearest)
5227         return TCM_CLAMP;
5228      else
5229         return TCM_CLAMP_BORDER;
5230#endif
5231   case GL_CLAMP_TO_EDGE:
5232      return TCM_CLAMP;
5233   case GL_CLAMP_TO_BORDER:
5234      return TCM_CLAMP_BORDER;
5235   case GL_MIRRORED_REPEAT:
5236      return TCM_MIRROR;
5237   case GL_MIRROR_CLAMP_TO_EDGE:
5238      return TCM_MIRROR_ONCE;
5239   default:
5240      return TCM_WRAP;
5241   }
5242}
5243
5244/**
5245 * Return true if the given wrap mode requires the border color to exist.
5246 */
5247static bool
5248wrap_mode_needs_border_color(unsigned wrap_mode)
5249{
5250#if GEN_GEN >= 8
5251   return wrap_mode == TCM_CLAMP_BORDER ||
5252          wrap_mode == TCM_HALF_BORDER;
5253#else
5254   return wrap_mode == TCM_CLAMP_BORDER;
5255#endif
5256}
5257
5258/**
5259 * Sets the sampler state for a single unit based off of the sampler key
5260 * entry.
5261 */
5262static void
5263genX(update_sampler_state)(struct brw_context *brw,
5264                           GLenum target, bool tex_cube_map_seamless,
5265                           GLfloat tex_unit_lod_bias,
5266                           mesa_format format, GLenum base_format,
5267                           const struct gl_texture_object *texObj,
5268                           const struct gl_sampler_object *sampler,
5269                           uint32_t *sampler_state)
5270{
5271   struct GENX(SAMPLER_STATE) samp_st = { 0 };
5272
5273   /* Select min and mip filters. */
5274   switch (sampler->MinFilter) {
5275   case GL_NEAREST:
5276      samp_st.MinModeFilter = MAPFILTER_NEAREST;
5277      samp_st.MipModeFilter = MIPFILTER_NONE;
5278      break;
5279   case GL_LINEAR:
5280      samp_st.MinModeFilter = MAPFILTER_LINEAR;
5281      samp_st.MipModeFilter = MIPFILTER_NONE;
5282      break;
5283   case GL_NEAREST_MIPMAP_NEAREST:
5284      samp_st.MinModeFilter = MAPFILTER_NEAREST;
5285      samp_st.MipModeFilter = MIPFILTER_NEAREST;
5286      break;
5287   case GL_LINEAR_MIPMAP_NEAREST:
5288      samp_st.MinModeFilter = MAPFILTER_LINEAR;
5289      samp_st.MipModeFilter = MIPFILTER_NEAREST;
5290      break;
5291   case GL_NEAREST_MIPMAP_LINEAR:
5292      samp_st.MinModeFilter = MAPFILTER_NEAREST;
5293      samp_st.MipModeFilter = MIPFILTER_LINEAR;
5294      break;
5295   case GL_LINEAR_MIPMAP_LINEAR:
5296      samp_st.MinModeFilter = MAPFILTER_LINEAR;
5297      samp_st.MipModeFilter = MIPFILTER_LINEAR;
5298      break;
5299   default:
5300      unreachable("not reached");
5301   }
5302
5303   /* Select mag filter. */
5304   samp_st.MagModeFilter = sampler->MagFilter == GL_LINEAR ?
5305      MAPFILTER_LINEAR : MAPFILTER_NEAREST;
5306
5307   /* Enable anisotropic filtering if desired. */
5308   samp_st.MaximumAnisotropy = RATIO21;
5309
5310   if (sampler->MaxAnisotropy > 1.0f) {
5311      if (samp_st.MinModeFilter == MAPFILTER_LINEAR)
5312         samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC;
5313      if (samp_st.MagModeFilter == MAPFILTER_LINEAR)
5314         samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC;
5315
5316      if (sampler->MaxAnisotropy > 2.0f) {
5317         samp_st.MaximumAnisotropy =
5318            MIN2((sampler->MaxAnisotropy - 2) / 2, RATIO161);
5319      }
5320   }
5321
5322   /* Set address rounding bits if not using nearest filtering. */
5323   if (samp_st.MinModeFilter != MAPFILTER_NEAREST) {
5324      samp_st.UAddressMinFilterRoundingEnable = true;
5325      samp_st.VAddressMinFilterRoundingEnable = true;
5326      samp_st.RAddressMinFilterRoundingEnable = true;
5327   }
5328
5329   if (samp_st.MagModeFilter != MAPFILTER_NEAREST) {
5330      samp_st.UAddressMagFilterRoundingEnable = true;
5331      samp_st.VAddressMagFilterRoundingEnable = true;
5332      samp_st.RAddressMagFilterRoundingEnable = true;
5333   }
5334
5335   bool either_nearest =
5336      sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST;
5337   unsigned wrap_s = translate_wrap_mode(sampler->WrapS, either_nearest);
5338   unsigned wrap_t = translate_wrap_mode(sampler->WrapT, either_nearest);
5339   unsigned wrap_r = translate_wrap_mode(sampler->WrapR, either_nearest);
5340
5341   if (target == GL_TEXTURE_CUBE_MAP ||
5342       target == GL_TEXTURE_CUBE_MAP_ARRAY) {
5343      /* Cube maps must use the same wrap mode for all three coordinate
5344       * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
5345       *
5346       * Ivybridge and Baytrail seem to have problems with CUBE mode and
5347       * integer formats.  Fall back to CLAMP for now.
5348       */
5349      if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
5350          !(GEN_GEN == 7 && !GEN_IS_HASWELL && texObj->_IsIntegerFormat)) {
5351         wrap_s = TCM_CUBE;
5352         wrap_t = TCM_CUBE;
5353         wrap_r = TCM_CUBE;
5354      } else {
5355         wrap_s = TCM_CLAMP;
5356         wrap_t = TCM_CLAMP;
5357         wrap_r = TCM_CLAMP;
5358      }
5359   } else if (target == GL_TEXTURE_1D) {
5360      /* There's a bug in 1D texture sampling - it actually pays
5361       * attention to the wrap_t value, though it should not.
5362       * Override the wrap_t value here to GL_REPEAT to keep
5363       * any nonexistent border pixels from floating in.
5364       */
5365      wrap_t = TCM_WRAP;
5366   }
5367
5368   samp_st.TCXAddressControlMode = wrap_s;
5369   samp_st.TCYAddressControlMode = wrap_t;
5370   samp_st.TCZAddressControlMode = wrap_r;
5371
5372   samp_st.ShadowFunction =
5373      sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ?
5374      intel_translate_shadow_compare_func(sampler->CompareFunc) : 0;
5375
5376#if GEN_GEN >= 7
5377   /* Set shadow function. */
5378   samp_st.AnisotropicAlgorithm =
5379      samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ?
5380      EWAApproximation : LEGACY;
5381#endif
5382
5383#if GEN_GEN >= 6
5384   samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE;
5385#endif
5386
5387   const float hw_max_lod = GEN_GEN >= 7 ? 14 : 13;
5388   samp_st.MinLOD = CLAMP(sampler->MinLod, 0, hw_max_lod);
5389   samp_st.MaxLOD = CLAMP(sampler->MaxLod, 0, hw_max_lod);
5390   samp_st.TextureLODBias =
5391      CLAMP(tex_unit_lod_bias + sampler->LodBias, -16, 15);
5392
5393#if GEN_GEN == 6
5394   samp_st.BaseMipLevel =
5395      CLAMP(texObj->MinLevel + texObj->BaseLevel, 0, hw_max_lod);
5396   samp_st.MinandMagStateNotEqual =
5397      samp_st.MinModeFilter != samp_st.MagModeFilter;
5398#endif
5399
5400   /* Upload the border color if necessary.  If not, just point it at
5401    * offset 0 (the start of the batch) - the color should be ignored,
5402    * but that address won't fault in case something reads it anyway.
5403    */
5404   uint32_t border_color_offset = 0;
5405   if (wrap_mode_needs_border_color(wrap_s) ||
5406       wrap_mode_needs_border_color(wrap_t) ||
5407       wrap_mode_needs_border_color(wrap_r)) {
5408      genX(upload_default_color)(brw, sampler, format, base_format,
5409                                 texObj->_IsIntegerFormat,
5410                                 texObj->StencilSampling,
5411                                 &border_color_offset);
5412   }
5413#if GEN_GEN < 6
5414      samp_st.BorderColorPointer =
5415         ro_bo(brw->batch.state.bo, border_color_offset);
5416#else
5417      samp_st.BorderColorPointer = border_color_offset;
5418#endif
5419
5420#if GEN_GEN >= 8
5421   samp_st.LODPreClampMode = CLAMP_MODE_OGL;
5422#else
5423   samp_st.LODPreClampEnable = true;
5424#endif
5425
5426   GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st);
5427}
5428
5429static void
5430update_sampler_state(struct brw_context *brw,
5431                     int unit,
5432                     uint32_t *sampler_state)
5433{
5434   struct gl_context *ctx = &brw->ctx;
5435   const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
5436   const struct gl_texture_object *texObj = texUnit->_Current;
5437   const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit);
5438
5439   /* These don't use samplers at all. */
5440   if (texObj->Target == GL_TEXTURE_BUFFER)
5441      return;
5442
5443   struct gl_texture_image *firstImage = texObj->Image[0][texObj->BaseLevel];
5444   genX(update_sampler_state)(brw, texObj->Target,
5445                              ctx->Texture.CubeMapSeamless,
5446                              texUnit->LodBias,
5447                              firstImage->TexFormat, firstImage->_BaseFormat,
5448                              texObj, sampler,
5449                              sampler_state);
5450}
5451
/* Upload the SAMPLER_STATE table for one shader stage and make the
 * hardware point at it.  One 4-DWord entry is written per used sampler;
 * entries for unused sampler slots are skipped.
 */
static void
genX(upload_sampler_state_table)(struct brw_context *brw,
                                 struct gl_program *prog,
                                 struct brw_stage_state *stage_state)
{
   struct gl_context *ctx = &brw->ctx;
   uint32_t sampler_count = stage_state->sampler_count;

   GLbitfield SamplersUsed = prog->SamplersUsed;

   if (sampler_count == 0)
      return;

   /* SAMPLER_STATE is 4 DWords on all platforms. */
   const int dwords = GENX(SAMPLER_STATE_length);
   const int size_in_bytes = dwords * sizeof(uint32_t);

   uint32_t *sampler_state = brw_state_batch(brw,
                                             sampler_count * size_in_bytes,
                                             32, &stage_state->sampler_offset);
   /* NOTE(review): unused slots are left unwritten (the zeroing memset
    * below is commented out) - presumably the hardware never dereferences
    * entries for samplers the program does not use; confirm before
    * relying on their contents.
    */
   /* memset(sampler_state, 0, sampler_count * size_in_bytes); */

   for (unsigned s = 0; s < sampler_count; s++) {
      if (SamplersUsed & (1 << s)) {
         const unsigned unit = prog->SamplerUnits[s];
         if (ctx->Texture.Unit[unit]._Current) {
            update_sampler_state(brw, unit, sampler_state);
         }
      }

      /* Advance to the next table entry regardless of whether this slot
       * was written, so indices stay aligned with sampler numbers.
       */
      sampler_state += dwords;
   }

   if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) {
      /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */
      genX(emit_sampler_state_pointers_xs)(brw, stage_state);
   } else {
      /* Flag that the sampler state table pointer has changed; later atoms
       * will handle it.
       */
      brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE;
   }
}
5495
5496static void
5497genX(upload_fs_samplers)(struct brw_context *brw)
5498{
5499   /* BRW_NEW_FRAGMENT_PROGRAM */
5500   struct gl_program *fs = brw->programs[MESA_SHADER_FRAGMENT];
5501   genX(upload_sampler_state_table)(brw, fs, &brw->wm.base);
5502}
5503
/* Re-upload FS samplers on texture changes, new batches, BLORP, or a new
 * fragment program.
 */
static const struct brw_tracked_state genX(fs_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_FRAGMENT_PROGRAM,
   },
   .emit = genX(upload_fs_samplers),
};
5513
5514static void
5515genX(upload_vs_samplers)(struct brw_context *brw)
5516{
5517   /* BRW_NEW_VERTEX_PROGRAM */
5518   struct gl_program *vs = brw->programs[MESA_SHADER_VERTEX];
5519   genX(upload_sampler_state_table)(brw, vs, &brw->vs.base);
5520}
5521
/* Re-upload VS samplers on texture changes, new batches, BLORP, or a new
 * vertex program.
 */
static const struct brw_tracked_state genX(vs_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_VERTEX_PROGRAM,
   },
   .emit = genX(upload_vs_samplers),
};
5531
5532#if GEN_GEN >= 6
5533static void
5534genX(upload_gs_samplers)(struct brw_context *brw)
5535{
5536   /* BRW_NEW_GEOMETRY_PROGRAM */
5537   struct gl_program *gs = brw->programs[MESA_SHADER_GEOMETRY];
5538   if (!gs)
5539      return;
5540
5541   genX(upload_sampler_state_table)(brw, gs, &brw->gs.base);
5542}
5543
5544
/* Re-upload GS samplers on texture changes, new batches, BLORP, or a new
 * geometry program.
 */
static const struct brw_tracked_state genX(gs_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_GEOMETRY_PROGRAM,
   },
   .emit = genX(upload_gs_samplers),
};
5554#endif
5555
5556#if GEN_GEN >= 7
5557static void
5558genX(upload_tcs_samplers)(struct brw_context *brw)
5559{
5560   /* BRW_NEW_TESS_PROGRAMS */
5561   struct gl_program *tcs = brw->programs[MESA_SHADER_TESS_CTRL];
5562   if (!tcs)
5563      return;
5564
5565   genX(upload_sampler_state_table)(brw, tcs, &brw->tcs.base);
5566}
5567
/* Re-upload TCS samplers on texture changes, new batches, BLORP, or new
 * tessellation programs.
 */
static const struct brw_tracked_state genX(tcs_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_TESS_PROGRAMS,
   },
   .emit = genX(upload_tcs_samplers),
};
5577#endif
5578
5579#if GEN_GEN >= 7
5580static void
5581genX(upload_tes_samplers)(struct brw_context *brw)
5582{
5583   /* BRW_NEW_TESS_PROGRAMS */
5584   struct gl_program *tes = brw->programs[MESA_SHADER_TESS_EVAL];
5585   if (!tes)
5586      return;
5587
5588   genX(upload_sampler_state_table)(brw, tes, &brw->tes.base);
5589}
5590
/* Re-upload TES samplers on texture changes, new batches, BLORP, or new
 * tessellation programs.
 */
static const struct brw_tracked_state genX(tes_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_TESS_PROGRAMS,
   },
   .emit = genX(upload_tes_samplers),
};
5600#endif
5601
5602#if GEN_GEN >= 7
5603static void
5604genX(upload_cs_samplers)(struct brw_context *brw)
5605{
5606   /* BRW_NEW_COMPUTE_PROGRAM */
5607   struct gl_program *cs = brw->programs[MESA_SHADER_COMPUTE];
5608   if (!cs)
5609      return;
5610
5611   genX(upload_sampler_state_table)(brw, cs, &brw->cs.base);
5612}
5613
/* Re-upload CS samplers on texture changes, new batches, BLORP, or a new
 * compute program.  Non-static: referenced by the compute dispatch path.
 */
const struct brw_tracked_state genX(cs_samplers) = {
   .dirty = {
      .mesa = _NEW_TEXTURE,
      .brw = BRW_NEW_BATCH |
             BRW_NEW_BLORP |
             BRW_NEW_COMPUTE_PROGRAM,
   },
   .emit = genX(upload_cs_samplers),
};
5623#endif
5624
5625/* ---------------------------------------------------------------------- */
5626
5627#if GEN_GEN <= 5
5628
/* Emit 3DSTATE_CONSTANT_COLOR carrying the current (unclamped) GL blend
 * constant color.  Gen5 and earlier only.
 */
static void genX(upload_blend_constant_color)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;

   brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) {
      blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0];
      blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1];
      blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2];
      blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3];
   }
}
5640
/* Atom that re-emits the blend constant color when GL color state changes,
 * on a new context, or after BLORP.
 */
static const struct brw_tracked_state genX(blend_constant_color) = {
   .dirty = {
      .mesa = _NEW_COLOR,
      .brw = BRW_NEW_CONTEXT |
             BRW_NEW_BLORP,
   },
   .emit = genX(upload_blend_constant_color)
};
5649#endif
5650
5651/* ---------------------------------------------------------------------- */
5652
5653void
5654genX(init_atoms)(struct brw_context *brw)
5655{
5656#if GEN_GEN < 6
5657   static const struct brw_tracked_state *render_atoms[] =
5658   {
5659      &genX(vf_statistics),
5660
5661      /* Once all the programs are done, we know how large urb entry
5662       * sizes need to be and can decide if we need to change the urb
5663       * layout.
5664       */
5665      &brw_curbe_offsets,
5666      &brw_recalculate_urb_fence,
5667
5668      &genX(cc_vp),
5669      &genX(color_calc_state),
5670
5671      /* Surface state setup.  Must come before the VS/WM unit.  The binding
5672       * table upload must be last.
5673       */
5674      &brw_vs_pull_constants,
5675      &brw_wm_pull_constants,
5676      &brw_renderbuffer_surfaces,
5677      &brw_renderbuffer_read_surfaces,
5678      &brw_texture_surfaces,
5679      &brw_vs_binding_table,
5680      &brw_wm_binding_table,
5681
5682      &genX(fs_samplers),
5683      &genX(vs_samplers),
5684
5685      /* These set up state for brw_psp_urb_cbs */
5686      &genX(wm_state),
5687      &genX(sf_clip_viewport),
5688      &genX(sf_state),
5689      &genX(vs_state), /* always required, enabled or not */
5690      &genX(clip_state),
5691      &genX(gs_state),
5692
5693      /* Command packets:
5694       */
5695      &brw_binding_table_pointers,
5696      &genX(blend_constant_color),
5697
5698      &brw_depthbuffer,
5699
5700      &genX(polygon_stipple),
5701      &genX(polygon_stipple_offset),
5702
5703      &genX(line_stipple),
5704
5705      &brw_psp_urb_cbs,
5706
5707      &genX(drawing_rect),
5708      &brw_indices, /* must come before brw_vertices */
5709      &genX(index_buffer),
5710      &genX(vertices),
5711
5712      &brw_constant_buffer
5713   };
5714#elif GEN_GEN == 6
5715   static const struct brw_tracked_state *render_atoms[] =
5716   {
5717      &genX(vf_statistics),
5718
5719      &genX(sf_clip_viewport),
5720
5721      /* Command packets: */
5722
5723      &genX(cc_vp),
5724
5725      &gen6_urb,
5726      &genX(blend_state),		/* must do before cc unit */
5727      &genX(color_calc_state),	/* must do before cc unit */
5728      &genX(depth_stencil_state),	/* must do before cc unit */
5729
5730      &genX(vs_push_constants), /* Before vs_state */
5731      &genX(gs_push_constants), /* Before gs_state */
5732      &genX(wm_push_constants), /* Before wm_state */
5733
5734      /* Surface state setup.  Must come before the VS/WM unit.  The binding
5735       * table upload must be last.
5736       */
5737      &brw_vs_pull_constants,
5738      &brw_vs_ubo_surfaces,
5739      &brw_gs_pull_constants,
5740      &brw_gs_ubo_surfaces,
5741      &brw_wm_pull_constants,
5742      &brw_wm_ubo_surfaces,
5743      &gen6_renderbuffer_surfaces,
5744      &brw_renderbuffer_read_surfaces,
5745      &brw_texture_surfaces,
5746      &gen6_sol_surface,
5747      &brw_vs_binding_table,
5748      &gen6_gs_binding_table,
5749      &brw_wm_binding_table,
5750
5751      &genX(fs_samplers),
5752      &genX(vs_samplers),
5753      &genX(gs_samplers),
5754      &gen6_sampler_state,
5755      &genX(multisample_state),
5756
5757      &genX(vs_state),
5758      &genX(gs_state),
5759      &genX(clip_state),
5760      &genX(sf_state),
5761      &genX(wm_state),
5762
5763      &genX(scissor_state),
5764
5765      &gen6_binding_table_pointers,
5766
5767      &brw_depthbuffer,
5768
5769      &genX(polygon_stipple),
5770      &genX(polygon_stipple_offset),
5771
5772      &genX(line_stipple),
5773
5774      &genX(drawing_rect),
5775
5776      &brw_indices, /* must come before brw_vertices */
5777      &genX(index_buffer),
5778      &genX(vertices),
5779   };
5780#elif GEN_GEN == 7
5781   static const struct brw_tracked_state *render_atoms[] =
5782   {
5783      &genX(vf_statistics),
5784
5785      /* Command packets: */
5786
5787      &genX(cc_vp),
5788      &genX(sf_clip_viewport),
5789
5790      &gen7_l3_state,
5791      &gen7_push_constant_space,
5792      &gen7_urb,
5793      &genX(blend_state),		/* must do before cc unit */
5794      &genX(color_calc_state),	/* must do before cc unit */
5795      &genX(depth_stencil_state),	/* must do before cc unit */
5796
5797      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
5798      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
5799      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
5800      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
5801      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */
5802
5803      &genX(vs_push_constants), /* Before vs_state */
5804      &genX(tcs_push_constants),
5805      &genX(tes_push_constants),
5806      &genX(gs_push_constants), /* Before gs_state */
5807      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */
5808
5809      /* Surface state setup.  Must come before the VS/WM unit.  The binding
5810       * table upload must be last.
5811       */
5812      &brw_vs_pull_constants,
5813      &brw_vs_ubo_surfaces,
5814      &brw_tcs_pull_constants,
5815      &brw_tcs_ubo_surfaces,
5816      &brw_tes_pull_constants,
5817      &brw_tes_ubo_surfaces,
5818      &brw_gs_pull_constants,
5819      &brw_gs_ubo_surfaces,
5820      &brw_wm_pull_constants,
5821      &brw_wm_ubo_surfaces,
5822      &gen6_renderbuffer_surfaces,
5823      &brw_renderbuffer_read_surfaces,
5824      &brw_texture_surfaces,
5825
5826      &genX(push_constant_packets),
5827
5828      &brw_vs_binding_table,
5829      &brw_tcs_binding_table,
5830      &brw_tes_binding_table,
5831      &brw_gs_binding_table,
5832      &brw_wm_binding_table,
5833
5834      &genX(fs_samplers),
5835      &genX(vs_samplers),
5836      &genX(tcs_samplers),
5837      &genX(tes_samplers),
5838      &genX(gs_samplers),
5839      &genX(multisample_state),
5840
5841      &genX(vs_state),
5842      &genX(hs_state),
5843      &genX(te_state),
5844      &genX(ds_state),
5845      &genX(gs_state),
5846      &genX(sol_state),
5847      &genX(clip_state),
5848      &genX(sbe_state),
5849      &genX(sf_state),
5850      &genX(wm_state),
5851      &genX(ps_state),
5852
5853      &genX(scissor_state),
5854
5855      &brw_depthbuffer,
5856
5857      &genX(polygon_stipple),
5858      &genX(polygon_stipple_offset),
5859
5860      &genX(line_stipple),
5861
5862      &genX(drawing_rect),
5863
5864      &brw_indices, /* must come before brw_vertices */
5865      &genX(index_buffer),
5866      &genX(vertices),
5867
5868#if GEN_IS_HASWELL
5869      &genX(cut_index),
5870#endif
5871   };
5872#elif GEN_GEN >= 8
   /* Render-pipeline state atoms for GEN8+.  This list is copied into
    * brw->render_atoms and the existing ordering comments (e.g. "Before
    * vs_state", "binding table upload must be last") indicate the array
    * order is significant, so do not reorder entries casually.
    */
   static const struct brw_tracked_state *render_atoms[] =
   {
      &genX(vf_statistics),

      /* Viewport state: clip/color-calc viewport and SF/CLIP viewport. */
      &genX(cc_vp),
      &genX(sf_clip_viewport),

      /* L3 configuration, push-constant space partitioning and URB setup
       * come before any state that consumes URB/constant allocations.
       */
      &gen7_l3_state,
      &gen7_push_constant_space,
      &gen7_urb,
      &genX(blend_state),
      &genX(color_calc_state),

      &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */
      &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */
      &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */
      &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */
      &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */

      &genX(vs_push_constants), /* Before vs_state */
      &genX(tcs_push_constants),
      &genX(tes_push_constants),
      &genX(gs_push_constants), /* Before gs_state */
      &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */

      /* Surface state setup.  Must come before the VS/WM unit.  The binding
       * table upload must be last.
       */
      &brw_vs_pull_constants,
      &brw_vs_ubo_surfaces,
      &brw_tcs_pull_constants,
      &brw_tcs_ubo_surfaces,
      &brw_tes_pull_constants,
      &brw_tes_ubo_surfaces,
      &brw_gs_pull_constants,
      &brw_gs_ubo_surfaces,
      &brw_wm_pull_constants,
      &brw_wm_ubo_surfaces,
      &gen6_renderbuffer_surfaces,
      &brw_renderbuffer_read_surfaces,
      &brw_texture_surfaces,

      /* After all push constants are uploaded above. */
      &genX(push_constant_packets),

      /* Binding tables: after every surface-state atom above, per the
       * "binding table upload must be last" requirement.
       */
      &brw_vs_binding_table,
      &brw_tcs_binding_table,
      &brw_tes_binding_table,
      &brw_gs_binding_table,
      &brw_wm_binding_table,

      &genX(fs_samplers),
      &genX(vs_samplers),
      &genX(tcs_samplers),
      &genX(tes_samplers),
      &genX(gs_samplers),
      &genX(multisample_state),

      /* Fixed-function unit state, after the constants/surfaces/samplers
       * they reference have been uploaded.
       */
      &genX(vs_state),
      &genX(hs_state),
      &genX(te_state),
      &genX(ds_state),
      &genX(gs_state),
      &genX(sol_state),
      &genX(clip_state),
      &genX(raster_state),
      &genX(sbe_state),
      &genX(sf_state),
      &genX(ps_blend),
      &genX(ps_extra),
      &genX(ps_state),
      &genX(depth_stencil_state),
      &genX(wm_state),

      &genX(scissor_state),

      &brw_depthbuffer,

      &genX(polygon_stipple),
      &genX(polygon_stipple_offset),

      &genX(line_stipple),

      &genX(drawing_rect),

      &genX(vf_topology),

      &brw_indices,
      &genX(index_buffer),
      &genX(vertices),

      &genX(cut_index),
      /* NOTE(review): presumably the Broadwell PMA (pixel-mask-aggregation)
       * stall workaround -- confirm against the atom's definition.
       */
      &gen8_pma_fix,
   };
5966#endif
5967
5968   STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms));
5969   brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE,
5970                           render_atoms, ARRAY_SIZE(render_atoms));
5971
5972#if GEN_GEN >= 7
   /* Compute (GPGPU) pipeline atoms for GEN7+.  Copied into
    * brw->compute_atoms below; resource setup (images, constants, UBOs,
    * textures, samplers) precedes genX(cs_state), mirroring the ordering
    * rules documented in the render-atom lists.
    */
   static const struct brw_tracked_state *compute_atoms[] =
   {
      &gen7_l3_state,
      &brw_cs_image_surfaces,      /* before cs push/pull constants, like the
                                    * *_image_surfaces render atoms */
      &genX(cs_push_constants),
      &genX(cs_pull_constants),
      &brw_cs_ubo_surfaces,
      &brw_cs_texture_surfaces,
      &brw_cs_work_groups_surface,
      &genX(cs_samplers),
      &genX(cs_state),
   };
5985
5986   STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms));
5987   brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE,
5988                           compute_atoms, ARRAY_SIZE(compute_atoms));
5989
5990   brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count);
5991   brw->vtbl.emit_compute_walker = genX(emit_gpgpu_walker);
5992#endif
5993}
5994