1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24/** @file brw_fs_visitor.cpp
25 *
26 * This file supports generating the FS LIR from the GLSL IR.  The LIR
27 * makes it easier to do backend-specific optimizations than doing so
28 * in the GLSL IR or in the native code.
29 */
30#include "brw_fs.h"
31#include "compiler/glsl_types.h"
32
33using namespace brw;
34
35/* Sample from the MCS surface attached to this multisample texture. */
36fs_reg
37fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
38                           const fs_reg &texture,
39                           const fs_reg &texture_handle)
40{
41   const fs_reg dest = vgrf(glsl_type::uvec4_type);
42
43   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
44   srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
45   srcs[TEX_LOGICAL_SRC_SURFACE] = texture;
46   srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
47   srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle;
48   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components);
49   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
50
51   fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
52                            ARRAY_SIZE(srcs));
53
54   /* We only care about one or two regs of response, but the sampler always
55    * writes 4/8.
56    */
57   inst->size_written = 4 * dest.component_size(inst->exec_size);
58
59   return dest;
60}
61
62/**
63 * Apply workarounds for Gen6 gather with UINT/SINT
64 */
65void
66fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
67{
68   if (!wa)
69      return;
70
71   int width = (wa & WA_8BIT) ? 8 : 16;
72
73   for (int i = 0; i < 4; i++) {
74      fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
75      /* Convert from UNORM to UINT */
76      bld.MUL(dst_f, dst_f, brw_imm_f((1 << width) - 1));
77      bld.MOV(dst, dst_f);
78
79      if (wa & WA_SIGN) {
80         /* Reinterpret the UINT value as a signed INT value by
81          * shifting the sign bit into place, then shifting back
82          * preserving sign.
83          */
84         bld.SHL(dst, dst, brw_imm_d(32 - width));
85         bld.ASR(dst, dst, brw_imm_d(32 - width));
86      }
87
88      dst = offset(dst, bld, 1);
89   }
90}
91
92/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
93void
94fs_visitor::emit_dummy_fs()
95{
96   int reg_width = dispatch_width / 8;
97
98   /* Everyone's favorite color. */
99   const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
100   for (int i = 0; i < 4; i++) {
101      bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F),
102              brw_imm_f(color[i]));
103   }
104
105   fs_inst *write;
106   write = bld.emit(FS_OPCODE_FB_WRITE);
107   write->eot = true;
108   write->last_rt = true;
109   if (devinfo->gen >= 6) {
110      write->base_mrf = 2;
111      write->mlen = 4 * reg_width;
112   } else {
113      write->header_size = 2;
114      write->base_mrf = 0;
115      write->mlen = 2 + 4 * reg_width;
116   }
117
118   /* Tell the SF we don't have any inputs.  Gen4-5 require at least one
119    * varying to avoid GPU hangs, so set that.
120    */
121   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
122   wm_prog_data->num_varying_inputs = devinfo->gen < 6 ? 1 : 0;
123   memset(wm_prog_data->urb_setup, -1,
124          sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
125
126   /* We don't have any uniforms. */
127   stage_prog_data->nr_params = 0;
128   stage_prog_data->nr_pull_params = 0;
129   stage_prog_data->curb_read_length = 0;
130   stage_prog_data->dispatch_grf_start_reg = 2;
131   wm_prog_data->dispatch_grf_start_reg_16 = 2;
132   wm_prog_data->dispatch_grf_start_reg_32 = 2;
133   grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
134
135   calculate_cfg();
136}
137
138/* The register location here is relative to the start of the URB
139 * data.  It will get adjusted to be a real location before
140 * generate_code() time.
141 */
142fs_reg
143fs_visitor::interp_reg(int location, int channel)
144{
145   assert(stage == MESA_SHADER_FRAGMENT);
146   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
147   int regnr = prog_data->urb_setup[location] * 4 + channel;
148   assert(prog_data->urb_setup[location] != -1);
149
150   return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F);
151}
152
153/** Emits the interpolation for the varying inputs. */
154void
155fs_visitor::emit_interpolation_setup_gen4()
156{
157   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
158
159   fs_builder abld = bld.annotate("compute pixel centers");
160   this->pixel_x = vgrf(glsl_type::uint_type);
161   this->pixel_y = vgrf(glsl_type::uint_type);
162   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
163   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
164   abld.ADD(this->pixel_x,
165            fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
166            fs_reg(brw_imm_v(0x10101010)));
167   abld.ADD(this->pixel_y,
168            fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
169            fs_reg(brw_imm_v(0x11001100)));
170
171   abld = bld.annotate("compute pixel deltas from v0");
172
173   this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] =
174      vgrf(glsl_type::vec2_type);
175   const fs_reg &delta_xy = this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];
176   const fs_reg xstart(negate(brw_vec1_grf(1, 0)));
177   const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
178
179   if (devinfo->has_pln && dispatch_width == 16) {
180      for (unsigned i = 0; i < 2; i++) {
181         abld.half(i).ADD(half(offset(delta_xy, abld, i), 0),
182                          half(this->pixel_x, i), xstart);
183         abld.half(i).ADD(half(offset(delta_xy, abld, i), 1),
184                          half(this->pixel_y, i), ystart);
185      }
186   } else {
187      abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
188      abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
189   }
190
191   abld = bld.annotate("compute pos.w and 1/pos.w");
192   /* Compute wpos.w.  It's always in our setup, since it's needed to
193    * interpolate the other attributes.
194    */
195   this->wpos_w = vgrf(glsl_type::float_type);
196   abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
197             component(interp_reg(VARYING_SLOT_POS, 3), 0));
198   /* Compute the pixel 1/W value from wpos.w. */
199   this->pixel_w = vgrf(glsl_type::float_type);
200   abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
201}
202
/**
 * Emits the interpolation setup for the gen6 path.
 *
 * Fills in the following visitor state:
 *  - pixel_x / pixel_y: float pixel coordinates, derived from UW data read
 *    out of g(1 + i) for each group of up to 16 channels.
 *  - pixel_w: fetched from the payload register payload.source_w_reg.
 *  - wpos_w: the reciprocal (RCP) of pixel_w.
 *  - delta_xy[]: one two-component float payload fetch per barycentric mode.
 *
 * When the device sets needs_unlit_centroid_workaround and a centroid
 * barycentric mode is in use, the centroid entries of delta_xy[] are patched
 * with predicated MOVs from delta_xy[mode - 1].
 */
void
fs_visitor::emit_interpolation_setup_gen6()
{
   fs_builder abld = bld.annotate("compute pixel centers");

   this->pixel_x = vgrf(glsl_type::float_type);
   this->pixel_y = vgrf(glsl_type::float_type);

   /* One iteration per group of up to 16 channels; group i reads g(1 + i). */
   for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
      const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
      struct brw_reg gi_uw = retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UW);

      if (devinfo->gen >= 8 || dispatch_width == 8) {
         /* The "Register Region Restrictions" page says for BDW (and newer,
          * presumably):
          *
          *     "When destination spans two registers, the source may be one or
          *      two registers. The destination elements must be evenly split
          *      between the two registers."
          *
          * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16
          * to compute our pixel centers.
          */
         /* dbld is twice as wide as hbld and runs with exec_all, so a single
          * ADD produces the combined X/Y integer values that the
          * FS_OPCODE_PIXEL_X / FS_OPCODE_PIXEL_Y instructions below consume.
          */
         const fs_builder dbld =
            abld.exec_all().group(hbld.dispatch_width() * 2, 0);
         fs_reg int_pixel_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW);

         dbld.ADD(int_pixel_xy,
                  fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)),
                  fs_reg(brw_imm_v(0x11001010)));

         hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy);
         hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy);
      } else {
         /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
          *
          *     "When destination spans two registers, the source MUST span
          *      two registers."
          *
          * Since the GRF source of the ADD will only read a single register,
          * we must do two separate ADDs in SIMD16.
          */
         const fs_reg int_pixel_x = hbld.vgrf(BRW_REGISTER_TYPE_UW);
         const fs_reg int_pixel_y = hbld.vgrf(BRW_REGISTER_TYPE_UW);

         /* Same regions and vector immediates as the gen4 setup: X from
          * word 4 and Y from word 5 of the group's payload register.
          */
         hbld.ADD(int_pixel_x,
                  fs_reg(stride(suboffset(gi_uw, 4), 2, 4, 0)),
                  fs_reg(brw_imm_v(0x10101010)));
         hbld.ADD(int_pixel_y,
                  fs_reg(stride(suboffset(gi_uw, 5), 2, 4, 0)),
                  fs_reg(brw_imm_v(0x11001100)));

         /* As of gen6, we can no longer mix float and int sources.  We have
          * to turn the integer pixel centers into floats for their actual
          * use.
          */
         hbld.MOV(offset(pixel_x, hbld, i), int_pixel_x);
         hbld.MOV(offset(pixel_y, hbld, i), int_pixel_y);
      }
   }

   /* pixel_w comes straight from the payload; wpos_w is its reciprocal. */
   abld = bld.annotate("compute pos.w");
   this->pixel_w = fetch_payload_reg(abld, payload.source_w_reg);
   this->wpos_w = vgrf(glsl_type::float_type);
   abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);

   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);

   /* Fetch the two-component barycentric payload for every mode. */
   for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
      this->delta_xy[i] = fetch_payload_reg(
         bld, payload.barycentric_coord_reg[i], BRW_REGISTER_TYPE_F, 2);
   }

   /* Subset of the centroid modes that this program actually uses. */
   uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
      (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID |
       1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);

   if (devinfo->needs_unlit_centroid_workaround && centroid_modes) {
      /* Get the pixel/sample mask into f0 so that we know which
       * pixels are lit.  Then, for each channel that is unlit,
       * replace the centroid data with non-centroid data.
       */
      /* One single-channel exec_all MOV per group of 16 channels:
       * brw_flag_reg(0, i) is loaded from brw_vec1_grf(1 + i, 7), both
       * retyped to UW.
       */
      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
         bld.exec_all().group(1, 0)
            .MOV(retype(brw_flag_reg(0, i), BRW_REGISTER_TYPE_UW),
                 retype(brw_vec1_grf(1 + i, 7), BRW_REGISTER_TYPE_UW));
      }

      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (!(centroid_modes & (1 << i)))
            continue;

         /* NOTE(review): this presumably relies on each centroid mode
          * directly following its matching pixel mode in the barycentric
          * mode enum, so that i - 1 is the non-centroid counterpart.
          * Confirm against the enum definition.
          */
         const fs_reg &pixel_delta_xy = delta_xy[i - 1];

         /* For every group of 8 channels (q) and each of the two
          * components (c), idx picks the 8-wide slice of the barycentric
          * data to overwrite.  The MOV runs under the inverted predicate.
          * NOTE(review): the idx formula presumably mirrors how the payload
          * lays out the components per dispatch width. Confirm.
          */
         for (unsigned q = 0; q < dispatch_width / 8; q++) {
            for (unsigned c = 0; c < 2; c++) {
               const unsigned idx = c + (q & 2) + (q & 1) * dispatch_width / 8;
               set_predicate_inv(
                  BRW_PREDICATE_NORMAL, true,
                  bld.half(q).MOV(horiz_offset(delta_xy[i], idx * 8),
                                  horiz_offset(pixel_delta_xy, idx * 8)));
            }
         }
      }
   }
}
310
311static enum brw_conditional_mod
312cond_for_alpha_func(GLenum func)
313{
314   switch(func) {
315      case GL_GREATER:
316         return BRW_CONDITIONAL_G;
317      case GL_GEQUAL:
318         return BRW_CONDITIONAL_GE;
319      case GL_LESS:
320         return BRW_CONDITIONAL_L;
321      case GL_LEQUAL:
322         return BRW_CONDITIONAL_LE;
323      case GL_EQUAL:
324         return BRW_CONDITIONAL_EQ;
325      case GL_NOTEQUAL:
326         return BRW_CONDITIONAL_NEQ;
327      default:
328         unreachable("Not reached");
329   }
330}
331
332/**
333 * Alpha test support for when we compile it into the shader instead
334 * of using the normal fixed-function alpha test.
335 */
336void
337fs_visitor::emit_alpha_test()
338{
339   assert(stage == MESA_SHADER_FRAGMENT);
340   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
341   const fs_builder abld = bld.annotate("Alpha test");
342
343   fs_inst *cmp;
344   if (key->alpha_test_func == GL_ALWAYS)
345      return;
346
347   if (key->alpha_test_func == GL_NEVER) {
348      /* f0.1 = 0 */
349      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
350                                      BRW_REGISTER_TYPE_UW));
351      cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg,
352                     BRW_CONDITIONAL_NEQ);
353   } else {
354      /* RT0 alpha */
355      fs_reg color = offset(outputs[0], bld, 3);
356
357      /* f0.1 &= func(color, ref) */
358      cmp = abld.CMP(bld.null_reg_f(), color, brw_imm_f(key->alpha_test_ref),
359                     cond_for_alpha_func(key->alpha_test_func));
360   }
361   cmp->predicate = BRW_PREDICATE_NORMAL;
362   cmp->flag_subreg = 1;
363}
364
365fs_inst *
366fs_visitor::emit_single_fb_write(const fs_builder &bld,
367                                 fs_reg color0, fs_reg color1,
368                                 fs_reg src0_alpha, unsigned components)
369{
370   assert(stage == MESA_SHADER_FRAGMENT);
371   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
372
373   /* Hand over gl_FragDepth or the payload depth. */
374   const fs_reg dst_depth = fetch_payload_reg(bld, payload.dest_depth_reg);
375   fs_reg src_depth, src_stencil;
376
377   if (source_depth_to_render_target) {
378      if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
379         src_depth = frag_depth;
380      else
381         src_depth = fetch_payload_reg(bld, payload.source_depth_reg);
382   }
383
384   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
385      src_stencil = frag_stencil;
386
387   const fs_reg sources[] = {
388      color0, color1, src0_alpha, src_depth, dst_depth, src_stencil,
389      (prog_data->uses_omask ? sample_mask : fs_reg()),
390      brw_imm_ud(components)
391   };
392   assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
393   fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(),
394                             sources, ARRAY_SIZE(sources));
395
396   if (prog_data->uses_kill) {
397      write->predicate = BRW_PREDICATE_NORMAL;
398      write->flag_subreg = 1;
399   }
400
401   return write;
402}
403
404void
405fs_visitor::emit_alpha_to_coverage_workaround(const fs_reg &src0_alpha)
406{
407   /* We need to compute alpha to coverage dithering manually in shader
408    * and replace sample mask store with the bitwise-AND of sample mask and
409    * alpha to coverage dithering.
410    *
411    * The following formula is used to compute final sample mask:
412    *  m = int(16.0 * clamp(src0_alpha, 0.0, 1.0))
413    *  dither_mask = 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) |
414    *     0x0808 * (m & 2) | 0x0100 * (m & 1)
415    *  sample_mask = sample_mask & dither_mask
416    *
417    * It gives a number of ones proportional to the alpha for 2, 4, 8 or 16
418    * least significant bits of the result:
419    *  0.0000 0000000000000000
420    *  0.0625 0000000100000000
421    *  0.1250 0001000000010000
422    *  0.1875 0001000100010000
423    *  0.2500 1000100010001000
424    *  0.3125 1000100110001000
425    *  0.3750 1001100010011000
426    *  0.4375 1001100110011000
427    *  0.5000 1010101010101010
428    *  0.5625 1010101110101010
429    *  0.6250 1011101010111010
430    *  0.6875 1011101110111010
431    *  0.7500 1110111011101110
432    *  0.8125 1110111111101110
433    *  0.8750 1111111011111110
434    *  0.9375 1111111111111110
435    *  1.0000 1111111111111111
436    */
437   const fs_builder abld = bld.annotate("compute alpha_to_coverage & "
438      "sample_mask");
439
440   /* clamp(src0_alpha, 0.f, 1.f) */
441   const fs_reg float_tmp = abld.vgrf(BRW_REGISTER_TYPE_F);
442   set_saturate(true, abld.MOV(float_tmp, src0_alpha));
443
444   /* 16.0 * clamp(src0_alpha, 0.0, 1.0) */
445   abld.MUL(float_tmp, float_tmp, brw_imm_f(16.0));
446
447   /* m = int(16.0 * clamp(src0_alpha, 0.0, 1.0)) */
448   const fs_reg m = abld.vgrf(BRW_REGISTER_TYPE_UW);
449   abld.MOV(m, float_tmp);
450
451   /* 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) */
452   const fs_reg int_tmp_1 = abld.vgrf(BRW_REGISTER_TYPE_UW);
453   const fs_reg shift_const = abld.vgrf(BRW_REGISTER_TYPE_UD);
454   abld.MOV(shift_const, brw_imm_d(0xfea80));
455   abld.AND(int_tmp_1, m, brw_imm_uw(~3));
456   abld.SHR(int_tmp_1, shift_const, int_tmp_1);
457   abld.AND(int_tmp_1, int_tmp_1, brw_imm_uw(0xf));
458   abld.MUL(int_tmp_1, int_tmp_1, brw_imm_uw(0x1111));
459
460   /* 0x0808 * (m & 2) */
461   const fs_reg int_tmp_2 = abld.vgrf(BRW_REGISTER_TYPE_UW);
462   abld.AND(int_tmp_2, m, brw_imm_uw(2));
463   abld.MUL(int_tmp_2, int_tmp_2, brw_imm_uw(0x0808));
464
465   abld.OR(int_tmp_1, int_tmp_1, int_tmp_2);
466
467   /* 0x0100 * (m & 1) */
468   const fs_reg int_tmp_3 = abld.vgrf(BRW_REGISTER_TYPE_UW);
469   abld.AND(int_tmp_3, m, brw_imm_uw(1));
470   abld.MUL(int_tmp_3, int_tmp_3, brw_imm_uw(0x0100));
471
472   abld.OR(int_tmp_1, int_tmp_1, int_tmp_3);
473
474   /* sample_mask = sample_mask & dither_mask */
475   const fs_reg mask = abld.vgrf(BRW_REGISTER_TYPE_UD);
476   abld.AND(mask, sample_mask, int_tmp_1);
477   sample_mask = mask;
478}
479
480void
481fs_visitor::emit_fb_writes()
482{
483   assert(stage == MESA_SHADER_FRAGMENT);
484   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
485   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
486
487   fs_inst *inst = NULL;
488
489   if (source_depth_to_render_target && devinfo->gen == 6) {
490      /* For outputting oDepth on gen6, SIMD8 writes have to be used.  This
491       * would require SIMD8 moves of each half to message regs, e.g. by using
492       * the SIMD lowering pass.  Unfortunately this is more difficult than it
493       * sounds because the SIMD8 single-source message lacks channel selects
494       * for the second and third subspans.
495       */
496      limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n");
497   }
498
499   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
500      /* From the 'Render Target Write message' section of the docs:
501       * "Output Stencil is not supported with SIMD16 Render Target Write
502       * Messages."
503       */
504      limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
505                           "in SIMD16+ mode.\n");
506   }
507
508   /* ANV doesn't know about sample mask output during the wm key creation
509    * so we compute if we need replicate alpha and emit alpha to coverage
510    * workaround here.
511    */
512   prog_data->replicate_alpha = key->alpha_test_replicate_alpha ||
513      (key->nr_color_regions > 1 && key->alpha_to_coverage &&
514       (sample_mask.file == BAD_FILE || devinfo->gen == 6));
515
516   /* From the SKL PRM, Volume 7, "Alpha Coverage":
517    *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
518    *   hardware, regardless of the state setting for this feature."
519    */
520   if (devinfo->gen > 6 && key->alpha_to_coverage &&
521       sample_mask.file != BAD_FILE && this->outputs[0].file != BAD_FILE)
522      emit_alpha_to_coverage_workaround(offset(this->outputs[0], bld, 3));
523
524   for (int target = 0; target < key->nr_color_regions; target++) {
525      /* Skip over outputs that weren't written. */
526      if (this->outputs[target].file == BAD_FILE)
527         continue;
528
529      const fs_builder abld = bld.annotate(
530         ralloc_asprintf(this->mem_ctx, "FB write target %d", target));
531
532      fs_reg src0_alpha;
533      if (devinfo->gen >= 6 && prog_data->replicate_alpha && target != 0)
534         src0_alpha = offset(outputs[0], bld, 3);
535
536      inst = emit_single_fb_write(abld, this->outputs[target],
537                                  this->dual_src_output, src0_alpha, 4);
538      inst->target = target;
539   }
540
541   prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE &&
542                                this->outputs[0].file != BAD_FILE);
543   assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);
544
545   if (inst == NULL) {
546      /* Even if there's no color buffers enabled, we still need to send
547       * alpha out the pipeline to our null renderbuffer to support
548       * alpha-testing, alpha-to-coverage, and so on.
549       */
550      /* FINISHME: Factor out this frequently recurring pattern into a
551       * helper function.
552       */
553      const fs_reg srcs[] = { reg_undef, reg_undef,
554                              reg_undef, offset(this->outputs[0], bld, 3) };
555      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
556      bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
557
558      inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4);
559      inst->target = 0;
560   }
561
562   inst->last_rt = true;
563   inst->eot = true;
564}
565
566void
567fs_visitor::setup_uniform_clipplane_values()
568{
569   const struct brw_vs_prog_key *key =
570      (const struct brw_vs_prog_key *) this->key;
571
572   if (key->nr_userclip_plane_consts == 0)
573      return;
574
575   assert(stage_prog_data->nr_params == uniforms);
576   brw_stage_prog_data_add_params(stage_prog_data,
577                                  key->nr_userclip_plane_consts * 4);
578
579   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
580      this->userplane[i] = fs_reg(UNIFORM, uniforms);
581      for (int j = 0; j < 4; ++j) {
582         stage_prog_data->param[uniforms + j] =
583            BRW_PARAM_BUILTIN_CLIP_PLANE(i, j);
584      }
585      uniforms += 4;
586   }
587}
588
589/**
590 * Lower legacy fixed-function and gl_ClipVertex clipping to clip distances.
591 *
592 * This does nothing if the shader uses gl_ClipDistance or user clipping is
593 * disabled altogether.
594 */
595void fs_visitor::compute_clip_distance()
596{
597   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
598   const struct brw_vs_prog_key *key =
599      (const struct brw_vs_prog_key *) this->key;
600
601   /* Bail unless some sort of legacy clipping is enabled */
602   if (key->nr_userclip_plane_consts == 0)
603      return;
604
605   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
606    *
607    *     "If a linked set of shaders forming the vertex stage contains no
608    *     static write to gl_ClipVertex or gl_ClipDistance, but the
609    *     application has requested clipping against user clip planes through
610    *     the API, then the coordinate written to gl_Position is used for
611    *     comparison against the user clip planes."
612    *
613    * This function is only called if the shader didn't write to
614    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
615    * if the user wrote to it; otherwise we use gl_Position.
616    */
617
618   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
619   if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX))
620      clip_vertex = VARYING_SLOT_POS;
621
622   /* If the clip vertex isn't written, skip this.  Typically this means
623    * the GS will set up clipping. */
624   if (outputs[clip_vertex].file == BAD_FILE)
625      return;
626
627   setup_uniform_clipplane_values();
628
629   const fs_builder abld = bld.annotate("user clip distances");
630
631   this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
632   this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
633
634   for (int i = 0; i < key->nr_userclip_plane_consts; i++) {
635      fs_reg u = userplane[i];
636      const fs_reg output = offset(outputs[VARYING_SLOT_CLIP_DIST0 + i / 4],
637                                   bld, i & 3);
638
639      abld.MUL(output, outputs[clip_vertex], u);
640      for (int j = 1; j < 4; j++) {
641         u.nr = userplane[i].nr + j;
642         abld.MAD(output, output, offset(outputs[clip_vertex], bld, j), u);
643      }
644   }
645}
646
647void
648fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
649{
650   int slot, urb_offset, length;
651   int starting_urb_offset = 0;
652   const struct brw_vue_prog_data *vue_prog_data =
653      brw_vue_prog_data(this->prog_data);
654   const struct brw_vs_prog_key *vs_key =
655      (const struct brw_vs_prog_key *) this->key;
656   const GLbitfield64 psiz_mask =
657      VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
658   const struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
659   bool flush;
660   fs_reg sources[8];
661   fs_reg urb_handle;
662
663   if (stage == MESA_SHADER_TESS_EVAL)
664      urb_handle = fs_reg(retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_UD));
665   else
666      urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
667
668   opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
669   int header_size = 1;
670   fs_reg per_slot_offsets;
671
672   if (stage == MESA_SHADER_GEOMETRY) {
673      const struct brw_gs_prog_data *gs_prog_data =
674         brw_gs_prog_data(this->prog_data);
675
676      /* We need to increment the Global Offset to skip over the control data
677       * header and the extra "Vertex Count" field (1 HWord) at the beginning
678       * of the VUE.  We're counting in OWords, so the units are doubled.
679       */
680      starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
681      if (gs_prog_data->static_vertex_count == -1)
682         starting_urb_offset += 2;
683
684      /* We also need to use per-slot offsets.  The per-slot offset is the
685       * Vertex Count.  SIMD8 mode processes 8 different primitives at a
686       * time; each may output a different number of vertices.
687       */
688      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT;
689      header_size++;
690
691      /* The URB offset is in 128-bit units, so we need to multiply by 2 */
692      const int output_vertex_size_owords =
693         gs_prog_data->output_vertex_size_hwords * 2;
694
695      if (gs_vertex_count.file == IMM) {
696         per_slot_offsets = brw_imm_ud(output_vertex_size_owords *
697                                       gs_vertex_count.ud);
698      } else {
699         per_slot_offsets = vgrf(glsl_type::uint_type);
700         bld.MUL(per_slot_offsets, gs_vertex_count,
701                 brw_imm_ud(output_vertex_size_owords));
702      }
703   }
704
705   length = 0;
706   urb_offset = starting_urb_offset;
707   flush = false;
708
709   /* SSO shaders can have VUE slots allocated which are never actually
710    * written to, so ignore them when looking for the last (written) slot.
711    */
712   int last_slot = vue_map->num_slots - 1;
713   while (last_slot > 0 &&
714          (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD ||
715           outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
716      last_slot--;
717   }
718
719   bool urb_written = false;
720   for (slot = 0; slot < vue_map->num_slots; slot++) {
721      int varying = vue_map->slot_to_varying[slot];
722      switch (varying) {
723      case VARYING_SLOT_PSIZ: {
724         /* The point size varying slot is the vue header and is always in the
725          * vue map.  But often none of the special varyings that live there
726          * are written and in that case we can skip writing to the vue
727          * header, provided the corresponding state properly clamps the
728          * values further down the pipeline. */
729         if ((vue_map->slots_valid & psiz_mask) == 0) {
730            assert(length == 0);
731            urb_offset++;
732            break;
733         }
734
735         fs_reg zero(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
736         bld.MOV(zero, brw_imm_ud(0u));
737
738         sources[length++] = zero;
739         if (vue_map->slots_valid & VARYING_BIT_LAYER)
740            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
741         else
742            sources[length++] = zero;
743
744         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
745            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
746         else
747            sources[length++] = zero;
748
749         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
750            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
751         else
752            sources[length++] = zero;
753         break;
754      }
755      case BRW_VARYING_SLOT_NDC:
756      case VARYING_SLOT_EDGE:
757         unreachable("unexpected scalar vs output");
758         break;
759
760      default:
761         /* gl_Position is always in the vue map, but isn't always written by
762          * the shader.  Other varyings (clip distances) get added to the vue
763          * map but don't always get written.  In those cases, the
          * corresponding this->outputs[] slot will be invalid and we can skip
765          * the urb write for the varying.  If we've already queued up a vue
766          * slot for writing we flush a mlen 5 urb write, otherwise we just
767          * advance the urb_offset.
768          */
769         if (varying == BRW_VARYING_SLOT_PAD ||
770             this->outputs[varying].file == BAD_FILE) {
771            if (length > 0)
772               flush = true;
773            else
774               urb_offset++;
775            break;
776         }
777
778         if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color &&
779             (varying == VARYING_SLOT_COL0 ||
780              varying == VARYING_SLOT_COL1 ||
781              varying == VARYING_SLOT_BFC0 ||
782              varying == VARYING_SLOT_BFC1)) {
783            /* We need to clamp these guys, so do a saturating MOV into a
784             * temp register and use that for the payload.
785             */
786            for (int i = 0; i < 4; i++) {
787               fs_reg reg = fs_reg(VGRF, alloc.allocate(1), outputs[varying].type);
788               fs_reg src = offset(this->outputs[varying], bld, i);
789               set_saturate(true, bld.MOV(reg, src));
790               sources[length++] = reg;
791            }
792         } else {
793            for (unsigned i = 0; i < 4; i++)
794               sources[length++] = offset(this->outputs[varying], bld, i);
795         }
796         break;
797      }
798
799      const fs_builder abld = bld.annotate("URB write");
800
801      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
802       * the last slot or if we need to flush (see BAD_FILE varying case
803       * above), emit a URB write send now to flush out the data.
804       */
805      if (length == 8 || (length > 0 && slot == last_slot))
806         flush = true;
807      if (flush) {
808         fs_reg *payload_sources =
809            ralloc_array(mem_ctx, fs_reg, length + header_size);
810         fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size),
811                                 BRW_REGISTER_TYPE_F);
812         payload_sources[0] = urb_handle;
813
814         if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT)
815            payload_sources[1] = per_slot_offsets;
816
817         memcpy(&payload_sources[header_size], sources,
818                length * sizeof sources[0]);
819
820         abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
821                           header_size);
822
823         fs_inst *inst = abld.emit(opcode, reg_undef, payload);
824
825         /* For ICL WA 1805992985 one needs additional write in the end. */
826         if (devinfo->gen == 11 && stage == MESA_SHADER_TESS_EVAL)
827            inst->eot = false;
828         else
829            inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY;
830
831         inst->mlen = length + header_size;
832         inst->offset = urb_offset;
833         urb_offset = starting_urb_offset + slot + 1;
834         length = 0;
835         flush = false;
836         urb_written = true;
837      }
838   }
839
840   /* If we don't have any valid slots to write, just do a minimal urb write
841    * send to terminate the shader.  This includes 1 slot of undefined data,
842    * because it's invalid to write 0 data:
843    *
844    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
845    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
846    * Write Data Payload:
847    *
848    *    "The write data payload can be between 1 and 8 message phases long."
849    */
850   if (!urb_written) {
851      /* For GS, just turn EmitVertex() into a no-op.  We don't want it to
852       * end the thread, and emit_gs_thread_end() already emits a SEND with
853       * EOT at the end of the program for us.
854       */
855      if (stage == MESA_SHADER_GEOMETRY)
856         return;
857
858      fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
859      bld.exec_all().MOV(payload, urb_handle);
860
861      fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
862      inst->eot = true;
863      inst->mlen = 2;
864      inst->offset = 1;
865      return;
866   }
867
868   /* ICL WA 1805992985:
869    *
870    * ICLLP GPU hangs on one of tessellation vkcts tests with DS not done. The
871    * send cycle, which is a urb write with an eot must be 4 phases long and
    * all 8 lanes must be valid.
873    */
874   if (devinfo->gen == 11 && stage == MESA_SHADER_TESS_EVAL) {
875      fs_reg payload = fs_reg(VGRF, alloc.allocate(6), BRW_REGISTER_TYPE_UD);
876
877      /* Workaround requires all 8 channels (lanes) to be valid. This is
878       * understood to mean they all need to be alive. First trick is to find
879       * a live channel and copy its urb handle for all the other channels to
880       * make sure all handles are valid.
881       */
882      bld.exec_all().MOV(payload, bld.emit_uniformize(urb_handle));
883
884      /* Second trick is to use masked URB write where one can tell the HW to
885       * actually write data only for selected channels even though all are
886       * active.
887       * Third trick is to take advantage of the must-be-zero (MBZ) area in
888       * the very beginning of the URB.
889       *
890       * One masks data to be written only for the first channel and uses
891       * offset zero explicitly to land data to the MBZ area avoiding trashing
892       * any other part of the URB.
893       *
894       * Since the WA says that the write needs to be 4 phases long one uses
       * 4 slots data. All are explicitly zeros in order to keep the MBZ
896       * area written as zeros.
897       */
898      bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0x10000u));
899      bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u));
900      bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u));
901      bld.exec_all().MOV(offset(payload, bld, 4), brw_imm_ud(0u));
902      bld.exec_all().MOV(offset(payload, bld, 5), brw_imm_ud(0u));
903
904      fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
905                                          reg_undef, payload);
906      inst->eot = true;
907      inst->mlen = 6;
908      inst->offset = 0;
909   }
910}
911
912void
913fs_visitor::emit_cs_terminate()
914{
915   assert(devinfo->gen >= 7);
916
917   /* We are getting the thread ID from the compute shader header */
918   assert(stage == MESA_SHADER_COMPUTE);
919
920   /* We can't directly send from g0, since sends with EOT have to use
921    * g112-127. So, copy it to a virtual register, The register allocator will
922    * make sure it uses the appropriate register range.
923    */
924   struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
925   fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
926   bld.group(8, 0).exec_all().MOV(payload, g0);
927
928   /* Send a message to the thread spawner to terminate the thread. */
929   fs_inst *inst = bld.exec_all()
930                      .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
931   inst->eot = true;
932}
933
934void
935fs_visitor::emit_barrier()
936{
937   uint32_t barrier_id_mask;
938   switch (devinfo->gen) {
939   case 7:
940   case 8:
941      barrier_id_mask = 0x0f000000u; break;
942   case 9:
943   case 10:
944      barrier_id_mask = 0x8f000000u; break;
945   case 11:
946      barrier_id_mask = 0x7f000000u; break;
947   default:
948      unreachable("barrier is only available on gen >= 7");
949   }
950
951   /* We are getting the barrier ID from the compute shader header */
952   assert(stage == MESA_SHADER_COMPUTE);
953
954   fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
955
956   /* Clear the message payload */
957   bld.exec_all().group(8, 0).MOV(payload, brw_imm_ud(0u));
958
959   /* Copy the barrier id from r0.2 to the message payload reg.2 */
960   fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
961   bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2,
962                                  brw_imm_ud(barrier_id_mask));
963
964   /* Emit a gateway "barrier" message using the payload we set up, followed
965    * by a wait instruction.
966    */
967   bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload);
968}
969
970fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
971                       void *mem_ctx,
972                       const void *key,
973                       struct brw_stage_prog_data *prog_data,
974                       struct gl_program *prog,
975                       const nir_shader *shader,
976                       unsigned dispatch_width,
977                       int shader_time_index,
978                       const struct brw_vue_map *input_vue_map)
979   : backend_shader(compiler, log_data, mem_ctx, shader, prog_data),
980     key(key), gs_compile(NULL), prog_data(prog_data), prog(prog),
981     input_vue_map(input_vue_map),
982     dispatch_width(dispatch_width),
983     shader_time_index(shader_time_index),
984     bld(fs_builder(this, dispatch_width).at_end())
985{
986   init();
987}
988
989fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
990                       void *mem_ctx,
991                       struct brw_gs_compile *c,
992                       struct brw_gs_prog_data *prog_data,
993                       const nir_shader *shader,
994                       int shader_time_index)
995   : backend_shader(compiler, log_data, mem_ctx, shader,
996                    &prog_data->base.base),
997     key(&c->key), gs_compile(c),
998     prog_data(&prog_data->base.base), prog(NULL),
999     dispatch_width(8),
1000     shader_time_index(shader_time_index),
1001     bld(fs_builder(this, dispatch_width).at_end())
1002{
1003   init();
1004}
1005
1006
1007void
1008fs_visitor::init()
1009{
1010   switch (stage) {
1011   case MESA_SHADER_FRAGMENT:
1012      key_tex = &((const brw_wm_prog_key *) key)->tex;
1013      break;
1014   case MESA_SHADER_VERTEX:
1015      key_tex = &((const brw_vs_prog_key *) key)->tex;
1016      break;
1017   case MESA_SHADER_TESS_CTRL:
1018      key_tex = &((const brw_tcs_prog_key *) key)->tex;
1019      break;
1020   case MESA_SHADER_TESS_EVAL:
1021      key_tex = &((const brw_tes_prog_key *) key)->tex;
1022      break;
1023   case MESA_SHADER_GEOMETRY:
1024      key_tex = &((const brw_gs_prog_key *) key)->tex;
1025      break;
1026   case MESA_SHADER_COMPUTE:
1027      key_tex = &((const brw_cs_prog_key*) key)->tex;
1028      break;
1029   default:
1030      unreachable("unhandled shader stage");
1031   }
1032
1033   this->max_dispatch_width = 32;
1034   this->prog_data = this->stage_prog_data;
1035
1036   this->failed = false;
1037
1038   this->nir_locals = NULL;
1039   this->nir_ssa_values = NULL;
1040
1041   memset(&this->payload, 0, sizeof(this->payload));
1042   this->source_depth_to_render_target = false;
1043   this->runtime_check_aads_emit = false;
1044   this->first_non_payload_grf = 0;
1045   this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1046
1047   this->virtual_grf_start = NULL;
1048   this->virtual_grf_end = NULL;
1049   this->live_intervals = NULL;
1050   this->regs_live_at_ip = NULL;
1051
1052   this->uniforms = 0;
1053   this->last_scratch = 0;
1054   this->pull_constant_loc = NULL;
1055   this->push_constant_loc = NULL;
1056
1057   this->promoted_constants = 0,
1058
1059   this->grf_used = 0;
1060   this->spilled_any_registers = false;
1061}
1062
1063fs_visitor::~fs_visitor()
1064{
1065}
1066