brw_fs_nir.cpp revision 9f464c52
1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "compiler/glsl/ir.h"
25#include "brw_fs.h"
26#include "brw_nir.h"
27#include "nir_search_helpers.h"
28#include "util/u_math.h"
29#include "util/bitscan.h"
30
31using namespace brw;
32
33void
34fs_visitor::emit_nir_code()
35{
36   /* emit the arrays used for inputs and outputs - load/store intrinsics will
37    * be converted to reads/writes of these arrays
38    */
39   nir_setup_outputs();
40   nir_setup_uniforms();
41   nir_emit_system_values();
42
43   nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
44}
45
46void
47fs_visitor::nir_setup_outputs()
48{
49   if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
50      return;
51
52   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
53
54   /* Calculate the size of output registers in a separate pass, before
55    * allocating them.  With ARB_enhanced_layouts, multiple output variables
56    * may occupy the same slot, but have different type sizes.
57    */
58   nir_foreach_variable(var, &nir->outputs) {
59      const int loc = var->data.driver_location;
60      const unsigned var_vec4s =
61         var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
62                           : type_size_vec4(var->type, true);
63      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
64   }
65
66   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
67      if (vec4s[loc] == 0) {
68         loc++;
69         continue;
70      }
71
72      unsigned reg_size = vec4s[loc];
73
74      /* Check if there are any ranges that start within this range and extend
75       * past it. If so, include them in this allocation.
76       */
77      for (unsigned i = 1; i < reg_size; i++)
78         reg_size = MAX2(vec4s[i + loc] + i, reg_size);
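      /* Worked example (illustrative): if vec4s[] held { 3, 0, 2, 0 } starting
       * at loc, the range at loc + 2 (size 2) starts inside the loc range
       * (size 3) and extends one slot past it, so reg_size grows from 3 to 4
       * and a single 4-vec4 allocation covers both ranges.
       */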
79
80      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
81      for (unsigned i = 0; i < reg_size; i++)
82         outputs[loc + i] = offset(reg, bld, 4 * i);
83
84      loc += reg_size;
85   }
86}
87
88void
89fs_visitor::nir_setup_uniforms()
90{
91   /* Only the first compile gets to set up uniforms. */
92   if (push_constant_loc) {
93      assert(pull_constant_loc);
94      return;
95   }
96
97   uniforms = nir->num_uniforms / 4;
98
99   if (stage == MESA_SHADER_COMPUTE) {
100      /* Add a uniform for the thread local id.  It must be the last uniform
101       * on the list.
102       */
103      assert(uniforms == prog_data->nr_params);
104      uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
105      *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
106      subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
107   }
108}
109
110static bool
111emit_system_values_block(nir_block *block, fs_visitor *v)
112{
113   fs_reg *reg;
114
115   nir_foreach_instr(instr, block) {
116      if (instr->type != nir_instr_type_intrinsic)
117         continue;
118
119      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
120      switch (intrin->intrinsic) {
121      case nir_intrinsic_load_vertex_id:
122      case nir_intrinsic_load_base_vertex:
123         unreachable("should be lowered by nir_lower_system_values().");
124
125      case nir_intrinsic_load_vertex_id_zero_base:
126      case nir_intrinsic_load_is_indexed_draw:
127      case nir_intrinsic_load_first_vertex:
128      case nir_intrinsic_load_instance_id:
129      case nir_intrinsic_load_base_instance:
130      case nir_intrinsic_load_draw_id:
131         unreachable("should be lowered by brw_nir_lower_vs_inputs().");
132
133      case nir_intrinsic_load_invocation_id:
134         if (v->stage == MESA_SHADER_TESS_CTRL)
135            break;
136         assert(v->stage == MESA_SHADER_GEOMETRY);
137         reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
138         if (reg->file == BAD_FILE) {
139            const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
140            fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
141            fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
142            abld.SHR(iid, g1, brw_imm_ud(27u));
143            *reg = iid;
144         }
145         break;
146
147      case nir_intrinsic_load_sample_pos:
148         assert(v->stage == MESA_SHADER_FRAGMENT);
149         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
150         if (reg->file == BAD_FILE)
151            *reg = *v->emit_samplepos_setup();
152         break;
153
154      case nir_intrinsic_load_sample_id:
155         assert(v->stage == MESA_SHADER_FRAGMENT);
156         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
157         if (reg->file == BAD_FILE)
158            *reg = *v->emit_sampleid_setup();
159         break;
160
161      case nir_intrinsic_load_sample_mask_in:
162         assert(v->stage == MESA_SHADER_FRAGMENT);
163         assert(v->devinfo->gen >= 7);
164         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
165         if (reg->file == BAD_FILE)
166            *reg = *v->emit_samplemaskin_setup();
167         break;
168
169      case nir_intrinsic_load_work_group_id:
170         assert(v->stage == MESA_SHADER_COMPUTE);
171         reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
172         if (reg->file == BAD_FILE)
173            *reg = *v->emit_cs_work_group_id_setup();
174         break;
175
176      case nir_intrinsic_load_helper_invocation:
177         assert(v->stage == MESA_SHADER_FRAGMENT);
178         reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
179         if (reg->file == BAD_FILE) {
180            const fs_builder abld =
181               v->bld.annotate("gl_HelperInvocation", NULL);
182
183            /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
184             * pixel mask is in g1.7 of the thread payload.
185             *
186             * We move the per-channel pixel enable bit to the low bit of each
187             * channel by shifting the byte containing the pixel mask by the
188             * vector immediate 0x76543210UV.
189             *
190             * The region of <1,8,0> reads only 1 byte (the pixel masks for
191             * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
192             * masks for 2 and 3) in SIMD16.
193             */
194            fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
195
196            for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
197               const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
198               hbld.SHR(offset(shifted, hbld, i),
199                        stride(retype(brw_vec1_grf(1 + i, 7),
200                                      BRW_REGISTER_TYPE_UB),
201                               1, 8, 0),
202                        brw_imm_v(0x76543210));
203            }
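            /* Illustrative example: if the pixel-mask byte were 0b10110101,
             * channel 2 shifts it right by 2 (its entry in 0x76543210UV) and
             * ends up with mask bit 2 in its low bit (set, a real invocation),
             * while channel 3 ends up with mask bit 3 (clear, a helper).
             */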
204
205            /* A set bit in the pixel mask means the channel is enabled, but
206             * that is the opposite of gl_HelperInvocation so we need to invert
207             * the mask.
208             *
209             * The negate source-modifier bit of logical instructions on Gen8+
210             * performs 1's complement negation, so we can use that instead of
211             * a NOT instruction.
212             */
213            fs_reg inverted = negate(shifted);
214            if (v->devinfo->gen < 8) {
215               inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
216               abld.NOT(inverted, shifted);
217            }
218
219            /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
220             * with 1 and negating.
221             */
222            fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
223            abld.AND(anded, inverted, brw_imm_uw(1));
224
225            fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
226            abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
227            *reg = dst;
228         }
229         break;
230
231      default:
232         break;
233      }
234   }
235
236   return true;
237}
238
239void
240fs_visitor::nir_emit_system_values()
241{
242   nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
243   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
244      nir_system_values[i] = fs_reg();
245   }
246
247   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
248    * never end up using it.
249    */
250   {
251      const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
252      fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
253      reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
254
255      const fs_builder allbld8 = abld.group(8, 0).exec_all();
256      allbld8.MOV(reg, brw_imm_v(0x76543210));
257      if (dispatch_width > 8)
258         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
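      /* Illustrative result: in SIMD16 the vector immediate fills channels 0-7
       * with 0..7 and the ADD above writes 8..15 into channels 8-15; SIMD32
       * repeats the trick below with a further +16.
       */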
259      if (dispatch_width > 16) {
260         const fs_builder allbld16 = abld.group(16, 0).exec_all();
261         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
262      }
263   }
264
265   nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
266   nir_foreach_block(block, impl)
267      emit_system_values_block(block, this);
268}
269
270/*
271 * Returns a type based on a reference_type (word, float, half-float) and a
272 * given bit_size.
273 *
274 * Reference BRW_REGISTER_TYPEs are HF/F/DF, B/W/D/Q and UB/UW/UD/UQ.
275 *
276 * @FIXME: 64-bit return types are always DF on integer types to maintain
277 * compatibility with uses of DF prior to the introduction of int64
278 * support.
279 */
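/* Example (illustrative): brw_reg_type_from_bit_size(16, BRW_REGISTER_TYPE_D)
 * returns BRW_REGISTER_TYPE_W, i.e. the signed integer type of the requested
 * bit size.
 */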
280static brw_reg_type
281brw_reg_type_from_bit_size(const unsigned bit_size,
282                           const brw_reg_type reference_type)
283{
284   switch(reference_type) {
285   case BRW_REGISTER_TYPE_HF:
286   case BRW_REGISTER_TYPE_F:
287   case BRW_REGISTER_TYPE_DF:
288      switch(bit_size) {
289      case 16:
290         return BRW_REGISTER_TYPE_HF;
291      case 32:
292         return BRW_REGISTER_TYPE_F;
293      case 64:
294         return BRW_REGISTER_TYPE_DF;
295      default:
296         unreachable("Invalid bit size");
297      }
298   case BRW_REGISTER_TYPE_B:
299   case BRW_REGISTER_TYPE_W:
300   case BRW_REGISTER_TYPE_D:
301   case BRW_REGISTER_TYPE_Q:
302      switch(bit_size) {
303      case 8:
304         return BRW_REGISTER_TYPE_B;
305      case 16:
306         return BRW_REGISTER_TYPE_W;
307      case 32:
308         return BRW_REGISTER_TYPE_D;
309      case 64:
310         return BRW_REGISTER_TYPE_Q;
311      default:
312         unreachable("Invalid bit size");
313      }
314   case BRW_REGISTER_TYPE_UB:
315   case BRW_REGISTER_TYPE_UW:
316   case BRW_REGISTER_TYPE_UD:
317   case BRW_REGISTER_TYPE_UQ:
318      switch(bit_size) {
319      case 8:
320         return BRW_REGISTER_TYPE_UB;
321      case 16:
322         return BRW_REGISTER_TYPE_UW;
323      case 32:
324         return BRW_REGISTER_TYPE_UD;
325      case 64:
326         return BRW_REGISTER_TYPE_UQ;
327      default:
328         unreachable("Invalid bit size");
329      }
330   default:
331      unreachable("Unknown type");
332   }
333}
334
335void
336fs_visitor::nir_emit_impl(nir_function_impl *impl)
337{
338   nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
339   for (unsigned i = 0; i < impl->reg_alloc; i++) {
340      nir_locals[i] = fs_reg();
341   }
342
343   foreach_list_typed(nir_register, reg, node, &impl->registers) {
344      unsigned array_elems =
345         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
346      unsigned size = array_elems * reg->num_components;
347      const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B :
348         brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
349      nir_locals[reg->index] = bld.vgrf(reg_type, size);
350   }
351
352   nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
353                             impl->ssa_alloc);
354
355   nir_emit_cf_list(&impl->body);
356}
357
358void
359fs_visitor::nir_emit_cf_list(exec_list *list)
360{
361   exec_list_validate(list);
362   foreach_list_typed(nir_cf_node, node, node, list) {
363      switch (node->type) {
364      case nir_cf_node_if:
365         nir_emit_if(nir_cf_node_as_if(node));
366         break;
367
368      case nir_cf_node_loop:
369         nir_emit_loop(nir_cf_node_as_loop(node));
370         break;
371
372      case nir_cf_node_block:
373         nir_emit_block(nir_cf_node_as_block(node));
374         break;
375
376      default:
377         unreachable("Invalid CFG node block");
378      }
379   }
380}
381
382void
383fs_visitor::nir_emit_if(nir_if *if_stmt)
384{
385   bool invert;
386   fs_reg cond_reg;
387
388   /* If the condition has the form !other_condition, use other_condition as
389    * the source, but invert the predicate on the if instruction.
390    */
391   nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
392   if (cond != NULL && cond->op == nir_op_inot) {
393      assert(!cond->src[0].negate);
394      assert(!cond->src[0].abs);
395
396      invert = true;
397      cond_reg = get_nir_src(cond->src[0].src);
398   } else {
399      invert = false;
400      cond_reg = get_nir_src(if_stmt->condition);
401   }
402
403   /* first, put the condition into f0 */
404   fs_inst *inst = bld.MOV(bld.null_reg_d(),
405                           retype(cond_reg, BRW_REGISTER_TYPE_D));
406   inst->conditional_mod = BRW_CONDITIONAL_NZ;
407
408   bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;
409
410   nir_emit_cf_list(&if_stmt->then_list);
411
412   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
413      bld.emit(BRW_OPCODE_ELSE);
414      nir_emit_cf_list(&if_stmt->else_list);
415   }
416
417   bld.emit(BRW_OPCODE_ENDIF);
418
419   if (devinfo->gen < 7)
420      limit_dispatch_width(16, "Non-uniform control flow unsupported "
421                           "in SIMD32 mode.");
422}
423
424void
425fs_visitor::nir_emit_loop(nir_loop *loop)
426{
427   bld.emit(BRW_OPCODE_DO);
428
429   nir_emit_cf_list(&loop->body);
430
431   bld.emit(BRW_OPCODE_WHILE);
432
433   if (devinfo->gen < 7)
434      limit_dispatch_width(16, "Non-uniform control flow unsupported "
435                           "in SIMD32 mode.");
436}
437
438void
439fs_visitor::nir_emit_block(nir_block *block)
440{
441   nir_foreach_instr(instr, block) {
442      nir_emit_instr(instr);
443   }
444}
445
446void
447fs_visitor::nir_emit_instr(nir_instr *instr)
448{
449   const fs_builder abld = bld.annotate(NULL, instr);
450
451   switch (instr->type) {
452   case nir_instr_type_alu:
453      nir_emit_alu(abld, nir_instr_as_alu(instr));
454      break;
455
456   case nir_instr_type_deref:
457      unreachable("All derefs should've been lowered");
458      break;
459
460   case nir_instr_type_intrinsic:
461      switch (stage) {
462      case MESA_SHADER_VERTEX:
463         nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
464         break;
465      case MESA_SHADER_TESS_CTRL:
466         nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
467         break;
468      case MESA_SHADER_TESS_EVAL:
469         nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
470         break;
471      case MESA_SHADER_GEOMETRY:
472         nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
473         break;
474      case MESA_SHADER_FRAGMENT:
475         nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
476         break;
477      case MESA_SHADER_COMPUTE:
478         nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
479         break;
480      default:
481         unreachable("unsupported shader stage");
482      }
483      break;
484
485   case nir_instr_type_tex:
486      nir_emit_texture(abld, nir_instr_as_tex(instr));
487      break;
488
489   case nir_instr_type_load_const:
490      nir_emit_load_const(abld, nir_instr_as_load_const(instr));
491      break;
492
493   case nir_instr_type_ssa_undef:
494      /* We create a new VGRF for undefs on every use (by handling
495       * them in get_nir_src()), rather than for each definition.
496       * This helps register coalescing eliminate MOVs from undef.
497       */
498      break;
499
500   case nir_instr_type_jump:
501      nir_emit_jump(abld, nir_instr_as_jump(instr));
502      break;
503
504   default:
505      unreachable("unknown instruction type");
506   }
507}
508
509/**
510 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
511 * match instr.
512 */
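/* Example (illustrative): i2f32(extract_u8(x, 2)) can then be emitted as a
 * single MOV that reads byte 2 of x via subscript(), instead of an extract
 * followed by a separate conversion.
 */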
513bool
514fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
515                                      const fs_reg &result)
516{
517   if (!instr->src[0].src.is_ssa ||
518       !instr->src[0].src.ssa->parent_instr)
519      return false;
520
521   if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
522      return false;
523
524   nir_alu_instr *src0 =
525      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
526
527   if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
528       src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
529      return false;
530
531   /* If either opcode has source modifiers, bail.
532    *
533    * TODO: We can potentially handle source modifiers if both of the opcodes
534    * we're combining are signed integers.
535    */
536   if (instr->src[0].abs || instr->src[0].negate ||
537       src0->src[0].abs || src0->src[0].negate)
538      return false;
539
540   unsigned element = nir_src_as_uint(src0->src[1].src);
541
542   /* Element type to extract. */
543   const brw_reg_type type = brw_int_type(
544      src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
545      src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
546
547   fs_reg op0 = get_nir_src(src0->src[0].src);
548   op0.type = brw_type_for_nir_type(devinfo,
549      (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
550                     nir_src_bit_size(src0->src[0].src)));
551   op0 = offset(op0, bld, src0->src[0].swizzle[0]);
552
553   set_saturate(instr->dest.saturate,
554                bld.MOV(result, subscript(op0, type, element)));
555   return true;
556}
557
558bool
559fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
560                                         const fs_reg &result)
561{
562   nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
563   if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
564      return false;
565
566   if (!nir_src_is_const(instr->src[1].src) ||
567       !nir_src_is_const(instr->src[2].src))
568      return false;
569
570   const float value1 = nir_src_as_float(instr->src[1].src);
571   const float value2 = nir_src_as_float(instr->src[2].src);
572   if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
573      return false;
574
575   /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
576   assert(value1 == -value2);
577
578   fs_reg tmp = vgrf(glsl_type::int_type);
579
580   if (devinfo->gen >= 6) {
581      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
582      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
583
584      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
585       *
586       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
587       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
588       *
589       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
590       *
591       * This negation looks like it's safe in practice, because bits 0:4 will
592       * surely be TRIANGLES.
593       */
594
595      if (value1 == -1.0f) {
596         g0.negate = true;
597      }
598
599      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
600             g0, brw_imm_uw(0x3f80));
601   } else {
602      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
603      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
604
605      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
606       *
607       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
608       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
609       *
610       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
611       *
612       * This negation looks like it's safe in practice, because bits 0:4 will
613       * surely be TRIANGLES.
614       */
615
616      if (value1 == -1.0f) {
617         g1_6.negate = true;
618      }
619
620      bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
621   }
622   bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
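   /* Illustrative walk-through: the OR above plants the exponent bits of 1.0f
    * next to the facing bit, so after masking with 0xbf800000 the result is
    * 0x3f800000 = 1.0f when the facing bit is clear (front-facing) and
    * 0xbf800000 = -1.0f when it is set (the source negate above handles the
    * swapped 1.0/-1.0 ordering).
    */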
623
624   return true;
625}
626
627static void
628emit_find_msb_using_lzd(const fs_builder &bld,
629                        const fs_reg &result,
630                        const fs_reg &src,
631                        bool is_signed)
632{
633   fs_inst *inst;
634   fs_reg temp = src;
635
636   if (is_signed) {
637      /* LZD of an absolute value source almost always does the right
638       * thing.  There are two problem values:
639       *
640       * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
641       *   0.  However, findMSB(int(0x80000000)) == 30.
642       *
643       * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
644       *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
645       *
646       *    For a value of zero or negative one, -1 will be returned.
647       *
648       * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
649       *   findMSB(-(1<<x)) should return x-1.
650       *
651       * For all negative number cases, including 0x80000000 and
652       * 0xffffffff, the correct value is obtained from LZD if instead of
653       * negating the (already negative) value the logical-not is used.  A
654       * conditional logical-not can be achieved in two instructions.
655       */
656      temp = bld.vgrf(BRW_REGISTER_TYPE_D);
657
658      bld.ASR(temp, src, brw_imm_d(31));
659      bld.XOR(temp, temp, src);
660   }
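   /* Illustrative example of the conditional logical-not above: for
    * src = -16 (0xfffffff0), the ASR gives 0xffffffff and the XOR yields
    * 0x0000000f, so the LZD path below computes 31 - 28 = 3, which matches
    * findMSB(-16).  For non-negative src the ASR gives 0 and the XOR is a
    * no-op.
    */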
661
662   bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
663           retype(temp, BRW_REGISTER_TYPE_UD));
664
665   /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
666    * from the LSB side. Subtract the result from 31 to convert the MSB
667    * count into an LSB count.  If no bits are set, LZD will return 32.
668    * 31-32 = -1, which is exactly what findMSB() is supposed to return.
669    */
670   inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
671   inst->src[0].negate = true;
672}
673
674static brw_rnd_mode
675brw_rnd_mode_from_nir_op (const nir_op op) {
676   switch (op) {
677   case nir_op_f2f16_rtz:
678      return BRW_RND_MODE_RTZ;
679   case nir_op_f2f16_rtne:
680      return BRW_RND_MODE_RTNE;
681   default:
682      unreachable("Operation doesn't support rounding mode");
683   }
684}
685
686fs_reg
687fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
688                                                nir_alu_instr *instr,
689                                                fs_reg *op,
690                                                bool need_dest)
691{
692   fs_reg result =
693      need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud();
694
695   result.type = brw_type_for_nir_type(devinfo,
696      (nir_alu_type)(nir_op_infos[instr->op].output_type |
697                     nir_dest_bit_size(instr->dest.dest)));
698
699   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
700      op[i] = get_nir_src(instr->src[i].src);
701      op[i].type = brw_type_for_nir_type(devinfo,
702         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
703                        nir_src_bit_size(instr->src[i].src)));
704      op[i].abs = instr->src[i].abs;
705      op[i].negate = instr->src[i].negate;
706   }
707
708    * Move and vecN instructions may still be vectored.  Return the raw,
709    * vectored source and destination so that fs_visitor::nir_emit_alu can
710    * handle it.  Other callers should not have to handle these kinds of
711    * instructions.
712    */
713   switch (instr->op) {
714   case nir_op_imov:
715   case nir_op_fmov:
716   case nir_op_vec2:
717   case nir_op_vec3:
718   case nir_op_vec4:
719      return result;
720   default:
721      break;
722   }
723
724   /* At this point, we have dealt with any instruction that operates on
725    * more than a single channel.  Therefore, we can just adjust the source
726    * and destination registers for that channel and emit the instruction.
727    */
728   unsigned channel = 0;
729   if (nir_op_infos[instr->op].output_size == 0) {
730      /* Since NIR is doing the scalarizing for us, we should only ever see
731       * vectorized operations with a single channel.
732       */
733      assert(util_bitcount(instr->dest.write_mask) == 1);
734      channel = ffs(instr->dest.write_mask) - 1;
735
736      result = offset(result, bld, channel);
737   }
738
739   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
740      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
741      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
742   }
743
744   return result;
745}
746
747void
748fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
749                                 fs_reg *op)
750{
751   for (unsigned i = 0; i < 2; i++) {
752      nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
753
754      if (inot_instr != NULL && inot_instr->op == nir_op_inot &&
755          !inot_instr->src[0].abs && !inot_instr->src[0].negate) {
756         /* The source of the inot is now the source of instr. */
757         prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);
758
759         assert(!op[i].negate);
760         op[i].negate = true;
761      } else {
762         op[i] = resolve_source_modifiers(op[i]);
763      }
764   }
765}
766
767bool
768fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
769                                  fs_reg result,
770                                  nir_alu_instr *instr)
771{
772   if (devinfo->gen < 6 || devinfo->gen >= 12)
773      return false;
774
775   nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
776
777   if (inot_instr == NULL || inot_instr->op != nir_op_inot)
778      return false;
779
780   /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
781    * of valid size-changing combinations is a bit more complex.
782    *
783    * The source restriction is just because I was lazy about generating the
784    * constant below.
785    */
786   if (nir_dest_bit_size(instr->dest.dest) != 32 ||
787       nir_src_bit_size(inot_instr->src[0].src) != 32)
788      return false;
789
790   /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
791    * this is float(1 + a).
792    */
793   fs_reg op;
794
795   prepare_alu_destination_and_sources(bld, inot_instr, &op, false);
796
797   /* Ignore the saturate modifier, if there is one.  The result of the
798    * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
799    */
800   bld.ADD(result, op, brw_imm_d(1));
801
802   return true;
803}
804
805/**
806 * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
807 *
808 * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
809 * the source of \c instr that is a \c nir_op_fsign.
810 */
811void
812fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr,
813                       fs_reg result, fs_reg *op, unsigned fsign_src)
814{
815   fs_inst *inst;
816
817   assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
818   assert(fsign_src < nir_op_infos[instr->op].num_inputs);
819
820   if (instr->op != nir_op_fsign) {
821      const nir_alu_instr *const fsign_instr =
822         nir_src_as_alu_instr(instr->src[fsign_src].src);
823
824      assert(!fsign_instr->dest.saturate);
825
826      /* op[fsign_src] has the nominal result of the fsign, and op[1 -
827       * fsign_src] has the other multiply source.  This must be rearranged so
828       * that op[0] is the source of the fsign and op[1] is the other multiply
829       * source.
830       */
831      if (fsign_src != 0)
832         op[1] = op[0];
833
834      op[0] = get_nir_src(fsign_instr->src[0].src);
835
836      const nir_alu_type t =
837         (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
838                        nir_src_bit_size(fsign_instr->src[0].src));
839
840      op[0].type = brw_type_for_nir_type(devinfo, t);
841      op[0].abs = fsign_instr->src[0].abs;
842      op[0].negate = fsign_instr->src[0].negate;
843
844      unsigned channel = 0;
845      if (nir_op_infos[instr->op].output_size == 0) {
846         /* Since NIR is doing the scalarizing for us, we should only ever see
847          * vectorized operations with a single channel.
848          */
849         assert(util_bitcount(instr->dest.write_mask) == 1);
850         channel = ffs(instr->dest.write_mask) - 1;
851      }
852
853      op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
854   } else {
855      assert(!instr->dest.saturate);
856   }
857
858   if (op[0].abs) {
859      /* Straightforward since the source can be assumed to be either strictly
860       * >= 0 or strictly <= 0 depending on the setting of the negate flag.
861       */
862      set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
863
864      if (instr->op == nir_op_fsign) {
865         inst = (op[0].negate)
866            ? bld.MOV(result, brw_imm_f(-1.0f))
867            : bld.MOV(result, brw_imm_f(1.0f));
868      } else {
869         op[1].negate = (op[0].negate != op[1].negate);
870         inst = bld.MOV(result, op[1]);
871      }
872
873      set_predicate(BRW_PREDICATE_NORMAL, inst);
874   } else if (type_sz(op[0].type) == 2) {
875      /* AND(val, 0x8000) gives the sign bit.
876       *
877       * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
878       */
879      fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
880      bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
881
882      op[0].type = BRW_REGISTER_TYPE_UW;
883      result.type = BRW_REGISTER_TYPE_UW;
884      bld.AND(result, op[0], brw_imm_uw(0x8000u));
885
886      if (instr->op == nir_op_fsign)
887         inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
888      else {
889         /* Use XOR here to get the result sign correct. */
890         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
891      }
892
893      inst->predicate = BRW_PREDICATE_NORMAL;
894   } else if (type_sz(op[0].type) == 4) {
895      /* AND(val, 0x80000000) gives the sign bit.
896       *
897       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
898       * zero.
899       */
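      /* Illustrative example: for op[0] = -2.5f (0xc0200000) the AND keeps
       * 0x80000000, and the predicated OR produces 0xbf800000 = -1.0f, which
       * is sign(-2.5f).
       */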
900      bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
901
902      op[0].type = BRW_REGISTER_TYPE_UD;
903      result.type = BRW_REGISTER_TYPE_UD;
904      bld.AND(result, op[0], brw_imm_ud(0x80000000u));
905
906      if (instr->op == nir_op_fsign)
907         inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
908      else {
909         /* Use XOR here to get the result sign correct. */
910         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
911      }
912
913      inst->predicate = BRW_PREDICATE_NORMAL;
914   } else {
915      /* For doubles we do the same but we need to consider:
916       *
917       * - 2-src instructions can't operate with 64-bit immediates
918       * - The sign is encoded in the high 32 bits of each DF
919       * - We need to produce a DF result.
920       */
921
922      fs_reg zero = vgrf(glsl_type::double_type);
923      bld.MOV(zero, setup_imm_df(bld, 0.0));
924      bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
925
926      bld.MOV(result, zero);
927
928      fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
929      bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
930              brw_imm_ud(0x80000000u));
931
932      if (instr->op == nir_op_fsign) {
933         set_predicate(BRW_PREDICATE_NORMAL,
934                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
935      } else {
936         /* This could be done better in some cases.  If the scale is an
937          * immediate with the low 32-bits all 0, emitting a separate XOR and
938          * OR would allow an algebraic optimization to remove the OR.  There
939          * are currently zero instances of fsign(double(x))*IMM in shader-db
940          * or any test suite, so it is hard to care at this time.
941          */
942         fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
943         inst = bld.XOR(result_int64, result_int64,
944                        retype(op[1], BRW_REGISTER_TYPE_UQ));
945      }
946   }
947}
948
949/**
950 * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
951 *
952 * Checks the operands of a \c nir_op_fmul to determine whether or not
953 * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
954 *
955 * \param instr  The multiplication instruction
956 *
957 * \param fsign_src The source of \c instr that may or may not be a
958 *                  \c nir_op_fsign
959 */
960static bool
961can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
962{
963   assert(instr->op == nir_op_fmul);
964
965   nir_alu_instr *const fsign_instr =
966      nir_src_as_alu_instr(instr->src[fsign_src].src);
967
968   /* Rules:
969    *
970    * 1. instr->src[fsign_src] must be a nir_op_fsign.
971    * 2. The nir_op_fsign can only be used by this multiplication.
972    * 3. The source that is the nir_op_fsign does not have source modifiers.
973    *    \c emit_fsign only examines the source modifiers of the source of the
974    *    \c nir_op_fsign.
975    *
976    * The nir_op_fsign must also not have the saturate modifier, but steps
977    * have already been taken (in nir_opt_algebraic) to ensure that.
978    */
979   return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
980          is_used_once(fsign_instr) &&
981          !instr->src[fsign_src].abs && !instr->src[fsign_src].negate;
982}
983
984void
985fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
986{
987   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
988   fs_inst *inst;
989
990   fs_reg op[4];
991   fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, true);
992
993   switch (instr->op) {
994   case nir_op_imov:
995   case nir_op_fmov:
996   case nir_op_vec2:
997   case nir_op_vec3:
998   case nir_op_vec4: {
999      fs_reg temp = result;
1000      bool need_extra_copy = false;
1001      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1002         if (!instr->src[i].src.is_ssa &&
1003             instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
1004            need_extra_copy = true;
1005            temp = bld.vgrf(result.type, 4);
1006            break;
1007         }
1008      }
1009
1010      for (unsigned i = 0; i < 4; i++) {
1011         if (!(instr->dest.write_mask & (1 << i)))
1012            continue;
1013
1014         if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
1015            inst = bld.MOV(offset(temp, bld, i),
1016                           offset(op[0], bld, instr->src[0].swizzle[i]));
1017         } else {
1018            inst = bld.MOV(offset(temp, bld, i),
1019                           offset(op[i], bld, instr->src[i].swizzle[0]));
1020         }
1021         inst->saturate = instr->dest.saturate;
1022      }
1023
1024      /* In this case the source and destination registers were the same,
1025       * so we need to insert an extra set of moves in order to deal with
1026       * any swizzling.
1027       */
1028      if (need_extra_copy) {
1029         for (unsigned i = 0; i < 4; i++) {
1030            if (!(instr->dest.write_mask & (1 << i)))
1031               continue;
1032
1033            bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1034         }
1035      }
1036      return;
1037   }
1038
1039   case nir_op_i2f32:
1040   case nir_op_u2f32:
1041      if (optimize_extract_to_float(instr, result))
1042         return;
1043      inst = bld.MOV(result, op[0]);
1044      inst->saturate = instr->dest.saturate;
1045      break;
1046
1047   case nir_op_f2f16_rtne:
1048   case nir_op_f2f16_rtz:
1049      bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1050               brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
1051      /* fallthrough */
1052   case nir_op_f2f16:
1053      /* In theory, it would be better to use BRW_OPCODE_F32TO16.  Depending
1054       * on the HW generation it is either a dedicated opcode or just a MOV,
1055       * and brw_F32TO16 (in brw_eu_emit) would pick the right one.
1056       *
1057       * But using that opcode would require supporting it in the various
1058       * optimizations and lowerings.  Since HF support currently exists only
1059       * on gen8+, it is simpler to emit the MOV directly, and to switch to
1060       * BRW_OPCODE_F32TO16 if/when HF support is added for gen7.
1061       */
1062      assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1063      inst = bld.MOV(result, op[0]);
1064      inst->saturate = instr->dest.saturate;
1065      break;
1066
1067   case nir_op_b2i8:
1068   case nir_op_b2i16:
1069   case nir_op_b2i32:
1070   case nir_op_b2i64:
1071   case nir_op_b2f16:
1072   case nir_op_b2f32:
1073   case nir_op_b2f64:
1074      if (try_emit_b2fi_of_inot(bld, result, instr))
1075         break;
1076      op[0].type = BRW_REGISTER_TYPE_D;
1077      op[0].negate = !op[0].negate;
1078      /* fallthrough */
1079   case nir_op_i2f64:
1080   case nir_op_i2i64:
1081   case nir_op_u2f64:
1082   case nir_op_u2u64:
1083   case nir_op_f2f64:
1084   case nir_op_f2i64:
1085   case nir_op_f2u64:
1086   case nir_op_i2i32:
1087   case nir_op_u2u32:
1088   case nir_op_f2f32:
1089   case nir_op_f2i32:
1090   case nir_op_f2u32:
1091   case nir_op_i2f16:
1092   case nir_op_i2i16:
1093   case nir_op_u2f16:
1094   case nir_op_u2u16:
1095   case nir_op_f2i16:
1096   case nir_op_f2u16:
1097   case nir_op_i2i8:
1098   case nir_op_u2u8:
1099   case nir_op_f2i8:
1100   case nir_op_f2u8:
1101      if (result.type == BRW_REGISTER_TYPE_B ||
1102          result.type == BRW_REGISTER_TYPE_UB ||
1103          result.type == BRW_REGISTER_TYPE_HF)
1104         assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1105
1106      if (op[0].type == BRW_REGISTER_TYPE_B ||
1107          op[0].type == BRW_REGISTER_TYPE_UB ||
1108          op[0].type == BRW_REGISTER_TYPE_HF)
1109         assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
1110
1111      inst = bld.MOV(result, op[0]);
1112      inst->saturate = instr->dest.saturate;
1113      break;
1114
1115   case nir_op_fsign:
1116      emit_fsign(bld, instr, result, op, 0);
1117      break;
1118
1119   case nir_op_frcp:
1120      inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
1121      inst->saturate = instr->dest.saturate;
1122      break;
1123
1124   case nir_op_fexp2:
1125      inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
1126      inst->saturate = instr->dest.saturate;
1127      break;
1128
1129   case nir_op_flog2:
1130      inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
1131      inst->saturate = instr->dest.saturate;
1132      break;
1133
1134   case nir_op_fsin:
1135      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
1136      inst->saturate = instr->dest.saturate;
1137      break;
1138
1139   case nir_op_fcos:
1140      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
1141      inst->saturate = instr->dest.saturate;
1142      break;
1143
1144   case nir_op_fddx:
1145      if (fs_key->high_quality_derivatives) {
1146         inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1147      } else {
1148         inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1149      }
1150      inst->saturate = instr->dest.saturate;
1151      break;
1152   case nir_op_fddx_fine:
1153      inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1154      inst->saturate = instr->dest.saturate;
1155      break;
1156   case nir_op_fddx_coarse:
1157      inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1158      inst->saturate = instr->dest.saturate;
1159      break;
1160   case nir_op_fddy:
1161      if (fs_key->high_quality_derivatives) {
1162         inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1163      } else {
1164         inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1165      }
1166      inst->saturate = instr->dest.saturate;
1167      break;
1168   case nir_op_fddy_fine:
1169      inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1170      inst->saturate = instr->dest.saturate;
1171      break;
1172   case nir_op_fddy_coarse:
1173      inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1174      inst->saturate = instr->dest.saturate;
1175      break;
1176
1177   case nir_op_iadd:
1178   case nir_op_fadd:
1179      inst = bld.ADD(result, op[0], op[1]);
1180      inst->saturate = instr->dest.saturate;
1181      break;
1182
1183   case nir_op_uadd_sat:
1184      inst = bld.ADD(result, op[0], op[1]);
1185      inst->saturate = true;
1186      break;
1187
1188   case nir_op_fmul:
1189      for (unsigned i = 0; i < 2; i++) {
1190         if (can_fuse_fmul_fsign(instr, i)) {
1191            emit_fsign(bld, instr, result, op, i);
1192            return;
1193         }
1194      }
1195
1196      inst = bld.MUL(result, op[0], op[1]);
1197      inst->saturate = instr->dest.saturate;
1198      break;
1199
1200   case nir_op_imul_2x32_64:
1201   case nir_op_umul_2x32_64:
1202      bld.MUL(result, op[0], op[1]);
1203      break;
1204
1205   case nir_op_imul:
1206      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1207      bld.MUL(result, op[0], op[1]);
1208      break;
1209
1210   case nir_op_imul_high:
1211   case nir_op_umul_high:
1212      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1213      bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
1214      break;
1215
1216   case nir_op_idiv:
1217   case nir_op_udiv:
1218      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1219      bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1220      break;
1221
1222   case nir_op_uadd_carry:
1223      unreachable("Should have been lowered by carry_to_arith().");
1224
1225   case nir_op_usub_borrow:
1226      unreachable("Should have been lowered by borrow_to_arith().");
1227
1228   case nir_op_umod:
1229   case nir_op_irem:
1230      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1231       * appears that our hardware just does the right thing for signed
1232       * remainder.
1233       */
1234      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1235      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1236      break;
1237
1238   case nir_op_imod: {
1239      /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1240      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1241
1242      /* Math instructions don't support conditional mod */
1243      inst = bld.MOV(bld.null_reg_d(), result);
1244      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1245
1246      /* Now, we need to determine if the signs of the sources are different.
1247       * When we XOR the sources, the top bit is 0 if they are the same and 1
1248       * if they are different.  We can then use a conditional modifier to
1249       * turn that into a predicate.  This leads us to an XOR.l instruction.
1250       *
1251       * Technically, according to the PRM, you're not allowed to use .l on a
1252       * XOR instruction.  However, empirical experiments and Curro's reading
1253       * of the simulator source both indicate that it's safe.
1254       */
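      /* Worked example (illustrative): for imod(-7, 3) the remainder above is
       * -1, which is non-zero, and the XOR of the sources is negative (the
       * signs differ), so the predicated ADD below fires and yields
       * -1 + 3 = 2, matching the sign-of-the-divisor rule.
       */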
1255      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
1256      inst = bld.XOR(tmp, op[0], op[1]);
1257      inst->predicate = BRW_PREDICATE_NORMAL;
1258      inst->conditional_mod = BRW_CONDITIONAL_L;
1259
1260      /* If the result of the initial remainder operation is non-zero and the
1261       * two sources have different signs, add in a copy of op[1] to get the
1262       * final integer modulus value.
1263       */
1264      inst = bld.ADD(result, result, op[1]);
1265      inst->predicate = BRW_PREDICATE_NORMAL;
1266      break;
1267   }
1268
1269   case nir_op_flt32:
1270   case nir_op_fge32:
1271   case nir_op_feq32:
1272   case nir_op_fne32: {
1273      fs_reg dest = result;
1274
1275      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1276      if (bit_size != 32)
1277         dest = bld.vgrf(op[0].type, 1);
1278
1279      brw_conditional_mod cond;
1280      switch (instr->op) {
1281      case nir_op_flt32:
1282         cond = BRW_CONDITIONAL_L;
1283         break;
1284      case nir_op_fge32:
1285         cond = BRW_CONDITIONAL_GE;
1286         break;
1287      case nir_op_feq32:
1288         cond = BRW_CONDITIONAL_Z;
1289         break;
1290      case nir_op_fne32:
1291         cond = BRW_CONDITIONAL_NZ;
1292         break;
1293      default:
1294         unreachable("bad opcode");
1295      }
1296
1297      bld.CMP(dest, op[0], op[1], cond);
1298
1299      if (bit_size > 32) {
1300         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1301      } else if (bit_size < 32) {
1302         /* When we convert the result to 32-bit we need to be careful and do
1303          * it as a signed conversion to get sign extension (for 32-bit true)
1304          */
1305         const brw_reg_type src_type =
1306            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1307
1308         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1309      }
1310      break;
1311   }
1312
1313   case nir_op_ilt32:
1314   case nir_op_ult32:
1315   case nir_op_ige32:
1316   case nir_op_uge32:
1317   case nir_op_ieq32:
1318   case nir_op_ine32: {
1319      fs_reg dest = result;
1320
1321      /* On Gen11 there is an additional restriction: src1 cannot be a byte
1322       * type.  So we convert both operands for the comparison.
1323       */
1324      fs_reg temp_op[2];
1325      temp_op[0] = bld.fix_byte_src(op[0]);
1326      temp_op[1] = bld.fix_byte_src(op[1]);
1327
1328      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1329      if (bit_size != 32)
1330         dest = bld.vgrf(temp_op[0].type, 1);
1331
1332      brw_conditional_mod cond;
1333      switch (instr->op) {
1334      case nir_op_ilt32:
1335      case nir_op_ult32:
1336         cond = BRW_CONDITIONAL_L;
1337         break;
1338      case nir_op_ige32:
1339      case nir_op_uge32:
1340         cond = BRW_CONDITIONAL_GE;
1341         break;
1342      case nir_op_ieq32:
1343         cond = BRW_CONDITIONAL_Z;
1344         break;
1345      case nir_op_ine32:
1346         cond = BRW_CONDITIONAL_NZ;
1347         break;
1348      default:
1349         unreachable("bad opcode");
1350      }
1351      bld.CMP(dest, temp_op[0], temp_op[1], cond);
1352
1353      if (bit_size > 32) {
1354         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1355      } else if (bit_size < 32) {
1356         /* When we convert the result to 32-bit we need to be careful and do
1357          * it as a signed conversion to get sign extension (for 32-bit true)
1358          */
1359         const brw_reg_type src_type =
1360            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1361
1362         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1363      }
1364      break;
1365   }
1366
1367   case nir_op_inot:
1368      if (devinfo->gen >= 8) {
1369         nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1370
1371         if (inot_src_instr != NULL &&
1372             (inot_src_instr->op == nir_op_ior ||
1373              inot_src_instr->op == nir_op_ixor ||
1374              inot_src_instr->op == nir_op_iand) &&
1375             !inot_src_instr->src[0].abs &&
1376             !inot_src_instr->src[0].negate &&
1377             !inot_src_instr->src[1].abs &&
1378             !inot_src_instr->src[1].negate) {
1379            /* The sources of the source logical instruction are now the
1380             * sources of the instruction that will be generated.
1381             */
1382            prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);
1383            resolve_inot_sources(bld, inot_src_instr, op);
1384
1385            /* Smash all of the sources and destination to be signed.  This
1386             * doesn't matter for the operation of the instruction, but cmod
1387             * propagation fails on unsigned sources with negation (due to
1388             * fs_inst::can_do_cmod returning false).
1389             */
1390            result.type =
1391               brw_type_for_nir_type(devinfo,
1392                                     (nir_alu_type)(nir_type_int |
1393                                                    nir_dest_bit_size(instr->dest.dest)));
1394            op[0].type =
1395               brw_type_for_nir_type(devinfo,
1396                                     (nir_alu_type)(nir_type_int |
1397                                                    nir_src_bit_size(inot_src_instr->src[0].src)));
1398            op[1].type =
1399               brw_type_for_nir_type(devinfo,
1400                                     (nir_alu_type)(nir_type_int |
1401                                                    nir_src_bit_size(inot_src_instr->src[1].src)));
1402
1403            /* For XOR, only invert one of the sources.  Arbitrarily choose
1404             * the first source.
1405             */
1406            op[0].negate = !op[0].negate;
1407            if (inot_src_instr->op != nir_op_ixor)
1408               op[1].negate = !op[1].negate;
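            /* Illustrative mapping (De Morgan): inot(ior(a, b)) becomes
             * AND(~a, ~b), inot(iand(a, b)) becomes OR(~a, ~b), and
             * inot(ixor(a, b)) becomes XOR(~a, b).
             */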
1409
1410            switch (inot_src_instr->op) {
1411            case nir_op_ior:
1412               bld.AND(result, op[0], op[1]);
1413               return;
1414
1415            case nir_op_iand:
1416               bld.OR(result, op[0], op[1]);
1417               return;
1418
1419            case nir_op_ixor:
1420               bld.XOR(result, op[0], op[1]);
1421               return;
1422
1423            default:
1424               unreachable("impossible opcode");
1425            }
1426         }
1427         op[0] = resolve_source_modifiers(op[0]);
1428      }
1429      bld.NOT(result, op[0]);
1430      break;
1431   case nir_op_ixor:
1432      if (devinfo->gen >= 8) {
1433         resolve_inot_sources(bld, instr, op);
1434      }
1435      bld.XOR(result, op[0], op[1]);
1436      break;
1437   case nir_op_ior:
1438      if (devinfo->gen >= 8) {
1439         resolve_inot_sources(bld, instr, op);
1440      }
1441      bld.OR(result, op[0], op[1]);
1442      break;
1443   case nir_op_iand:
1444      if (devinfo->gen >= 8) {
1445         resolve_inot_sources(bld, instr, op);
1446      }
1447      bld.AND(result, op[0], op[1]);
1448      break;
1449
1450   case nir_op_fdot2:
1451   case nir_op_fdot3:
1452   case nir_op_fdot4:
1453   case nir_op_b32all_fequal2:
1454   case nir_op_b32all_iequal2:
1455   case nir_op_b32all_fequal3:
1456   case nir_op_b32all_iequal3:
1457   case nir_op_b32all_fequal4:
1458   case nir_op_b32all_iequal4:
1459   case nir_op_b32any_fnequal2:
1460   case nir_op_b32any_inequal2:
1461   case nir_op_b32any_fnequal3:
1462   case nir_op_b32any_inequal3:
1463   case nir_op_b32any_fnequal4:
1464   case nir_op_b32any_inequal4:
1465      unreachable("Lowered by nir_lower_alu_reductions");
1466
1467   case nir_op_fnoise1_1:
1468   case nir_op_fnoise1_2:
1469   case nir_op_fnoise1_3:
1470   case nir_op_fnoise1_4:
1471   case nir_op_fnoise2_1:
1472   case nir_op_fnoise2_2:
1473   case nir_op_fnoise2_3:
1474   case nir_op_fnoise2_4:
1475   case nir_op_fnoise3_1:
1476   case nir_op_fnoise3_2:
1477   case nir_op_fnoise3_3:
1478   case nir_op_fnoise3_4:
1479   case nir_op_fnoise4_1:
1480   case nir_op_fnoise4_2:
1481   case nir_op_fnoise4_3:
1482   case nir_op_fnoise4_4:
1483      unreachable("not reached: should be handled by lower_noise");
1484
1485   case nir_op_ldexp:
1486      unreachable("not reached: should be handled by ldexp_to_arith()");
1487
1488   case nir_op_fsqrt:
1489      inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1490      inst->saturate = instr->dest.saturate;
1491      break;
1492
1493   case nir_op_frsq:
1494      inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1495      inst->saturate = instr->dest.saturate;
1496      break;
1497
1498   case nir_op_i2b32:
1499   case nir_op_f2b32: {
1500      uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1501      if (bit_size == 64) {
1502         /* two-argument instructions can't take 64-bit immediates */
1503         fs_reg zero;
1504         fs_reg tmp;
1505
1506         if (instr->op == nir_op_f2b32) {
1507            zero = vgrf(glsl_type::double_type);
1508            tmp = vgrf(glsl_type::double_type);
1509            bld.MOV(zero, setup_imm_df(bld, 0.0));
1510         } else {
1511            zero = vgrf(glsl_type::int64_t_type);
1512            tmp = vgrf(glsl_type::int64_t_type);
1513            bld.MOV(zero, brw_imm_q(0));
1514         }
1515
1516         /* A SIMD16 execution needs to be split into two instructions, so use
1517          * a VGRF instead of the flag register as the destination so that
1518          * instruction splitting works.
1519          */
1520         bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1521         bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1522      } else {
1523         fs_reg zero;
1524         if (bit_size == 32) {
1525            zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0);
1526         } else {
1527            assert(bit_size == 16);
1528            zero = instr->op == nir_op_f2b32 ?
1529               retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
1530         }
1531         bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
1532      }
1533      break;
1534   }
1535
1536   case nir_op_ftrunc:
1537      inst = bld.RNDZ(result, op[0]);
1538      inst->saturate = instr->dest.saturate;
1539      break;
1540
1541   case nir_op_fceil: {
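      /* Implemented as ceil(x) = -floor(-x), using RNDD on the negated source
       * and then negating the result.
       */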
1542      op[0].negate = !op[0].negate;
1543      fs_reg temp = vgrf(glsl_type::float_type);
1544      bld.RNDD(temp, op[0]);
1545      temp.negate = true;
1546      inst = bld.MOV(result, temp);
1547      inst->saturate = instr->dest.saturate;
1548      break;
1549   }
1550   case nir_op_ffloor:
1551      inst = bld.RNDD(result, op[0]);
1552      inst->saturate = instr->dest.saturate;
1553      break;
1554   case nir_op_ffract:
1555      inst = bld.FRC(result, op[0]);
1556      inst->saturate = instr->dest.saturate;
1557      break;
1558   case nir_op_fround_even:
1559      inst = bld.RNDE(result, op[0]);
1560      inst->saturate = instr->dest.saturate;
1561      break;
1562
1563   case nir_op_fquantize2f16: {
1564      fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1565      fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1566      fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1567
1568      /* The destination stride must be at least as big as the source stride. */
1569      tmp16.type = BRW_REGISTER_TYPE_W;
1570      tmp16.stride = 2;
1571
1572      /* Check for denormal */
1573      fs_reg abs_src0 = op[0];
1574      abs_src0.abs = true;
1575      bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1576              BRW_CONDITIONAL_L);
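      /* Illustrative example: |op[0]| = 1e-6f is below 2^-14, the smallest
       * normal half-float, so the comparison sets the predicate and the SEL
       * below picks the sign-preserving zero instead of the F32->F16->F32
       * round trip.
       */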
1577      /* Get the appropriately signed zero */
1578      bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1579              retype(op[0], BRW_REGISTER_TYPE_UD),
1580              brw_imm_ud(0x80000000));
1581      /* Do the actual F32 -> F16 -> F32 conversion */
1582      bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1583      bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1584      /* Select that or zero based on normal status */
1585      inst = bld.SEL(result, zero, tmp32);
1586      inst->predicate = BRW_PREDICATE_NORMAL;
1587      inst->saturate = instr->dest.saturate;
1588      break;
1589   }
1590
1591   case nir_op_imin:
1592   case nir_op_umin:
1593   case nir_op_fmin:
1594      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1595      inst->saturate = instr->dest.saturate;
1596      break;
1597
1598   case nir_op_imax:
1599   case nir_op_umax:
1600   case nir_op_fmax:
1601      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1602      inst->saturate = instr->dest.saturate;
1603      break;
1604
1605   case nir_op_pack_snorm_2x16:
1606   case nir_op_pack_snorm_4x8:
1607   case nir_op_pack_unorm_2x16:
1608   case nir_op_pack_unorm_4x8:
1609   case nir_op_unpack_snorm_2x16:
1610   case nir_op_unpack_snorm_4x8:
1611   case nir_op_unpack_unorm_2x16:
1612   case nir_op_unpack_unorm_4x8:
1613   case nir_op_unpack_half_2x16:
1614   case nir_op_pack_half_2x16:
1615      unreachable("not reached: should be handled by lower_packing_builtins");
1616
1617   case nir_op_unpack_half_2x16_split_x:
1618      inst = bld.emit(BRW_OPCODE_F16TO32, result,
1619                      subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1620      inst->saturate = instr->dest.saturate;
1621      break;
1622   case nir_op_unpack_half_2x16_split_y:
1623      inst = bld.emit(BRW_OPCODE_F16TO32, result,
1624                      subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1625      inst->saturate = instr->dest.saturate;
1626      break;
1627
1628   case nir_op_pack_64_2x32_split:
1629   case nir_op_pack_32_2x16_split:
1630      bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1631      break;
1632
1633   case nir_op_unpack_64_2x32_split_x:
1634   case nir_op_unpack_64_2x32_split_y: {
1635      if (instr->op == nir_op_unpack_64_2x32_split_x)
1636         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1637      else
1638         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1639      break;
1640   }
1641
1642   case nir_op_unpack_32_2x16_split_x:
1643   case nir_op_unpack_32_2x16_split_y: {
1644      if (instr->op == nir_op_unpack_32_2x16_split_x)
1645         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1646      else
1647         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1648      break;
1649   }
1650
1651   case nir_op_fpow:
1652      inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1653      inst->saturate = instr->dest.saturate;
1654      break;
1655
1656   case nir_op_bitfield_reverse:
1657      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1658      bld.BFREV(result, op[0]);
1659      break;
1660
1661   case nir_op_bit_count:
1662      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1663      bld.CBIT(result, op[0]);
1664      break;
1665
1666   case nir_op_ufind_msb: {
1667      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1668      emit_find_msb_using_lzd(bld, result, op[0], false);
1669      break;
1670   }
1671
1672   case nir_op_ifind_msb: {
1673      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1674
1675      if (devinfo->gen < 7) {
1676         emit_find_msb_using_lzd(bld, result, op[0], true);
1677      } else {
1678         bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1679
1680         /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1681          * count from the LSB side. If FBH didn't return an error
1682          * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1683          * count into an LSB count.
1684          */
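             /* e.g. for op[0] == 0x00000010 the highest set bit is bit 4:
              * FBH returns 27 (counted from the MSB side) and the predicated
              * ADD below computes 31 - 27 == 4, the LSB-based count that
              * findMSB() expects.
              */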
1685         bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1686
1687         inst = bld.ADD(result, result, brw_imm_d(31));
1688         inst->predicate = BRW_PREDICATE_NORMAL;
1689         inst->src[0].negate = true;
1690      }
1691      break;
1692   }
1693
1694   case nir_op_find_lsb:
1695      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1696
1697      if (devinfo->gen < 7) {
1698         fs_reg temp = vgrf(glsl_type::int_type);
1699
1700         /* (x & -x) generates a value that consists of only the LSB of x.
1701          * For all powers of 2, findMSB(y) == findLSB(y).
1702          */
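             /* e.g. x == 104 == 0b01101000: -x == ...10011000, so
              * x & -x == 0b1000 == 8, and findMSB(8) == 3 == findLSB(104).
              */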
1703         fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1704         fs_reg negated_src = src;
1705
1706         /* One must be negated, and the other must be non-negated.  It
1707          * doesn't matter which is which.
1708          */
1709         negated_src.negate = true;
1710         src.negate = false;
1711
1712         bld.AND(temp, src, negated_src);
1713         emit_find_msb_using_lzd(bld, result, temp, false);
1714      } else {
1715         bld.FBL(result, op[0]);
1716      }
1717      break;
1718
1719   case nir_op_ubitfield_extract:
1720   case nir_op_ibitfield_extract:
1721      unreachable("should have been lowered");
1722   case nir_op_ubfe:
1723   case nir_op_ibfe:
1724      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1725      bld.BFE(result, op[2], op[1], op[0]);
1726      break;
1727   case nir_op_bfm:
1728      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1729      bld.BFI1(result, op[0], op[1]);
1730      break;
1731   case nir_op_bfi:
1732      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1733      bld.BFI2(result, op[0], op[1], op[2]);
1734      break;
1735
1736   case nir_op_bitfield_insert:
1737      unreachable("not reached: should have been lowered");
1738
1739   case nir_op_ishl:
1740      bld.SHL(result, op[0], op[1]);
1741      break;
1742   case nir_op_ishr:
1743      bld.ASR(result, op[0], op[1]);
1744      break;
1745   case nir_op_ushr:
1746      bld.SHR(result, op[0], op[1]);
1747      break;
1748
1749   case nir_op_pack_half_2x16_split:
1750      bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1751      break;
1752
1753   case nir_op_ffma:
1754      inst = bld.MAD(result, op[2], op[1], op[0]);
1755      inst->saturate = instr->dest.saturate;
1756      break;
1757
1758   case nir_op_flrp:
1759      inst = bld.LRP(result, op[0], op[1], op[2]);
1760      inst->saturate = instr->dest.saturate;
1761      break;
1762
1763   case nir_op_b32csel:
1764      if (optimize_frontfacing_ternary(instr, result))
1765         return;
1766
1767      bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1768      inst = bld.SEL(result, op[1], op[2]);
1769      inst->predicate = BRW_PREDICATE_NORMAL;
1770      break;
1771
1772   case nir_op_extract_u8:
1773   case nir_op_extract_i8: {
1774      unsigned byte = nir_src_as_uint(instr->src[1].src);
1775
1776      /* The PRMs say:
1777       *
1778       *    BDW+
1779       *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1780       *    Use two instructions and a word or DWord intermediate integer type.
1781       */
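          /* For illustration, with a 64-bit destination: extract_u8 of byte 3
           * reads UW word 1 and shifts right by 8, extract_u8 of byte 2 masks
           * UW word 1 with 0xff, and extract_i8 instead sign-extends the byte
           * through a W temporary before widening to 64 bits.
           */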
1782      if (nir_dest_bit_size(instr->dest.dest) == 64) {
1783         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1784
1785         if (instr->op == nir_op_extract_i8) {
1786            /* If we need to sign extend, extract to a word first */
1787            fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
1788            bld.MOV(w_temp, subscript(op[0], type, byte));
1789            bld.MOV(result, w_temp);
1790         } else if (byte & 1) {
1791            /* Extract the high byte from the word containing the desired byte
1792             * offset.
1793             */
1794            bld.SHR(result,
1795                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1796                    brw_imm_uw(8));
1797         } else {
1798            /* Otherwise use an AND with 0xff and a word type */
1799            bld.AND(result,
1800                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1801                    brw_imm_uw(0xff));
1802         }
1803      } else {
1804         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1805         bld.MOV(result, subscript(op[0], type, byte));
1806      }
1807      break;
1808   }
1809
1810   case nir_op_extract_u16:
1811   case nir_op_extract_i16: {
1812      const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1813      unsigned word = nir_src_as_uint(instr->src[1].src);
1814      bld.MOV(result, subscript(op[0], type, word));
1815      break;
1816   }
1817
1818   default:
1819      unreachable("unhandled instruction");
1820   }
1821
1822   /* If we need to do a boolean resolve, replace the result with -(x & 1)
1823    * to sign extend the low bit to 0/~0
1824    */
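       /* e.g. a result of 0x00000003 becomes -(3 & 1) == -1 == ~0, while
        * 0x00000002 becomes -(2 & 1) == 0.
        */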
1825   if (devinfo->gen <= 5 &&
1826       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1827      fs_reg masked = vgrf(glsl_type::int_type);
1828      bld.AND(masked, result, brw_imm_d(1));
1829      masked.negate = true;
1830      bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1831   }
1832}
1833
1834void
1835fs_visitor::nir_emit_load_const(const fs_builder &bld,
1836                                nir_load_const_instr *instr)
1837{
1838   const brw_reg_type reg_type =
1839      brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
1840   fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1841
1842   switch (instr->def.bit_size) {
1843   case 8:
1844      for (unsigned i = 0; i < instr->def.num_components; i++)
1845         bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8));
1846      break;
1847
1848   case 16:
1849      for (unsigned i = 0; i < instr->def.num_components; i++)
1850         bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16));
1851      break;
1852
1853   case 32:
1854      for (unsigned i = 0; i < instr->def.num_components; i++)
1855         bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32));
1856      break;
1857
1858   case 64:
1859      assert(devinfo->gen >= 7);
1860      if (devinfo->gen == 7) {
1861         /* We don't get 64-bit integer types until gen8 */
1862         for (unsigned i = 0; i < instr->def.num_components; i++) {
1863            bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
1864                    setup_imm_df(bld, instr->value[i].f64));
1865         }
1866      } else {
1867         for (unsigned i = 0; i < instr->def.num_components; i++)
1868            bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64));
1869      }
1870      break;
1871
1872   default:
1873      unreachable("Invalid bit size");
1874   }
1875
1876   nir_ssa_values[instr->def.index] = reg;
1877}
1878
1879fs_reg
1880fs_visitor::get_nir_src(const nir_src &src)
1881{
1882   fs_reg reg;
1883   if (src.is_ssa) {
1884      if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
1885         const brw_reg_type reg_type =
1886            brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
1887         reg = bld.vgrf(reg_type, src.ssa->num_components);
1888      } else {
1889         reg = nir_ssa_values[src.ssa->index];
1890      }
1891   } else {
1892      /* We don't handle indirects on locals */
1893      assert(src.reg.indirect == NULL);
1894      reg = offset(nir_locals[src.reg.reg->index], bld,
1895                   src.reg.base_offset * src.reg.reg->num_components);
1896   }
1897
1898   if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) {
1899      /* The only 64-bit type available on gen7 is DF, so use that. */
1900      reg.type = BRW_REGISTER_TYPE_DF;
1901   } else {
1902      /* To avoid floating-point denorm flushing problems, set the type by
1903       * default to an integer type - instructions that need floating point
1904       * semantics will set this to F if they need to
1905       */
1906      reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
1907                                            BRW_REGISTER_TYPE_D);
1908   }
1909
1910   return reg;
1911}
1912
1913/**
1914 * Return an IMM for constants; otherwise call get_nir_src() as normal.
1915 *
1916 * This function should not be called on any value which may be 64 bits.
1917 * We could theoretically support 64-bit on gen8+ but we choose not to
1918 * because it wouldn't work in general (no gen7 support) and there are
1919 * enough restrictions in 64-bit immediates that you can't take the return
1920 * value and treat it the same as the result of get_nir_src().
1921 */
1922fs_reg
1923fs_visitor::get_nir_src_imm(const nir_src &src)
1924{
1925   assert(nir_src_bit_size(src) == 32);
1926   return nir_src_is_const(src) ?
1927          fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src);
1928}
1929
1930fs_reg
1931fs_visitor::get_nir_dest(const nir_dest &dest)
1932{
1933   if (dest.is_ssa) {
1934      const brw_reg_type reg_type =
1935         brw_reg_type_from_bit_size(dest.ssa.bit_size,
1936                                    dest.ssa.bit_size == 8 ?
1937                                    BRW_REGISTER_TYPE_D :
1938                                    BRW_REGISTER_TYPE_F);
1939      nir_ssa_values[dest.ssa.index] =
1940         bld.vgrf(reg_type, dest.ssa.num_components);
1941      return nir_ssa_values[dest.ssa.index];
1942   } else {
1943      /* We don't handle indirects on locals */
1944      assert(dest.reg.indirect == NULL);
1945      return offset(nir_locals[dest.reg.reg->index], bld,
1946                    dest.reg.base_offset * dest.reg.reg->num_components);
1947   }
1948}
1949
1950void
1951fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1952                         unsigned wr_mask)
1953{
1954   for (unsigned i = 0; i < 4; i++) {
1955      if (!((wr_mask >> i) & 1))
1956         continue;
1957
1958      fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1959      new_inst->dst = offset(new_inst->dst, bld, i);
1960      for (unsigned j = 0; j < new_inst->sources; j++)
1961         if (new_inst->src[j].file == VGRF)
1962            new_inst->src[j] = offset(new_inst->src[j], bld, i);
1963
1964      bld.emit(new_inst);
1965   }
1966}
1967
1968static fs_inst *
1969emit_pixel_interpolater_send(const fs_builder &bld,
1970                             enum opcode opcode,
1971                             const fs_reg &dst,
1972                             const fs_reg &src,
1973                             const fs_reg &desc,
1974                             glsl_interp_mode interpolation)
1975{
1976   struct brw_wm_prog_data *wm_prog_data =
1977      brw_wm_prog_data(bld.shader->stage_prog_data);
1978
1979   fs_inst *inst = bld.emit(opcode, dst, src, desc);
1980   /* 2 floats per slot returned */
1981   inst->size_written = 2 * dst.component_size(inst->exec_size);
1982   inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
1983
1984   wm_prog_data->pulls_bary = true;
1985
1986   return inst;
1987}
1988
1989/**
1990 * Computes 1 << x, given a D/UD register containing some value x.
1991 */
1992static fs_reg
1993intexp2(const fs_builder &bld, const fs_reg &x)
1994{
1995   assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1996
1997   fs_reg result = bld.vgrf(x.type, 1);
1998   fs_reg one = bld.vgrf(x.type, 1);
1999
2000   bld.MOV(one, retype(brw_imm_d(1), one.type));
2001   bld.SHL(result, one, x);
2002   return result;
2003}
2004
2005void
2006fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
2007{
2008   assert(stage == MESA_SHADER_GEOMETRY);
2009
2010   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2011
2012   if (gs_compile->control_data_header_size_bits == 0)
2013      return;
2014
2015   /* We can only do EndPrimitive() functionality when the control data
2016    * consists of cut bits.  Fortunately, the only time it isn't is when the
2017    * output type is points, in which case EndPrimitive() is a no-op.
2018    */
2019   if (gs_prog_data->control_data_format !=
2020       GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2021      return;
2022   }
2023
2024   /* Cut bits use one bit per vertex. */
2025   assert(gs_compile->control_data_bits_per_vertex == 1);
2026
2027   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
2028   vertex_count.type = BRW_REGISTER_TYPE_UD;
2029
2030   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2031    * vertex n, 0 otherwise.  So all we need to do here is mark bit
2032    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2033    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2034    * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2035    *
2036    * Note that if EndPrimitive() is called before emitting any vertices, this
2037    * will cause us to set bit 31 of the control_data_bits register to 1.
2038    * That's fine because:
2039    *
2040    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2041    *   output, so the hardware will ignore cut bit 31.
2042    *
2043    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2044    *   last vertex, so setting cut bit 31 has no effect (since the primitive
2045    *   is automatically ended when the GS terminates).
2046    *
2047    * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2048    *   control_data_bits register to 0 when the first vertex is emitted.
2049    */
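       /* For example, if EndPrimitive() is called right after the 35th vertex
        * is emitted (vertex_count == 35), the code below sets cut bit
        * (35 - 1) % 32 == 2 in control_data_bits.
        */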
2050
2051   const fs_builder abld = bld.annotate("end primitive");
2052
2053   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2054   fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2055   abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2056   fs_reg mask = intexp2(abld, prev_count);
2057   /* Note: we're relying on the fact that the GEN SHL instruction only pays
2058    * attention to the lower 5 bits of its second source argument, so on this
2059    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2060    * ((vertex_count - 1) % 32).
2061    */
2062   abld.OR(this->control_data_bits, this->control_data_bits, mask);
2063}
2064
2065void
2066fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
2067{
2068   assert(stage == MESA_SHADER_GEOMETRY);
2069   assert(gs_compile->control_data_bits_per_vertex != 0);
2070
2071   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2072
2073   const fs_builder abld = bld.annotate("emit control data bits");
2074   const fs_builder fwa_bld = bld.exec_all();
2075
2076   /* We use a single UD register to accumulate control data bits (32 bits
2077    * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
2078    * at a time.
2079    *
2080    * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
2081    * We have to select a 128-bit group via the Global and Per-Slot Offsets, then
2082    * use the Channel Mask phase to enable/disable which DWord within that
2083    * group to write.  (Remember, different SIMD8 channels may have emitted
2084    * different numbers of vertices, so we may need per-slot offsets.)
2085    *
2086    * Channel masking presents an annoying problem: we may have to replicate
2087    * the data up to 4 times:
2088    *
2089    * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2090    *
2091    * To avoid penalizing shaders that emit a small number of vertices, we
2092    * can avoid these sometimes: if the size of the control data header is
2093    * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
2094    * in the same 128-bit group, so we can skip per-slot offsets.
2095    *
2096    * Similarly, if the control data header is <= 32 bits, there is only one
2097    * DWord, so we can skip channel masks.
2098    */
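       /* For illustration: with one cut bit per vertex, a GS that may emit up
        * to 96 vertices accumulates a 96-bit control data header, so it needs
        * channel masks (> 32 bits) but can still skip per-slot offsets
        * (<= 128 bits).
        */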
2099   enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
2100
2101   fs_reg channel_mask, per_slot_offset;
2102
2103   if (gs_compile->control_data_header_size_bits > 32) {
2104      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2105      channel_mask = vgrf(glsl_type::uint_type);
2106   }
2107
2108   if (gs_compile->control_data_header_size_bits > 128) {
2109      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
2110      per_slot_offset = vgrf(glsl_type::uint_type);
2111   }
2112
2113   /* Figure out which DWord we're trying to write to using the formula:
2114    *
2115    *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
2116    *
2117    * Since bits_per_vertex is a power of two, and is known at compile
2118    * time, this can be optimized to:
2119    *
2120    *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
2121    */
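       /* Quick check of the shift used below: util_last_bit(2^n) == n + 1, so
        * for bits_per_vertex == 2 the shift is 6 - 2 == 4 and, with
        * vertex_count == 40, dword_index == 39 >> 4 == 2, matching
        * 39 * 2 / 32 == 2.
        */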
2122   if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
2123      fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2124      fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2125      abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2126      unsigned log2_bits_per_vertex =
2127         util_last_bit(gs_compile->control_data_bits_per_vertex);
2128      abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
2129
2130      if (per_slot_offset.file != BAD_FILE) {
2131         /* Set the per-slot offset to dword_index / 4, so that we'll write to
2132          * the appropriate OWord within the control data header.
2133          */
2134         abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
2135      }
2136
2137      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2138       * write to the appropriate DWORD within the OWORD.
2139       */
2140      fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2141      fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
2142      channel_mask = intexp2(fwa_bld, channel);
2143      /* Then the channel masks need to be in bits 23:16. */
2144      fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
2145   }
2146
2147   /* Store the control data bits in the message payload and send it. */
2148   unsigned mlen = 2;
2149   if (channel_mask.file != BAD_FILE)
2150      mlen += 4; /* channel masks, plus 3 extra copies of the data */
2151   if (per_slot_offset.file != BAD_FILE)
2152      mlen++;
2153
2154   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2155   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
2156   unsigned i = 0;
2157   sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
2158   if (per_slot_offset.file != BAD_FILE)
2159      sources[i++] = per_slot_offset;
2160   if (channel_mask.file != BAD_FILE)
2161      sources[i++] = channel_mask;
2162   while (i < mlen) {
2163      sources[i++] = this->control_data_bits;
2164   }
2165
2166   abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
2167   fs_inst *inst = abld.emit(opcode, reg_undef, payload);
2168   inst->mlen = mlen;
2169   /* We need to increment Global Offset by 256-bits to make room for
2170    * Broadwell's extra "Vertex Count" payload at the beginning of the
2171    * URB entry.  Since this is an OWord message, Global Offset is counted
2172    * in 128-bit units, so we must set it to 2.
2173    */
2174   if (gs_prog_data->static_vertex_count == -1)
2175      inst->offset = 2;
2176}
2177
2178void
2179fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
2180                                            unsigned stream_id)
2181{
2182   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2183
2184   /* Note: we are calling this *before* increasing vertex_count, so
2185    * this->vertex_count == vertex_count - 1 in the formula above.
2186    */
2187
2188   /* Stream mode uses 2 bits per vertex */
2189   assert(gs_compile->control_data_bits_per_vertex == 2);
2190
2191   /* Must be a valid stream */
2192   assert(stream_id < MAX_VERTEX_STREAMS);
2193
2194   /* Control data bits are initialized to 0 so we don't have to set any
2195    * bits when sending vertices to stream 0.
2196    */
2197   if (stream_id == 0)
2198      return;
2199
2200   const fs_builder abld = bld.annotate("set stream control data bits", NULL);
2201
2202   /* reg:sid = stream_id */
2203   fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2204   abld.MOV(sid, brw_imm_ud(stream_id));
2205
2206   /* reg:shift_count = 2 * (vertex_count - 1) */
2207   fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2208   abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
2209
2210   /* Note: we're relying on the fact that the GEN SHL instruction only pays
2211    * attention to the lower 5 bits of its second source argument, so on this
2212    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2213    * stream_id << ((2 * (vertex_count - 1)) % 32).
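       /* e.g. for stream_id == 2 when the vertex_count temporary is still 3
        * (the fourth vertex is being emitted): shift_count == 6 and
        * mask == 2 << 6 == 0x80, placing the stream ID in bits 7:6, the two
        * control data bits that belong to that vertex.
        */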
2214    */
2215   fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2216   abld.SHL(mask, sid, shift_count);
2217   abld.OR(this->control_data_bits, this->control_data_bits, mask);
2218}
2219
2220void
2221fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
2222                           unsigned stream_id)
2223{
2224   assert(stage == MESA_SHADER_GEOMETRY);
2225
2226   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2227
2228   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
2229   vertex_count.type = BRW_REGISTER_TYPE_UD;
2230
2231   /* Haswell and later hardware ignores the "Render Stream Select" bits
2232    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2233    * and instead sends all primitives down the pipeline for rasterization.
2234    * If the SOL stage is enabled, "Render Stream Select" is honored and
2235    * primitives bound to non-zero streams are discarded after stream output.
2236    *
2237    * Since the only purpose of primitives sent to non-zero streams is to
2238    * be recorded by transform feedback, we can simply discard all geometry
2239    * bound to these streams when transform feedback is disabled.
2240    */
2241   if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
2242      return;
2243
2244   /* If we're outputting 32 control data bits or less, then we can wait
2245    * until the shader is over to output them all.  Otherwise we need to
2246    * output them as we go.  Now is the time to do it, since we're about to
2247    * output the vertex_count'th vertex, so it's guaranteed that the
2248    * control data bits associated with the (vertex_count - 1)th vertex are
2249    * correct.
2250    */
2251   if (gs_compile->control_data_header_size_bits > 32) {
2252      const fs_builder abld =
2253         bld.annotate("emit vertex: emit control data bits");
2254
2255      /* Only emit control data bits if we've finished accumulating a batch
2256       * of 32 bits.  This is the case when:
2257       *
2258       *     (vertex_count * bits_per_vertex) % 32 == 0
2259       *
2260       * (in other words, when the last 5 bits of vertex_count *
2261       * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
2262       * integer n (which is always the case, since bits_per_vertex is
2263       * always 1 or 2), this is equivalent to requiring that the last 5-n
2264       * bits of vertex_count are 0:
2265       *
2266       *     vertex_count & (2^(5-n) - 1) == 0
2267       *
2268       * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2269       * equivalent to:
2270       *
2271       *     vertex_count & (32 / bits_per_vertex - 1) == 0
2272       *
2273       * TODO: If vertex_count is an immediate, we could do some of this math
2274       *       at compile time...
2275       */
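          /* e.g. with bits_per_vertex == 2 the mask below is 32 / 2 - 1 == 15,
           * so the accumulated bits are flushed whenever vertex_count is a
           * multiple of 16.
           */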
2276      fs_inst *inst =
2277         abld.AND(bld.null_reg_d(), vertex_count,
2278                  brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
2279      inst->conditional_mod = BRW_CONDITIONAL_Z;
2280
2281      abld.IF(BRW_PREDICATE_NORMAL);
2282      /* If vertex_count is 0, then no control data bits have been
2283       * accumulated yet, so we can skip emitting them.
2284       */
2285      abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2286               BRW_CONDITIONAL_NEQ);
2287      abld.IF(BRW_PREDICATE_NORMAL);
2288      emit_gs_control_data_bits(vertex_count);
2289      abld.emit(BRW_OPCODE_ENDIF);
2290
2291      /* Reset control_data_bits to 0 so we can start accumulating a new
2292       * batch.
2293       *
2294       * Note: in the case where vertex_count == 0, this neutralizes the
2295       * effect of any call to EndPrimitive() that the shader may have
2296       * made before outputting its first vertex.
2297       */
2298      inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
2299      inst->force_writemask_all = true;
2300      abld.emit(BRW_OPCODE_ENDIF);
2301   }
2302
2303   emit_urb_writes(vertex_count);
2304
2305   /* In stream mode we have to set control data bits for all vertices
2306    * unless we have disabled control data bits completely (which we do
2307    * for GL_POINTS outputs that don't use streams).
2308    */
2309   if (gs_compile->control_data_header_size_bits > 0 &&
2310       gs_prog_data->control_data_format ==
2311          GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2312      set_gs_stream_control_data_bits(vertex_count, stream_id);
2313   }
2314}
2315
2316void
2317fs_visitor::emit_gs_input_load(const fs_reg &dst,
2318                               const nir_src &vertex_src,
2319                               unsigned base_offset,
2320                               const nir_src &offset_src,
2321                               unsigned num_components,
2322                               unsigned first_component)
2323{
2324   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2325   const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2326
2327   /* TODO: figure out push input layout for invocations == 1 */
2328   /* TODO: make this work with 64-bit inputs */
2329   if (gs_prog_data->invocations == 1 &&
2330       type_sz(dst.type) <= 4 &&
2331       nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2332       4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2333      int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2334                       nir_src_as_uint(vertex_src) * push_reg_count;
2335      for (unsigned i = 0; i < num_components; i++) {
2336         bld.MOV(offset(dst, bld, i),
2337                 fs_reg(ATTR, imm_offset + i + first_component, dst.type));
2338      }
2339      return;
2340   }
2341
2342   /* Resort to the pull model.  Ensure the VUE handles are provided. */
2343   assert(gs_prog_data->base.include_vue_handles);
2344
2345   unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
2346   fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2347
2348   if (gs_prog_data->invocations == 1) {
2349      if (nir_src_is_const(vertex_src)) {
2350         /* The vertex index is constant; just select the proper URB handle. */
2351         icp_handle =
2352            retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0),
2353                   BRW_REGISTER_TYPE_UD);
2354      } else {
2355         /* The vertex index is non-constant.  We need to use indirect
2356          * addressing to fetch the proper URB handle.
2357          *
2358          * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2359          * indicating that channel <n> should read the handle from
2360          * DWord <n>.  We convert that to bytes by multiplying by 4.
2361          *
2362          * Next, we convert the vertex index to bytes by multiplying
2363          * by 32 (shifting by 5), and add the two together.  This is
2364          * the final indirect byte offset.
2365          */
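             /* e.g. channel 3 reading vertex index 2: channel offset
              * 3 * 4 == 12 bytes, vertex offset 2 * 32 == 64 bytes, so the
              * indirect MOV reads byte offset 76, i.e. DWord 3 of the third
              * URB handle register after first_icp_handle.
              */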
2366         fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2367         fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2368         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2369         fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2370
2371         /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2372         bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2373         /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2374         bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2375         /* Convert vertex_index to bytes (multiply by 32) */
2376         bld.SHL(vertex_offset_bytes,
2377                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2378                 brw_imm_ud(5u));
2379         bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2380
2381         /* Use first_icp_handle as the base offset.  There is one register
2382          * of URB handles per vertex, so inform the register allocator that
2383          * we might read up to nir->info.gs.vertices_in registers.
2384          */
2385         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2386                  retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2387                  fs_reg(icp_offset_bytes),
2388                  brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
2389      }
2390   } else {
2391      assert(gs_prog_data->invocations > 1);
2392
2393      if (nir_src_is_const(vertex_src)) {
2394         unsigned vertex = nir_src_as_uint(vertex_src);
2395         assert(devinfo->gen >= 9 || vertex <= 5);
2396         bld.MOV(icp_handle,
2397                 retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8),
2398                        BRW_REGISTER_TYPE_UD));
2399      } else {
2400         /* The vertex index is non-constant.  We need to use indirect
2401          * addressing to fetch the proper URB handle.
2402          *
2403          */
2404         fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2405
2406         /* Convert vertex_index to bytes (multiply by 4) */
2407         bld.SHL(icp_offset_bytes,
2408                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2409                 brw_imm_ud(2u));
2410
2411         /* Use first_icp_handle as the base offset.  There is one DWord
2412          * of URB handles per vertex, so inform the register allocator that
2413          * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2414          */
2415         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2416                  retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2417                  fs_reg(icp_offset_bytes),
2418                  brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
2419                             REG_SIZE));
2420      }
2421   }
2422
2423   fs_inst *inst;
2424
2425   fs_reg tmp_dst = dst;
2426   fs_reg indirect_offset = get_nir_src(offset_src);
2427   unsigned num_iterations = 1;
2428   unsigned orig_num_components = num_components;
2429
2430   if (type_sz(dst.type) == 8) {
2431      if (num_components > 2) {
2432         num_iterations = 2;
2433         num_components = 2;
2434      }
2435      fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2436      tmp_dst = tmp;
2437      first_component = first_component / 2;
2438   }
2439
2440   for (unsigned iter = 0; iter < num_iterations; iter++) {
2441      if (nir_src_is_const(offset_src)) {
2442         /* Constant indexing - use global offset. */
2443         if (first_component != 0) {
2444            unsigned read_components = num_components + first_component;
2445            fs_reg tmp = bld.vgrf(dst.type, read_components);
2446            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2447            inst->size_written = read_components *
2448                                 tmp.component_size(inst->exec_size);
2449            for (unsigned i = 0; i < num_components; i++) {
2450               bld.MOV(offset(tmp_dst, bld, i),
2451                       offset(tmp, bld, i + first_component));
2452            }
2453         } else {
2454            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
2455                            icp_handle);
2456            inst->size_written = num_components *
2457                                 tmp_dst.component_size(inst->exec_size);
2458         }
2459         inst->offset = base_offset + nir_src_as_uint(offset_src);
2460         inst->mlen = 1;
2461      } else {
2462         /* Indirect indexing - use per-slot offsets as well. */
2463         const fs_reg srcs[] = { icp_handle, indirect_offset };
2464         unsigned read_components = num_components + first_component;
2465         fs_reg tmp = bld.vgrf(dst.type, read_components);
2466         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2467         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2468         if (first_component != 0) {
2469            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2470                            payload);
2471            inst->size_written = read_components *
2472                                 tmp.component_size(inst->exec_size);
2473            for (unsigned i = 0; i < num_components; i++) {
2474               bld.MOV(offset(tmp_dst, bld, i),
2475                       offset(tmp, bld, i + first_component));
2476            }
2477         } else {
2478            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
2479                            payload);
2480            inst->size_written = num_components *
2481                                 tmp_dst.component_size(inst->exec_size);
2482         }
2483         inst->offset = base_offset;
2484         inst->mlen = 2;
2485      }
2486
2487      if (type_sz(dst.type) == 8) {
2488         shuffle_from_32bit_read(bld,
2489                                 offset(dst, bld, iter * 2),
2490                                 retype(tmp_dst, BRW_REGISTER_TYPE_D),
2491                                 0,
2492                                 num_components);
2493      }
2494
2495      if (num_iterations > 1) {
2496         num_components = orig_num_components - 2;
2497         if (nir_src_is_const(offset_src)) {
2498            base_offset++;
2499         } else {
2500            fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2501            bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
2502            indirect_offset = new_indirect;
2503         }
2504      }
2505   }
2506}
2507
2508fs_reg
2509fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2510{
2511   nir_src *offset_src = nir_get_io_offset_src(instr);
2512
2513   if (nir_src_is_const(*offset_src)) {
2514      /* The only constant offset we should find is 0.  brw_nir.c's
2515       * add_const_offset_to_base() will fold other constant offsets
2516       * into instr->const_index[0].
2517       */
2518      assert(nir_src_as_uint(*offset_src) == 0);
2519      return fs_reg();
2520   }
2521
2522   return get_nir_src(*offset_src);
2523}
2524
2525void
2526fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2527                                  nir_intrinsic_instr *instr)
2528{
2529   assert(stage == MESA_SHADER_VERTEX);
2530
2531   fs_reg dest;
2532   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2533      dest = get_nir_dest(instr->dest);
2534
2535   switch (instr->intrinsic) {
2536   case nir_intrinsic_load_vertex_id:
2537   case nir_intrinsic_load_base_vertex:
2538      unreachable("should be lowered by nir_lower_system_values()");
2539
2540   case nir_intrinsic_load_input: {
2541      fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
2542      unsigned first_component = nir_intrinsic_component(instr);
2543      unsigned num_components = instr->num_components;
2544
2545      src = offset(src, bld, nir_src_as_uint(instr->src[0]));
2546
2547      if (type_sz(dest.type) == 8)
2548         first_component /= 2;
2549
2550      /* For 16-bit support, a temporary may be needed to copy from
2551       * the ATTR file.
2552       */
2553      shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D),
2554                              first_component, num_components);
2555      break;
2556   }
2557
2558   case nir_intrinsic_load_vertex_id_zero_base:
2559   case nir_intrinsic_load_instance_id:
2560   case nir_intrinsic_load_base_instance:
2561   case nir_intrinsic_load_draw_id:
2562   case nir_intrinsic_load_first_vertex:
2563   case nir_intrinsic_load_is_indexed_draw:
2564      unreachable("lowered by brw_nir_lower_vs_inputs");
2565
2566   default:
2567      nir_emit_intrinsic(bld, instr);
2568      break;
2569   }
2570}
2571
2572void
2573fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2574                                   nir_intrinsic_instr *instr)
2575{
2576   assert(stage == MESA_SHADER_TESS_CTRL);
2577   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2578   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2579
2580   fs_reg dst;
2581   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2582      dst = get_nir_dest(instr->dest);
2583
2584   switch (instr->intrinsic) {
2585   case nir_intrinsic_load_primitive_id:
2586      bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
2587      break;
2588   case nir_intrinsic_load_invocation_id:
2589      bld.MOV(retype(dst, invocation_id.type), invocation_id);
2590      break;
2591   case nir_intrinsic_load_patch_vertices_in:
2592      bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2593              brw_imm_d(tcs_key->input_vertices));
2594      break;
2595
2596   case nir_intrinsic_barrier: {
2597      if (tcs_prog_data->instances == 1)
2598         break;
2599
2600      fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2601      fs_reg m0_2 = component(m0, 2);
2602
2603      const fs_builder chanbld = bld.exec_all().group(1, 0);
2604
2605      /* Zero the message header */
2606      bld.exec_all().MOV(m0, brw_imm_ud(0u));
2607
2608      if (devinfo->gen < 11) {
2609         /* Copy "Barrier ID" from r0.2, bits 16:13 */
2610         chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2611                     brw_imm_ud(INTEL_MASK(16, 13)));
2612
2613         /* Shift it up to bits 27:24. */
2614         chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2615      } else {
2616         chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2617                     brw_imm_ud(INTEL_MASK(30, 24)));
2618      }
2619
2620      /* Set the Barrier Count and the enable bit */
2621      if (devinfo->gen < 11) {
2622         chanbld.OR(m0_2, m0_2,
2623                    brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2624      } else {
2625         chanbld.OR(m0_2, m0_2,
2626                    brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
2627      }
2628
2629      bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2630      break;
2631   }
2632
2633   case nir_intrinsic_load_input:
2634      unreachable("nir_lower_io should never give us these.");
2635      break;
2636
2637   case nir_intrinsic_load_per_vertex_input: {
2638      fs_reg indirect_offset = get_indirect_offset(instr);
2639      unsigned imm_offset = instr->const_index[0];
2640
2641      const nir_src &vertex_src = instr->src[0];
2642
2643      fs_inst *inst;
2644
2645      fs_reg icp_handle;
2646
2647      if (nir_src_is_const(vertex_src)) {
2648         /* Emit a MOV to resolve <0,1,0> regioning. */
2649         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2650         unsigned vertex = nir_src_as_uint(vertex_src);
2651         bld.MOV(icp_handle,
2652                 retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7),
2653                        BRW_REGISTER_TYPE_UD));
2654      } else if (tcs_prog_data->instances == 1 &&
2655                 nir_src_as_intrinsic(vertex_src) != NULL &&
2656                 nir_src_as_intrinsic(vertex_src)->intrinsic == nir_intrinsic_load_invocation_id) {
2657         /* For the common case of only 1 instance, an array index of
2658          * gl_InvocationID means reading g1.  Skip all the indirect work.
2659          */
2660         icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2661      } else {
2662         /* The vertex index is non-constant.  We need to use indirect
2663          * addressing to fetch the proper URB handle.
2664          */
2665         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2666
2667         /* Each ICP handle is a single DWord (4 bytes) */
2668         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2669         bld.SHL(vertex_offset_bytes,
2670                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2671                 brw_imm_ud(2u));
2672
2673         /* Start at g1.  We might read up to 4 registers. */
2674         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2675                  retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
2676                  brw_imm_ud(4 * REG_SIZE));
2677      }
2678
2679      /* We can only read two double components with each URB read, so
2680       * we send two read messages in that case, each one loading up to
2681       * two double components.
2682       */
2683      unsigned num_iterations = 1;
2684      unsigned num_components = instr->num_components;
2685      unsigned first_component = nir_intrinsic_component(instr);
2686      fs_reg orig_dst = dst;
2687      if (type_sz(dst.type) == 8) {
2688         first_component = first_component / 2;
2689         if (instr->num_components > 2) {
2690            num_iterations = 2;
2691            num_components = 2;
2692         }
2693
2694         fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
2695         dst = tmp;
2696      }
2697
2698      for (unsigned iter = 0; iter < num_iterations; iter++) {
2699         if (indirect_offset.file == BAD_FILE) {
2700            /* Constant indexing - use global offset. */
2701            if (first_component != 0) {
2702               unsigned read_components = num_components + first_component;
2703               fs_reg tmp = bld.vgrf(dst.type, read_components);
2704               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2705               for (unsigned i = 0; i < num_components; i++) {
2706                  bld.MOV(offset(dst, bld, i),
2707                          offset(tmp, bld, i + first_component));
2708               }
2709            } else {
2710               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2711            }
2712            inst->offset = imm_offset;
2713            inst->mlen = 1;
2714         } else {
2715            /* Indirect indexing - use per-slot offsets as well. */
2716            const fs_reg srcs[] = { icp_handle, indirect_offset };
2717            fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2718            bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2719            if (first_component != 0) {
2720               unsigned read_components = num_components + first_component;
2721               fs_reg tmp = bld.vgrf(dst.type, read_components);
2722               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2723                               payload);
2724               for (unsigned i = 0; i < num_components; i++) {
2725                  bld.MOV(offset(dst, bld, i),
2726                          offset(tmp, bld, i + first_component));
2727               }
2728            } else {
2729               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2730                               payload);
2731            }
2732            inst->offset = imm_offset;
2733            inst->mlen = 2;
2734         }
2735         inst->size_written = (num_components + first_component) *
2736                              inst->dst.component_size(inst->exec_size);
2737
2738         /* If we are reading 64-bit data using 32-bit read messages we need
2739          * to build proper 64-bit data elements by shuffling the low and high
2740          * 32-bit components around like we do for other things like UBOs
2741          * or SSBOs.
2742          */
2743         if (type_sz(dst.type) == 8) {
2744            shuffle_from_32bit_read(bld,
2745                                    offset(orig_dst, bld, iter * 2),
2746                                    retype(dst, BRW_REGISTER_TYPE_D),
2747                                    0, num_components);
2748         }
2749
2750         /* Copy the temporary to the destination to deal with writemasking.
2751          *
2752          * Also attempt to deal with gl_PointSize being in the .w component.
2753          */
2754         if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2755            assert(type_sz(dst.type) < 8);
2756            inst->dst = bld.vgrf(dst.type, 4);
2757            inst->size_written = 4 * REG_SIZE;
2758            bld.MOV(dst, offset(inst->dst, bld, 3));
2759         }
2760
2761         /* If we are loading double data and need a second read message,
2762          * adjust the write offset.
2763          */
2764         if (num_iterations > 1) {
2765            num_components = instr->num_components - 2;
2766            imm_offset++;
2767         }
2768      }
2769      break;
2770   }
2771
2772   case nir_intrinsic_load_output:
2773   case nir_intrinsic_load_per_vertex_output: {
2774      fs_reg indirect_offset = get_indirect_offset(instr);
2775      unsigned imm_offset = instr->const_index[0];
2776      unsigned first_component = nir_intrinsic_component(instr);
2777
2778      fs_inst *inst;
2779      if (indirect_offset.file == BAD_FILE) {
2780         /* Replicate the patch handle to all enabled channels */
2781         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2782         bld.MOV(patch_handle,
2783                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
2784
2785         {
2786            if (first_component != 0) {
2787               unsigned read_components =
2788                  instr->num_components + first_component;
2789               fs_reg tmp = bld.vgrf(dst.type, read_components);
2790               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2791                               patch_handle);
2792               inst->size_written = read_components * REG_SIZE;
2793               for (unsigned i = 0; i < instr->num_components; i++) {
2794                  bld.MOV(offset(dst, bld, i),
2795                          offset(tmp, bld, i + first_component));
2796               }
2797            } else {
2798               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
2799                               patch_handle);
2800               inst->size_written = instr->num_components * REG_SIZE;
2801            }
2802            inst->offset = imm_offset;
2803            inst->mlen = 1;
2804         }
2805      } else {
2806         /* Indirect indexing - use per-slot offsets as well. */
2807         const fs_reg srcs[] = {
2808            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2809            indirect_offset
2810         };
2811         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2812         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2813         if (first_component != 0) {
2814            unsigned read_components =
2815               instr->num_components + first_component;
2816            fs_reg tmp = bld.vgrf(dst.type, read_components);
2817            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2818                            payload);
2819            inst->size_written = read_components * REG_SIZE;
2820            for (unsigned i = 0; i < instr->num_components; i++) {
2821               bld.MOV(offset(dst, bld, i),
2822                       offset(tmp, bld, i + first_component));
2823            }
2824         } else {
2825            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2826                            payload);
2827            inst->size_written = instr->num_components * REG_SIZE;
2828         }
2829         inst->offset = imm_offset;
2830         inst->mlen = 2;
2831      }
2832      break;
2833   }
2834
2835   case nir_intrinsic_store_output:
2836   case nir_intrinsic_store_per_vertex_output: {
2837      fs_reg value = get_nir_src(instr->src[0]);
2838      bool is_64bit = (instr->src[0].is_ssa ?
2839         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
2840      fs_reg indirect_offset = get_indirect_offset(instr);
2841      unsigned imm_offset = instr->const_index[0];
2842      unsigned mask = instr->const_index[1];
2843      unsigned header_regs = 0;
2844      fs_reg srcs[7];
2845      srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2846
2847      if (indirect_offset.file != BAD_FILE) {
2848         srcs[header_regs++] = indirect_offset;
2849      }
2850
2851      if (mask == 0)
2852         break;
2853
2854      unsigned num_components = util_last_bit(mask);
2855      enum opcode opcode;
2856
2857      /* We can only pack two 64-bit components in a single message, so send
2858       * 2 messages if we have more components
2859       */
2860      unsigned num_iterations = 1;
2861      unsigned iter_components = num_components;
2862      unsigned first_component = nir_intrinsic_component(instr);
2863      if (is_64bit) {
2864         first_component = first_component / 2;
2865         if (instr->num_components > 2) {
2866            num_iterations = 2;
2867            iter_components = 2;
2868         }
2869      }
2870
2871      mask = mask << first_component;
2872
2873      for (unsigned iter = 0; iter < num_iterations; iter++) {
2874         if (!is_64bit && mask != WRITEMASK_XYZW) {
2875            srcs[header_regs++] = brw_imm_ud(mask << 16);
2876            opcode = indirect_offset.file != BAD_FILE ?
2877               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2878               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2879         } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
2880            /* Expand the 64-bit mask to 32-bit channels. We only handle
2881             * two channels in each iteration, so we only care about X/Y.
2882             */
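                /* e.g. a 64-bit store with writemask XYZ: the first iteration
                 * covers X and Y, giving mask32 == XYZW; after "mask >>= 2"
                 * the second iteration sees Z as its X, giving mask32 == XY.
                 */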
2883            unsigned mask32 = 0;
2884            if (mask & WRITEMASK_X)
2885               mask32 |= WRITEMASK_XY;
2886            if (mask & WRITEMASK_Y)
2887               mask32 |= WRITEMASK_ZW;
2888
2889            /* If the mask does not include any of the channels X or Y there
2890             * is nothing to do in this iteration. Move on to the next couple
2891             * of 64-bit channels.
2892             */
2893            if (!mask32) {
2894               mask >>= 2;
2895               imm_offset++;
2896               continue;
2897            }
2898
2899            srcs[header_regs++] = brw_imm_ud(mask32 << 16);
2900            opcode = indirect_offset.file != BAD_FILE ?
2901               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
2902               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2903         } else {
2904            opcode = indirect_offset.file != BAD_FILE ?
2905               SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
2906               SHADER_OPCODE_URB_WRITE_SIMD8;
2907         }
2908
2909         for (unsigned i = 0; i < iter_components; i++) {
2910            if (!(mask & (1 << (i + first_component))))
2911               continue;
2912
2913            if (!is_64bit) {
2914               srcs[header_regs + i + first_component] = offset(value, bld, i);
2915            } else {
2916               /* We need to shuffle the 64-bit data to match the layout
2917                * expected by our 32-bit URB write messages. We use a temporary
2918                * for that.
2919                */
2920               unsigned channel = iter * 2 + i;
2921               fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1);
2922
2923               srcs[header_regs + (i + first_component) * 2] = dest;
2924               srcs[header_regs + (i + first_component) * 2 + 1] =
2925                  offset(dest, bld, 1);
2926            }
2927         }
2928
2929         unsigned mlen =
2930            header_regs + (is_64bit ? 2 * iter_components : iter_components) +
2931            (is_64bit ? 2 * first_component : first_component);
2932         fs_reg payload =
2933            bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2934         bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
2935
2936         fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
2937         inst->offset = imm_offset;
2938         inst->mlen = mlen;
2939
2940         /* If this is a 64-bit attribute, select the next two 64-bit channels
2941          * to be handled in the next iteration.
2942          */
2943         if (is_64bit) {
2944            mask >>= 2;
2945            imm_offset++;
2946         }
2947      }
2948      break;
2949   }
2950
2951   default:
2952      nir_emit_intrinsic(bld, instr);
2953      break;
2954   }
2955}
2956
2957void
2958fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
2959                                   nir_intrinsic_instr *instr)
2960{
2961   assert(stage == MESA_SHADER_TESS_EVAL);
2962   struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
2963
2964   fs_reg dest;
2965   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2966      dest = get_nir_dest(instr->dest);
2967
2968   switch (instr->intrinsic) {
2969   case nir_intrinsic_load_primitive_id:
2970      bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
2971      break;
2972   case nir_intrinsic_load_tess_coord:
2973      /* gl_TessCoord is part of the payload in g1-3 */
2974      for (unsigned i = 0; i < 3; i++) {
2975         bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
2976      }
2977      break;
2978
2979   case nir_intrinsic_load_input:
2980   case nir_intrinsic_load_per_vertex_input: {
2981      fs_reg indirect_offset = get_indirect_offset(instr);
2982      unsigned imm_offset = instr->const_index[0];
2983      unsigned first_component = nir_intrinsic_component(instr);
2984
2985      if (type_sz(dest.type) == 8) {
2986         first_component = first_component / 2;
2987      }
2988
2989      fs_inst *inst;
2990      if (indirect_offset.file == BAD_FILE) {
2991         /* Arbitrarily only push up to 32 vec4 slots worth of data,
2992          * which is 16 registers (since each holds 2 vec4 slots).
2993          */
2994         unsigned slot_count = 1;
2995         if (type_sz(dest.type) == 8 && instr->num_components > 2)
2996            slot_count++;
2997
2998         const unsigned max_push_slots = 32;
2999         if (imm_offset + slot_count <= max_push_slots) {
3000            fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
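            /* Each push register holds two vec4 slots, so an odd imm_offset
             * starts half-way into the register: component 4 for 32-bit
             * types, component 2 for 64-bit types.
             */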
3001            for (int i = 0; i < instr->num_components; i++) {
3002               unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
3003                  i + first_component;
3004               bld.MOV(offset(dest, bld, i), component(src, comp));
3005            }
3006
3007            tes_prog_data->base.urb_read_length =
3008               MAX2(tes_prog_data->base.urb_read_length,
3009                    DIV_ROUND_UP(imm_offset + slot_count, 2));
3010         } else {
3011            /* Replicate the patch handle to all enabled channels */
3012            const fs_reg srcs[] = {
3013               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
3014            };
3015            fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
3016            bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
3017
3018            if (first_component != 0) {
3019               unsigned read_components =
3020                  instr->num_components + first_component;
3021               fs_reg tmp = bld.vgrf(dest.type, read_components);
3022               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
3023                               patch_handle);
3024               inst->size_written = read_components * REG_SIZE;
3025               for (unsigned i = 0; i < instr->num_components; i++) {
3026                  bld.MOV(offset(dest, bld, i),
3027                          offset(tmp, bld, i + first_component));
3028               }
3029            } else {
3030               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
3031                               patch_handle);
3032               inst->size_written = instr->num_components * REG_SIZE;
3033            }
3034            inst->mlen = 1;
3035            inst->offset = imm_offset;
3036         }
3037      } else {
3038         /* Indirect indexing - use per-slot offsets as well. */
3039
3040         /* We can only read two double components with each URB read, so
3041          * we send two read messages in that case, each one loading up to
3042          * two double components.
3043          */
3044         unsigned num_iterations = 1;
3045         unsigned num_components = instr->num_components;
3046         fs_reg orig_dest = dest;
3047         if (type_sz(dest.type) == 8) {
3048            if (instr->num_components > 2) {
3049               num_iterations = 2;
3050               num_components = 2;
3051            }
3052            fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
3053            dest = tmp;
3054         }
3055
3056         for (unsigned iter = 0; iter < num_iterations; iter++) {
3057            const fs_reg srcs[] = {
3058               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
3059               indirect_offset
3060            };
3061            fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3062            bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
3063
3064            if (first_component != 0) {
3065               unsigned read_components =
3066                   num_components + first_component;
3067               fs_reg tmp = bld.vgrf(dest.type, read_components);
3068               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
3069                               payload);
3070               for (unsigned i = 0; i < num_components; i++) {
3071                  bld.MOV(offset(dest, bld, i),
3072                          offset(tmp, bld, i + first_component));
3073               }
3074            } else {
3075               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
3076                               payload);
3077            }
3078            inst->mlen = 2;
3079            inst->offset = imm_offset;
3080            inst->size_written = (num_components + first_component) *
3081                                 inst->dst.component_size(inst->exec_size);
3082
3083            /* If we are reading 64-bit data using 32-bit read messages we need
3084             * to build proper 64-bit data elements by shuffling the low and high
3085             * 32-bit components around like we do for other things like UBOs
3086             * or SSBOs.
3087             */
3088            if (type_sz(dest.type) == 8) {
3089               shuffle_from_32bit_read(bld,
3090                                       offset(orig_dest, bld, iter * 2),
3091                                       retype(dest, BRW_REGISTER_TYPE_D),
3092                                       0, num_components);
3093            }
3094
3095            /* If we are loading double data and need a second read message,
3096             * adjust the offset.
3097             */
3098            if (num_iterations > 1) {
3099               num_components = instr->num_components - 2;
3100               imm_offset++;
3101            }
3102         }
3103      }
3104      break;
3105   }
3106   default:
3107      nir_emit_intrinsic(bld, instr);
3108      break;
3109   }
3110}
3111
3112void
3113fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
3114                                  nir_intrinsic_instr *instr)
3115{
3116   assert(stage == MESA_SHADER_GEOMETRY);
3117   fs_reg indirect_offset;
3118
3119   fs_reg dest;
3120   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3121      dest = get_nir_dest(instr->dest);
3122
3123   switch (instr->intrinsic) {
3124   case nir_intrinsic_load_primitive_id:
3125      assert(stage == MESA_SHADER_GEOMETRY);
3126      assert(brw_gs_prog_data(prog_data)->include_primitive_id);
3127      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
3128              retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
3129      break;
3130
3131   case nir_intrinsic_load_input:
3132      unreachable("load_input intrinsics are invalid for the GS stage");
3133
3134   case nir_intrinsic_load_per_vertex_input:
3135      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
3136                         instr->src[1], instr->num_components,
3137                         nir_intrinsic_component(instr));
3138      break;
3139
3140   case nir_intrinsic_emit_vertex_with_counter:
3141      emit_gs_vertex(instr->src[0], instr->const_index[0]);
3142      break;
3143
3144   case nir_intrinsic_end_primitive_with_counter:
3145      emit_gs_end_primitive(instr->src[0]);
3146      break;
3147
3148   case nir_intrinsic_set_vertex_count:
3149      bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
3150      break;
3151
3152   case nir_intrinsic_load_invocation_id: {
3153      fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
3154      assert(val.file != BAD_FILE);
3155      dest.type = val.type;
3156      bld.MOV(dest, val);
3157      break;
3158   }
3159
3160   default:
3161      nir_emit_intrinsic(bld, instr);
3162      break;
3163   }
3164}
3165
3166/**
3167 * Fetch the current render target layer index.
3168 */
3169static fs_reg
3170fetch_render_target_array_index(const fs_builder &bld)
3171{
3172   if (bld.shader->devinfo->gen >= 6) {
3173      /* The render target array index is provided in the thread payload as
3174       * bits 26:16 of r0.0.
3175       */
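      /* Reading subregister 1 of r0 as a UW picks out its high word
       * (bits 31:16), so masking with 0x7ff leaves just the 11-bit
       * layer index.
       */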
3176      const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3177      bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
3178              brw_imm_uw(0x7ff));
3179      return idx;
3180   } else {
3181      /* Pre-SNB we only ever render into the first layer of the framebuffer
3182       * since layered rendering is not implemented.
3183       */
3184      return brw_imm_ud(0);
3185   }
3186}
3187
3188/**
3189 * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3190 * framebuffer at the current fragment coordinates and sample index.
3191 */
3192fs_inst *
3193fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
3194                                      unsigned target)
3195{
3196   const struct gen_device_info *devinfo = bld.shader->devinfo;
3197
3198   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3199   const brw_wm_prog_key *wm_key =
3200      reinterpret_cast<const brw_wm_prog_key *>(key);
3201   assert(!wm_key->coherent_fb_fetch);
3202   const struct brw_wm_prog_data *wm_prog_data =
3203      brw_wm_prog_data(stage_prog_data);
3204
3205   /* Calculate the surface index relative to the start of the texture binding
3206    * table block, since that's what the texturing messages expect.
3207    */
3208   const unsigned surface = target +
3209      wm_prog_data->binding_table.render_target_read_start -
3210      wm_prog_data->base.binding_table.texture_start;
3211
3212   /* Calculate the fragment coordinates. */
3213   const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
3214   bld.MOV(offset(coords, bld, 0), pixel_x);
3215   bld.MOV(offset(coords, bld, 1), pixel_y);
3216   bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3217
3218   /* Calculate the sample index and MCS payload when multisampling.  Luckily
3219    * the MCS fetch message behaves deterministically for UMS surfaces, so it
3220    * shouldn't be necessary to recompile based on whether the framebuffer is
3221    * CMS or UMS.
3222    */
3223   if (wm_key->multisample_fbo &&
3224       nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3225      nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
3226
3227   const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
3228   const fs_reg mcs = wm_key->multisample_fbo ?
3229      emit_mcs_fetch(coords, 3, brw_imm_ud(surface), fs_reg()) : fs_reg();
3230
3231   /* Use either a normal or a CMS texel fetch message depending on whether
3232    * the framebuffer is single or multisample.  On SKL+ use the wide CMS
3233    * message just in case the framebuffer uses 16x multisampling; it should
3234    * be equivalent to the normal CMS fetch for lower multisampling modes.
3235    */
3236   const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
3237                     devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
3238                     SHADER_OPCODE_TXF_CMS_LOGICAL;
3239
3240   /* Emit the instruction. */
3241   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3242   srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
3243   srcs[TEX_LOGICAL_SRC_LOD]              = brw_imm_ud(0);
3244   srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
3245   srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
3246   srcs[TEX_LOGICAL_SRC_SURFACE]          = brw_imm_ud(surface);
3247   srcs[TEX_LOGICAL_SRC_SAMPLER]          = brw_imm_ud(0);
3248   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
3249   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS]  = brw_imm_ud(0);
3250
3251   fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3252   inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3253
3254   return inst;
3255}
3256
3257/**
3258 * Actual coherent framebuffer read implemented using the native render target
3259 * read message.  Requires SKL+.
3260 */
3261static fs_inst *
3262emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3263{
3264   assert(bld.shader->devinfo->gen >= 9);
3265   fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3266   inst->target = target;
3267   inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3268
3269   return inst;
3270}
3271
3272static fs_reg
3273alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3274{
3275   if (n && regs[0].file != BAD_FILE) {
3276      return regs[0];
3277
3278   } else {
3279      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3280
3281      for (unsigned i = 0; i < n; i++)
3282         regs[i] = tmp;
3283
3284      return tmp;
3285   }
3286}
3287
3288static fs_reg
3289alloc_frag_output(fs_visitor *v, unsigned location)
3290{
3291   assert(v->stage == MESA_SHADER_FRAGMENT);
3292   const brw_wm_prog_key *const key =
3293      reinterpret_cast<const brw_wm_prog_key *>(v->key);
3294   const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3295   const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3296
3297   if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3298      return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3299
3300   else if (l == FRAG_RESULT_COLOR)
3301      return alloc_temporary(v->bld, 4, v->outputs,
3302                             MAX2(key->nr_color_regions, 1));
3303
3304   else if (l == FRAG_RESULT_DEPTH)
3305      return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3306
3307   else if (l == FRAG_RESULT_STENCIL)
3308      return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3309
3310   else if (l == FRAG_RESULT_SAMPLE_MASK)
3311      return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3312
3313   else if (l >= FRAG_RESULT_DATA0 &&
3314            l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3315      return alloc_temporary(v->bld, 4,
3316                             &v->outputs[l - FRAG_RESULT_DATA0], 1);
3317
3318   else
3319      unreachable("Invalid location");
3320}
3321
3322void
3323fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3324                                  nir_intrinsic_instr *instr)
3325{
3326   assert(stage == MESA_SHADER_FRAGMENT);
3327
3328   fs_reg dest;
3329   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3330      dest = get_nir_dest(instr->dest);
3331
3332   switch (instr->intrinsic) {
3333   case nir_intrinsic_load_front_face:
3334      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3335              *emit_frontfacing_interpolation());
3336      break;
3337
3338   case nir_intrinsic_load_sample_pos: {
3339      fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3340      assert(sample_pos.file != BAD_FILE);
3341      dest.type = sample_pos.type;
3342      bld.MOV(dest, sample_pos);
3343      bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3344      break;
3345   }
3346
3347   case nir_intrinsic_load_layer_id:
3348      dest.type = BRW_REGISTER_TYPE_UD;
3349      bld.MOV(dest, fetch_render_target_array_index(bld));
3350      break;
3351
3352   case nir_intrinsic_load_helper_invocation:
3353   case nir_intrinsic_load_sample_mask_in:
3354   case nir_intrinsic_load_sample_id: {
3355      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3356      fs_reg val = nir_system_values[sv];
3357      assert(val.file != BAD_FILE);
3358      dest.type = val.type;
3359      bld.MOV(dest, val);
3360      break;
3361   }
3362
3363   case nir_intrinsic_store_output: {
3364      const fs_reg src = get_nir_src(instr->src[0]);
3365      const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3366      const unsigned location = nir_intrinsic_base(instr) +
3367         SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
3368      const fs_reg new_dest = retype(alloc_frag_output(this, location),
3369                                     src.type);
3370
3371      for (unsigned j = 0; j < instr->num_components; j++)
3372         bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3373                 offset(src, bld, j));
3374
3375      break;
3376   }
3377
3378   case nir_intrinsic_load_output: {
3379      const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3380                                   BRW_NIR_FRAG_OUTPUT_LOCATION);
3381      assert(l >= FRAG_RESULT_DATA0);
3382      const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3383      const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3384      const fs_reg tmp = bld.vgrf(dest.type, 4);
3385
3386      if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3387         emit_coherent_fb_read(bld, tmp, target);
3388      else
3389         emit_non_coherent_fb_read(bld, tmp, target);
3390
3391      for (unsigned j = 0; j < instr->num_components; j++) {
3392         bld.MOV(offset(dest, bld, j),
3393                 offset(tmp, bld, nir_intrinsic_component(instr) + j));
3394      }
3395
3396      break;
3397   }
3398
3399   case nir_intrinsic_discard:
3400   case nir_intrinsic_discard_if: {
3401      /* We track our discarded pixels in f0.1.  By predicating on it, we can
3402       * update just the flag bits that aren't yet discarded.  If there's no
3403       * condition, we emit a CMP of g0 != g0, so all currently executing
3404       * channels will get turned off.
3405       */
3406      fs_inst *cmp;
3407      if (instr->intrinsic == nir_intrinsic_discard_if) {
3408         cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3409                       brw_imm_d(0), BRW_CONDITIONAL_Z);
3410      } else {
3411         fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3412                                       BRW_REGISTER_TYPE_UW));
3413         cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3414      }
3415      cmp->predicate = BRW_PREDICATE_NORMAL;
3416      cmp->flag_subreg = 1;
3417
3418      if (devinfo->gen >= 6) {
3419         emit_discard_jump();
3420      }
3421
3422      limit_dispatch_width(16, "Fragment discard not implemented in SIMD32 mode.");
3423      break;
3424   }
3425
3426   case nir_intrinsic_load_input: {
3427      /* load_input is only used for flat inputs */
3428      unsigned base = nir_intrinsic_base(instr);
3429      unsigned comp = nir_intrinsic_component(instr);
3430      unsigned num_components = instr->num_components;
3431      fs_reg orig_dest = dest;
3432      enum brw_reg_type type = dest.type;
3433
3434      /* Special case fields in the VUE header */
3435      if (base == VARYING_SLOT_LAYER)
3436         comp = 1;
3437      else if (base == VARYING_SLOT_VIEWPORT)
3438         comp = 2;
3439
3440      if (nir_dest_bit_size(instr->dest) == 64) {
3441         /* const_index is in 32-bit type size units and may not be aligned
3442          * to DF. We need to read the double vector as if it were a float
3443          * vector of twice the number of components to fetch the right data.
3444          */
3445         type = BRW_REGISTER_TYPE_F;
3446         num_components *= 2;
3447         dest = bld.vgrf(type, num_components);
3448      }
3449
3450      for (unsigned int i = 0; i < num_components; i++) {
3451         bld.MOV(offset(retype(dest, type), bld, i),
3452                 retype(component(interp_reg(base, comp + i), 3), type));
3453      }
3454
3455      if (nir_dest_bit_size(instr->dest) == 64) {
3456         shuffle_from_32bit_read(bld, orig_dest, dest, 0,
3457                                 instr->num_components);
3458      }
3459      break;
3460   }
3461
3462   case nir_intrinsic_load_barycentric_pixel:
3463   case nir_intrinsic_load_barycentric_centroid:
3464   case nir_intrinsic_load_barycentric_sample:
3465      /* Do nothing - load_interpolated_input handling will take care of it later. */
3466      break;
3467
3468   case nir_intrinsic_load_barycentric_at_sample: {
3469      const glsl_interp_mode interpolation =
3470         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3471
3472      if (nir_src_is_const(instr->src[0])) {
3473         unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;
3474
3475         emit_pixel_interpolater_send(bld,
3476                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3477                                      dest,
3478                                      fs_reg(), /* src */
3479                                      brw_imm_ud(msg_data),
3480                                      interpolation);
3481      } else {
3482         const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3483                                          BRW_REGISTER_TYPE_UD);
3484
3485         if (nir_src_is_dynamically_uniform(instr->src[0])) {
3486            const fs_reg sample_id = bld.emit_uniformize(sample_src);
3487            const fs_reg msg_data = vgrf(glsl_type::uint_type);
3488            bld.exec_all().group(1, 0)
3489               .SHL(msg_data, sample_id, brw_imm_ud(4u));
3490            emit_pixel_interpolater_send(bld,
3491                                         FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3492                                         dest,
3493                                         fs_reg(), /* src */
3494                                         msg_data,
3495                                         interpolation);
3496         } else {
3497            /* Make a loop that sends a message to the pixel interpolater
3498             * for the sample number in each live channel. If there are
3499             * multiple channels with the same sample number then these
3500             * will be handled simultaneously with a single iteration of
3501             * the loop.
3502             */
3503            bld.emit(BRW_OPCODE_DO);
3504
3505            /* Get the next live sample number into sample_id */
3506            const fs_reg sample_id = bld.emit_uniformize(sample_src);
3507
3508            /* Set the flag register so that we can perform the send
3509             * message on all channels that have the same sample number
3510             */
3511            bld.CMP(bld.null_reg_ud(),
3512                    sample_src, sample_id,
3513                    BRW_CONDITIONAL_EQ);
3514            const fs_reg msg_data = vgrf(glsl_type::uint_type);
3515            bld.exec_all().group(1, 0)
3516               .SHL(msg_data, sample_id, brw_imm_ud(4u));
3517            fs_inst *inst =
3518               emit_pixel_interpolater_send(bld,
3519                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3520                                            dest,
3521                                            fs_reg(), /* src */
3522                                            component(msg_data, 0),
3523                                            interpolation);
3524            set_predicate(BRW_PREDICATE_NORMAL, inst);
3525
3526            /* Continue the loop if there are any live channels left */
3527            set_predicate_inv(BRW_PREDICATE_NORMAL,
3528                              true, /* inverse */
3529                              bld.emit(BRW_OPCODE_WHILE));
3530         }
3531      }
3532      break;
3533   }
3534
3535   case nir_intrinsic_load_barycentric_at_offset: {
3536      const glsl_interp_mode interpolation =
3537         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3538
3539      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3540
3541      if (const_offset) {
3542         assert(nir_src_bit_size(instr->src[0]) == 32);
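         /* Convert the constant offsets to S0.4 fixed point (1/16-pixel
          * units) and clamp the upper end to +7/16, mirroring the
          * non-constant path below.
          */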
3543         unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf;
3544         unsigned off_y = MIN2((int)(const_offset[1].f32 * 16), 7) & 0xf;
3545
3546         emit_pixel_interpolater_send(bld,
3547                                      FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3548                                      dest,
3549                                      fs_reg(), /* src */
3550                                      brw_imm_ud(off_x | (off_y << 4)),
3551                                      interpolation);
3552      } else {
3553         fs_reg src = vgrf(glsl_type::ivec2_type);
3554         fs_reg offset_src = retype(get_nir_src(instr->src[0]),
3555                                    BRW_REGISTER_TYPE_F);
3556         for (int i = 0; i < 2; i++) {
3557            fs_reg temp = vgrf(glsl_type::float_type);
3558            bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
3559            fs_reg itemp = vgrf(glsl_type::int_type);
3560            /* float to int */
3561            bld.MOV(itemp, temp);
3562
3563            /* Clamp the upper end of the range to +7/16.
3564             * ARB_gpu_shader5 requires that we support a maximum offset
3565             * of +0.5, which isn't representable in a S0.4 value -- if
3566             * we didn't clamp it, we'd end up with -8/16, which is the
3567             * opposite of what the shader author wanted.
3568             *
3569             * This is legal due to ARB_gpu_shader5's quantization
3570             * rules:
3571             *
3572             * "Not all values of <offset> may be supported; x and y
3573             * offsets may be rounded to fixed-point values with the
3574             * number of fraction bits given by the
3575             * implementation-dependent constant
3576             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
3577             */
3578            set_condmod(BRW_CONDITIONAL_L,
3579                        bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
3580         }
3581
3582         const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3583         emit_pixel_interpolater_send(bld,
3584                                      opcode,
3585                                      dest,
3586                                      src,
3587                                      brw_imm_ud(0u),
3588                                      interpolation);
3589      }
3590      break;
3591   }
3592
3593   case nir_intrinsic_load_interpolated_input: {
3594      if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
3595         emit_fragcoord_interpolation(dest);
3596         break;
3597      }
3598
3599      assert(instr->src[0].ssa &&
3600             instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3601      nir_intrinsic_instr *bary_intrinsic =
3602         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3603      nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3604      enum glsl_interp_mode interp_mode =
3605         (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3606      fs_reg dst_xy;
3607
3608      if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3609          bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3610         /* Use the result of the PI message */
3611         dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3612      } else {
3613         /* Use the delta_xy values computed from the payload */
3614         enum brw_barycentric_mode bary =
3615            brw_barycentric_mode(interp_mode, bary_intrin);
3616
3617         dst_xy = this->delta_xy[bary];
3618      }
3619
3620      for (unsigned int i = 0; i < instr->num_components; i++) {
3621         fs_reg interp =
3622            component(interp_reg(nir_intrinsic_base(instr),
3623                                 nir_intrinsic_component(instr) + i), 0);
3624         interp.type = BRW_REGISTER_TYPE_F;
3625         dest.type = BRW_REGISTER_TYPE_F;
3626
3627         if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3628            fs_reg tmp = vgrf(glsl_type::float_type);
3629            bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3630            bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3631         } else {
3632            bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3633         }
3634      }
3635      break;
3636   }
3637
3638   default:
3639      nir_emit_intrinsic(bld, instr);
3640      break;
3641   }
3642}
3643
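/* Map a constant atomic add of +1/-1 to the dedicated INC/DEC atomic ops,
 * e.g. atomicAdd(x, 1) becomes BRW_AOP_INC, which lets the caller drop the
 * data operand from the message payload.  Everything else uses BRW_AOP_ADD.
 */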
3644static int
3645get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
3646{
3647   if (nir_src_is_const(instr->src[src])) {
3648      int64_t add_val = nir_src_as_int(instr->src[src]);
3649      if (add_val == 1)
3650         return BRW_AOP_INC;
3651      else if (add_val == -1)
3652         return BRW_AOP_DEC;
3653   }
3654
3655   return BRW_AOP_ADD;
3656}
3657
3658void
3659fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3660                                  nir_intrinsic_instr *instr)
3661{
3662   assert(stage == MESA_SHADER_COMPUTE);
3663   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3664
3665   fs_reg dest;
3666   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3667      dest = get_nir_dest(instr->dest);
3668
3669   switch (instr->intrinsic) {
3670   case nir_intrinsic_barrier:
3671      emit_barrier();
3672      cs_prog_data->uses_barrier = true;
3673      break;
3674
3675   case nir_intrinsic_load_subgroup_id:
3676      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);
3677      break;
3678
3679   case nir_intrinsic_load_local_invocation_id:
3680   case nir_intrinsic_load_work_group_id: {
3681      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3682      fs_reg val = nir_system_values[sv];
3683      assert(val.file != BAD_FILE);
3684      dest.type = val.type;
3685      for (unsigned i = 0; i < 3; i++)
3686         bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3687      break;
3688   }
3689
3690   case nir_intrinsic_load_num_work_groups: {
3691      const unsigned surface =
3692         cs_prog_data->binding_table.work_groups_start;
3693
3694      cs_prog_data->uses_num_work_groups = true;
3695
3696      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3697      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface);
3698      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3699      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); /* num components */
3700
3701      /* Read the 3 GLuint components of gl_NumWorkGroups */
3702      for (unsigned i = 0; i < 3; i++) {
3703         srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(i << 2);
3704         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3705                  offset(dest, bld, i), srcs, SURFACE_LOGICAL_NUM_SRCS);
3706      }
3707      break;
3708   }
3709
3710   case nir_intrinsic_shared_atomic_add:
3711      nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr);
3712      break;
3713   case nir_intrinsic_shared_atomic_imin:
3714      nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
3715      break;
3716   case nir_intrinsic_shared_atomic_umin:
3717      nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
3718      break;
3719   case nir_intrinsic_shared_atomic_imax:
3720      nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
3721      break;
3722   case nir_intrinsic_shared_atomic_umax:
3723      nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
3724      break;
3725   case nir_intrinsic_shared_atomic_and:
3726      nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
3727      break;
3728   case nir_intrinsic_shared_atomic_or:
3729      nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
3730      break;
3731   case nir_intrinsic_shared_atomic_xor:
3732      nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
3733      break;
3734   case nir_intrinsic_shared_atomic_exchange:
3735      nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
3736      break;
3737   case nir_intrinsic_shared_atomic_comp_swap:
3738      nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
3739      break;
3740   case nir_intrinsic_shared_atomic_fmin:
3741      nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr);
3742      break;
3743   case nir_intrinsic_shared_atomic_fmax:
3744      nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr);
3745      break;
3746   case nir_intrinsic_shared_atomic_fcomp_swap:
3747      nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr);
3748      break;
3749
3750   case nir_intrinsic_load_shared: {
3751      assert(devinfo->gen >= 7);
3752      assert(stage == MESA_SHADER_COMPUTE);
3753
3754      const unsigned bit_size = nir_dest_bit_size(instr->dest);
3755      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3756      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
3757      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]);
3758      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3759
3760      /* Make dest unsigned because that's what the temporary will be */
3761      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3762
3763      /* Read the vector */
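      /* Dword-aligned loads can use an untyped surface read of the whole
       * vector; smaller alignments fall back to a byte-scattered read of a
       * single component that is then copied into the destination.
       */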
3764      if (nir_intrinsic_align(instr) >= 4) {
3765         assert(nir_dest_bit_size(instr->dest) == 32);
3766         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3767         fs_inst *inst =
3768            bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3769                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3770         inst->size_written = instr->num_components * dispatch_width * 4;
3771      } else {
3772         assert(nir_dest_bit_size(instr->dest) <= 32);
3773         assert(nir_dest_num_components(instr->dest) == 1);
3774         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3775
3776         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
3777         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
3778                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
3779         bld.MOV(dest, read_result);
3780      }
3781      break;
3782   }
3783
3784   case nir_intrinsic_store_shared: {
3785      assert(devinfo->gen >= 7);
3786      assert(stage == MESA_SHADER_COMPUTE);
3787
3788      const unsigned bit_size = nir_src_bit_size(instr->src[0]);
3789      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3790      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
3791      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3792      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3793
3794      fs_reg data = get_nir_src(instr->src[0]);
3795      data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3796
3797      assert(nir_intrinsic_write_mask(instr) ==
3798             (1u << instr->num_components) - 1);
3799      if (nir_intrinsic_align(instr) >= 4) {
3800         assert(nir_src_bit_size(instr->src[0]) == 32);
3801         assert(nir_src_num_components(instr->src[0]) <= 4);
3802         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
3803         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3804         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
3805                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3806      } else {
3807         assert(nir_src_bit_size(instr->src[0]) <= 32);
3808         assert(nir_src_num_components(instr->src[0]) == 1);
3809         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3810
3811         srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
3812         bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
3813
3814         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
3815                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3816      }
3817      break;
3818   }
3819
3820   default:
3821      nir_emit_intrinsic(bld, instr);
3822      break;
3823   }
3824}
3825
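/* Build an immediate holding the identity value of the given reduction
 * operation (e.g. 0 for iadd, 1 for imul, INT_MAX for imin), retyped to the
 * requested register type.
 */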
3826static fs_reg
3827brw_nir_reduction_op_identity(const fs_builder &bld,
3828                              nir_op op, brw_reg_type type)
3829{
3830   nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
3831   switch (type_sz(type)) {
3832   case 2:
3833      assert(type != BRW_REGISTER_TYPE_HF);
3834      return retype(brw_imm_uw(value.u16), type);
3835   case 4:
3836      return retype(brw_imm_ud(value.u32), type);
3837   case 8:
3838      if (type == BRW_REGISTER_TYPE_DF)
3839         return setup_imm_df(bld, value.f64);
3840      else
3841         return retype(brw_imm_u64(value.u64), type);
3842   default:
3843      unreachable("Invalid type size");
3844   }
3845}
3846
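/* Min/max reductions have no dedicated ALU opcode; they map to a SEL that is
 * combined with the conditional modifier returned by
 * brw_cond_mod_for_nir_reduction_op() below.
 */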
3847static opcode
3848brw_op_for_nir_reduction_op(nir_op op)
3849{
3850   switch (op) {
3851   case nir_op_iadd: return BRW_OPCODE_ADD;
3852   case nir_op_fadd: return BRW_OPCODE_ADD;
3853   case nir_op_imul: return BRW_OPCODE_MUL;
3854   case nir_op_fmul: return BRW_OPCODE_MUL;
3855   case nir_op_imin: return BRW_OPCODE_SEL;
3856   case nir_op_umin: return BRW_OPCODE_SEL;
3857   case nir_op_fmin: return BRW_OPCODE_SEL;
3858   case nir_op_imax: return BRW_OPCODE_SEL;
3859   case nir_op_umax: return BRW_OPCODE_SEL;
3860   case nir_op_fmax: return BRW_OPCODE_SEL;
3861   case nir_op_iand: return BRW_OPCODE_AND;
3862   case nir_op_ior:  return BRW_OPCODE_OR;
3863   case nir_op_ixor: return BRW_OPCODE_XOR;
3864   default:
3865      unreachable("Invalid reduction operation");
3866   }
3867}
3868
3869static brw_conditional_mod
3870brw_cond_mod_for_nir_reduction_op(nir_op op)
3871{
3872   switch (op) {
3873   case nir_op_iadd: return BRW_CONDITIONAL_NONE;
3874   case nir_op_fadd: return BRW_CONDITIONAL_NONE;
3875   case nir_op_imul: return BRW_CONDITIONAL_NONE;
3876   case nir_op_fmul: return BRW_CONDITIONAL_NONE;
3877   case nir_op_imin: return BRW_CONDITIONAL_L;
3878   case nir_op_umin: return BRW_CONDITIONAL_L;
3879   case nir_op_fmin: return BRW_CONDITIONAL_L;
3880   case nir_op_imax: return BRW_CONDITIONAL_GE;
3881   case nir_op_umax: return BRW_CONDITIONAL_GE;
3882   case nir_op_fmax: return BRW_CONDITIONAL_GE;
3883   case nir_op_iand: return BRW_CONDITIONAL_NONE;
3884   case nir_op_ior:  return BRW_CONDITIONAL_NONE;
3885   case nir_op_ixor: return BRW_CONDITIONAL_NONE;
3886   default:
3887      unreachable("Invalid reduction operation");
3888   }
3889}
3890
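/* Resolve the surface index for an image intrinsic: rebase the NIR image
 * index by the binding table's image_start and uniformize it, since the
 * message descriptor takes a single scalar surface index.
 */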
3891fs_reg
3892fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld,
3893                                          nir_intrinsic_instr *instr)
3894{
3895   fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD);
3896
3897   if (stage_prog_data->binding_table.image_start > 0) {
3898      if (image.file == BRW_IMMEDIATE_VALUE) {
3899         image.d += stage_prog_data->binding_table.image_start;
3900      } else {
3901         bld.ADD(image, image,
3902                 brw_imm_d(stage_prog_data->binding_table.image_start));
3903      }
3904   }
3905
3906   return bld.emit_uniformize(image);
3907}
3908
3909fs_reg
3910fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
3911                                         nir_intrinsic_instr *instr)
3912{
3913   /* SSBO stores are weird in that their index is in src[1] */
3914   const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0;
3915
3916   fs_reg surf_index;
3917   if (nir_src_is_const(instr->src[src])) {
3918      unsigned index = stage_prog_data->binding_table.ssbo_start +
3919                       nir_src_as_uint(instr->src[src]);
3920      surf_index = brw_imm_ud(index);
3921   } else {
3922      surf_index = vgrf(glsl_type::uint_type);
3923      bld.ADD(surf_index, get_nir_src(instr->src[src]),
3924              brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3925   }
3926
3927   return bld.emit_uniformize(surf_index);
3928}
3929
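/* Number of coordinate components an image intrinsic supplies, e.g. a 2D
 * array image takes three (x, y, array layer); cube images also take three
 * since they are accessed like a layered 2D image.
 */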
3930static unsigned
3931image_intrinsic_coord_components(nir_intrinsic_instr *instr)
3932{
3933   switch (nir_intrinsic_image_dim(instr)) {
3934   case GLSL_SAMPLER_DIM_1D:
3935      return 1 + nir_intrinsic_image_array(instr);
3936   case GLSL_SAMPLER_DIM_2D:
3937   case GLSL_SAMPLER_DIM_RECT:
3938      return 2 + nir_intrinsic_image_array(instr);
3939   case GLSL_SAMPLER_DIM_3D:
3940   case GLSL_SAMPLER_DIM_CUBE:
3941      return 3;
3942   case GLSL_SAMPLER_DIM_BUF:
3943      return 1;
3944   case GLSL_SAMPLER_DIM_MS:
3945      return 2 + nir_intrinsic_image_array(instr);
3946   default:
3947      unreachable("Invalid image dimension");
3948   }
3949}
3950
3951void
3952fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
3953{
3954   fs_reg dest;
3955   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3956      dest = get_nir_dest(instr->dest);
3957
3958   switch (instr->intrinsic) {
3959   case nir_intrinsic_image_load:
3960   case nir_intrinsic_image_store:
3961   case nir_intrinsic_image_atomic_add:
3962   case nir_intrinsic_image_atomic_min:
3963   case nir_intrinsic_image_atomic_max:
3964   case nir_intrinsic_image_atomic_and:
3965   case nir_intrinsic_image_atomic_or:
3966   case nir_intrinsic_image_atomic_xor:
3967   case nir_intrinsic_image_atomic_exchange:
3968   case nir_intrinsic_image_atomic_comp_swap:
3969   case nir_intrinsic_bindless_image_load:
3970   case nir_intrinsic_bindless_image_store:
3971   case nir_intrinsic_bindless_image_atomic_add:
3972   case nir_intrinsic_bindless_image_atomic_min:
3973   case nir_intrinsic_bindless_image_atomic_max:
3974   case nir_intrinsic_bindless_image_atomic_and:
3975   case nir_intrinsic_bindless_image_atomic_or:
3976   case nir_intrinsic_bindless_image_atomic_xor:
3977   case nir_intrinsic_bindless_image_atomic_exchange:
3978   case nir_intrinsic_bindless_image_atomic_comp_swap: {
3979      if (stage == MESA_SHADER_FRAGMENT &&
3980          instr->intrinsic != nir_intrinsic_image_load)
3981         brw_wm_prog_data(prog_data)->has_side_effects = true;
3982
3983      /* Get some metadata from the image intrinsic. */
3984      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3985      const GLenum format = nir_intrinsic_format(instr);
3986
3987      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3988
3989      switch (instr->intrinsic) {
3990      case nir_intrinsic_image_load:
3991      case nir_intrinsic_image_store:
3992      case nir_intrinsic_image_atomic_add:
3993      case nir_intrinsic_image_atomic_min:
3994      case nir_intrinsic_image_atomic_max:
3995      case nir_intrinsic_image_atomic_and:
3996      case nir_intrinsic_image_atomic_or:
3997      case nir_intrinsic_image_atomic_xor:
3998      case nir_intrinsic_image_atomic_exchange:
3999      case nir_intrinsic_image_atomic_comp_swap:
4000         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4001            get_nir_image_intrinsic_image(bld, instr);
4002         break;
4003
4004      default:
4005         /* Bindless */
4006         srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
4007            bld.emit_uniformize(get_nir_src(instr->src[0]));
4008         break;
4009      }
4010
4011      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4012      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
4013         brw_imm_ud(image_intrinsic_coord_components(instr));
4014
4015      /* Emit an image load, store or atomic op. */
4016      if (instr->intrinsic == nir_intrinsic_image_load ||
4017          instr->intrinsic == nir_intrinsic_bindless_image_load) {
4018         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4019         fs_inst *inst =
4020            bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
4021                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4022         inst->size_written = instr->num_components * dispatch_width * 4;
4023      } else if (instr->intrinsic == nir_intrinsic_image_store ||
4024                 instr->intrinsic == nir_intrinsic_bindless_image_store) {
4025         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4026         srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);
4027         bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
4028                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4029      } else {
4030         int op;
4031         unsigned num_srcs = info->num_srcs;
4032
4033         switch (instr->intrinsic) {
4034         case nir_intrinsic_image_atomic_add:
4035         case nir_intrinsic_bindless_image_atomic_add:
4036            assert(num_srcs == 4);
4037
4038            op = get_op_for_atomic_add(instr, 3);
4039
4040            if (op != BRW_AOP_ADD)
4041               num_srcs = 3;
4042            break;
4043         case nir_intrinsic_image_atomic_min:
4044         case nir_intrinsic_bindless_image_atomic_min:
4045            assert(format == GL_R32UI || format == GL_R32I);
4046            op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN;
4047            break;
4048         case nir_intrinsic_image_atomic_max:
4049         case nir_intrinsic_bindless_image_atomic_max:
4050            assert(format == GL_R32UI || format == GL_R32I);
4051            op = (format == GL_R32I) ? BRW_AOP_IMAX : BRW_AOP_UMAX;
4052            break;
4053         case nir_intrinsic_image_atomic_and:
4054         case nir_intrinsic_bindless_image_atomic_and:
4055            op = BRW_AOP_AND;
4056            break;
4057         case nir_intrinsic_image_atomic_or:
4058         case nir_intrinsic_bindless_image_atomic_or:
4059            op = BRW_AOP_OR;
4060            break;
4061         case nir_intrinsic_image_atomic_xor:
4062         case nir_intrinsic_bindless_image_atomic_xor:
4063            op = BRW_AOP_XOR;
4064            break;
4065         case nir_intrinsic_image_atomic_exchange:
4066         case nir_intrinsic_bindless_image_atomic_exchange:
4067            op = BRW_AOP_MOV;
4068            break;
4069         case nir_intrinsic_image_atomic_comp_swap:
4070         case nir_intrinsic_bindless_image_atomic_comp_swap:
4071            op = BRW_AOP_CMPWR;
4072            break;
4073         default:
4074            unreachable("Not reachable.");
4075         }
4076
4077         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4078
4079         fs_reg data;
4080         if (num_srcs >= 4)
4081            data = get_nir_src(instr->src[3]);
4082         if (num_srcs >= 5) {
4083            fs_reg tmp = bld.vgrf(data.type, 2);
4084            fs_reg sources[2] = { data, get_nir_src(instr->src[4]) };
4085            bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4086            data = tmp;
4087         }
4088         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4089
4090         bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
4091                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4092      }
4093      break;
4094   }
4095
4096   case nir_intrinsic_image_size:
4097   case nir_intrinsic_bindless_image_size: {
4098      /* Unlike the [un]typed load and store opcodes, the TXS that this turns
4099       * into will handle the binding table index for us in the generator.
4100       * Incidentally, this means that we can handle bindless with exactly the
4101       * same code.
4102       */
4103      fs_reg image = retype(get_nir_src_imm(instr->src[0]),
4104                            BRW_REGISTER_TYPE_UD);
4105      image = bld.emit_uniformize(image);
4106
4107      fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4108      if (instr->intrinsic == nir_intrinsic_image_size)
4109         srcs[TEX_LOGICAL_SRC_SURFACE] = image;
4110      else
4111         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
4112      srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
4113      srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
4114      srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
4115
4116      /* Since the image size is always uniform, we can just emit a SIMD8
4117       * query instruction and splat the result out.
4118       */
4119      const fs_builder ubld = bld.exec_all().group(8, 0);
4120
4121      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4122      fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
4123                                tmp, srcs, ARRAY_SIZE(srcs));
4124      inst->size_written = 4 * REG_SIZE;
4125
4126      for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
4127         if (c == 2 && nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE) {
4128            bld.emit(SHADER_OPCODE_INT_QUOTIENT,
4129                     offset(retype(dest, tmp.type), bld, c),
4130                     component(offset(tmp, ubld, c), 0), brw_imm_ud(6));
4131         } else {
4132            bld.MOV(offset(retype(dest, tmp.type), bld, c),
4133                    component(offset(tmp, ubld, c), 0));
4134         }
4135      }
4136      break;
4137   }
4138
4139   case nir_intrinsic_image_load_raw_intel: {
4140      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4141      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4142         get_nir_image_intrinsic_image(bld, instr);
4143      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4144      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4145      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4146
4147      fs_inst *inst =
4148         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4149                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4150      inst->size_written = instr->num_components * dispatch_width * 4;
4151      break;
4152   }
4153
4154   case nir_intrinsic_image_store_raw_intel: {
4155      if (stage == MESA_SHADER_FRAGMENT)
4156         brw_wm_prog_data(prog_data)->has_side_effects = true;
4157
4158      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4159      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4160         get_nir_image_intrinsic_image(bld, instr);
4161      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4162      srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]);
4163      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4164      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4165
4166      bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4167               fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4168      break;
4169   }
4170
4171   case nir_intrinsic_group_memory_barrier:
4172   case nir_intrinsic_memory_barrier_shared:
4173   case nir_intrinsic_memory_barrier_atomic_counter:
4174   case nir_intrinsic_memory_barrier_buffer:
4175   case nir_intrinsic_memory_barrier_image:
4176   case nir_intrinsic_memory_barrier: {
4177      const fs_builder ubld = bld.group(8, 0);
4178      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4179      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
4180                brw_vec8_grf(0, 0), brw_imm_ud(0))
4181         ->size_written = 2 * REG_SIZE;
4182      break;
4183   }
4184
4185   case nir_intrinsic_shader_clock: {
4186      /* We cannot do anything if there is an event, so ignore it for now */
4187      const fs_reg shader_clock = get_timestamp(bld);
4188      const fs_reg srcs[] = { component(shader_clock, 0),
4189                              component(shader_clock, 1) };
4190      bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4191      break;
4192   }
4193
4194   case nir_intrinsic_image_samples:
4195      /* The driver does not support multi-sampled images. */
4196      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
4197      break;
4198
4199   case nir_intrinsic_load_uniform: {
4200      /* Offsets are in bytes but they should always be aligned to
4201       * the type size.
4202       */
4203      assert(instr->const_index[0] % 4 == 0 ||
4204             instr->const_index[0] % type_sz(dest.type) == 0);
4205
4206      fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
4207
4208      if (nir_src_is_const(instr->src[0])) {
4209         unsigned load_offset = nir_src_as_uint(instr->src[0]);
4210         assert(load_offset % type_sz(dest.type) == 0);
4211         /* For 16-bit types we add const_index[0] modulo 4 to the offset
4212          * so we can access elements that are not 32-bit aligned.
4213          */
4214         src.offset = load_offset + instr->const_index[0] % 4;
4215
4216         for (unsigned j = 0; j < instr->num_components; j++) {
4217            bld.MOV(offset(dest, bld, j), offset(src, bld, j));
4218         }
4219      } else {
4220         fs_reg indirect = retype(get_nir_src(instr->src[0]),
4221                                  BRW_REGISTER_TYPE_UD);
4222
4223         /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4224          * go past the end of the uniform.  In order to keep the n'th
4225          * component from running past, we subtract off the size of all but
4226          * one component of the vector.
4227          */
4228         assert(instr->const_index[1] >=
4229                instr->num_components * (int) type_sz(dest.type));
4230         unsigned read_size = instr->const_index[1] -
4231            (instr->num_components - 1) * type_sz(dest.type);
4232
4233         bool supports_64bit_indirects =
4234            !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo);
4235
4236         if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
4237            for (unsigned j = 0; j < instr->num_components; j++) {
4238               bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4239                        offset(dest, bld, j), offset(src, bld, j),
4240                        indirect, brw_imm_ud(read_size));
4241            }
4242         } else {
4243            const unsigned num_mov_indirects =
4244               type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
4245            /* We read a little less per MOV_INDIRECT, as each one is now
4246             * 32-bit instead of 64-bit, so adjust read_size accordingly.
4247             */
4248            const unsigned read_size_32bit = read_size -
4249                (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
4250            for (unsigned j = 0; j < instr->num_components; j++) {
4251               for (unsigned i = 0; i < num_mov_indirects; i++) {
4252                  bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4253                           subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
4254                           subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
4255                           indirect, brw_imm_ud(read_size_32bit));
4256               }
4257            }
4258         }
4259      }
4260      break;
4261   }
4262
4263   case nir_intrinsic_load_ubo: {
4264      fs_reg surf_index;
4265      if (nir_src_is_const(instr->src[0])) {
4266         const unsigned index = stage_prog_data->binding_table.ubo_start +
4267                                nir_src_as_uint(instr->src[0]);
4268         surf_index = brw_imm_ud(index);
4269      } else {
4270         /* The block index is not a constant. Evaluate the index expression
4271          * per-channel and add the base UBO index; we have to select a value
4272          * from any live channel.
4273          */
4274         surf_index = vgrf(glsl_type::uint_type);
4275         bld.ADD(surf_index, get_nir_src(instr->src[0]),
4276                 brw_imm_ud(stage_prog_data->binding_table.ubo_start));
4277         surf_index = bld.emit_uniformize(surf_index);
4278      }
4279
4280      if (!nir_src_is_const(instr->src[1])) {
4281         fs_reg base_offset = retype(get_nir_src(instr->src[1]),
4282                                     BRW_REGISTER_TYPE_UD);
4283
4284         for (int i = 0; i < instr->num_components; i++)
4285            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
4286                                       base_offset, i * type_sz(dest.type));
4287      } else {
4288         /* Even if we are loading doubles, a pull constant load will load
4289          * a 32-bit vec4, so we should only reserve vgrf space for that. If we
4290          * need to load a full dvec4 we will have to emit 2 loads. This is
4291          * similar to demote_pull_constants(), except that in that case we
4292          * see individual accesses to each component of the vector and then
4293          * we let CSE deal with duplicate loads. Here we see a vector access
4294          * and we have to split it if necessary.
4295          */
4296         const unsigned type_size = type_sz(dest.type);
4297         const unsigned load_offset = nir_src_as_uint(instr->src[1]);
4298
4299         /* See if we've selected this as a push constant candidate */
4300         if (nir_src_is_const(instr->src[0])) {
4301            const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
4302            const unsigned offset_256b = load_offset / 32;
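            /* For example, a load at byte offset 80 has offset_256b == 2; if
             * some ubo_ranges[i] has block == ubo_block, start == 1 and
             * length == 3 it covers [1, 4), so the load is handled as a push
             * constant below with push_reg.offset == 80 - 32 * 1 == 48.
             */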
4303
4304            fs_reg push_reg;
4305            for (int i = 0; i < 4; i++) {
4306               const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
4307               if (range->block == ubo_block &&
4308                   offset_256b >= range->start &&
4309                   offset_256b < range->start + range->length) {
4310
4311                  push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
4312                  push_reg.offset = load_offset - 32 * range->start;
4313                  break;
4314               }
4315            }
4316
4317            if (push_reg.file != BAD_FILE) {
4318               for (unsigned i = 0; i < instr->num_components; i++) {
4319                  bld.MOV(offset(dest, bld, i),
4320                          byte_offset(push_reg, i * type_size));
4321               }
4322               break;
4323            }
4324         }
4325
4326         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
4327         const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
4328         const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4329
4330         for (unsigned c = 0; c < instr->num_components;) {
4331            const unsigned base = load_offset + c * type_size;
4332            /* Number of usable components in the next block-aligned load. */
4333            const unsigned count = MIN2(instr->num_components - c,
4334                                        (block_sz - base % block_sz) / type_size);
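            /* For example, a vec4 of 32-bit values at load_offset 56 straddles
             * a cacheline: the first iteration has base == 56 and count == 2,
             * the second has base == 64 and count == 2, so two pull loads get
             * emitted.
             */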
4335
4336            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
4337                      packed_consts, surf_index,
4338                      brw_imm_ud(base & ~(block_sz - 1)));
4339
4340            const fs_reg consts =
4341               retype(byte_offset(packed_consts, base & (block_sz - 1)),
4342                      dest.type);
4343
4344            for (unsigned d = 0; d < count; d++)
4345               bld.MOV(offset(dest, bld, c + d), component(consts, d));
4346
4347            c += count;
4348         }
4349      }
4350      break;
4351   }
4352
4353   case nir_intrinsic_load_global: {
4354      assert(devinfo->gen >= 8);
4355
4356      if (nir_intrinsic_align(instr) >= 4) {
4357         assert(nir_dest_bit_size(instr->dest) == 32);
4358         fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
4359                                  dest,
4360                                  get_nir_src(instr->src[0]), /* Address */
4361                                  fs_reg(), /* No source data */
4362                                  brw_imm_ud(instr->num_components));
4363         inst->size_written = instr->num_components *
4364                              inst->dst.component_size(inst->exec_size);
4365      } else {
4366         const unsigned bit_size = nir_dest_bit_size(instr->dest);
4367         assert(bit_size <= 32);
4368         assert(nir_dest_num_components(instr->dest) == 1);
4369         brw_reg_type data_type =
4370            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4371         fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4372         bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
4373                  tmp,
4374                  get_nir_src(instr->src[0]), /* Address */
4375                  fs_reg(), /* No source data */
4376                  brw_imm_ud(bit_size));
4377         bld.MOV(retype(dest, data_type), tmp);
4378      }
4379      break;
4380   }
4381
4382   case nir_intrinsic_store_global:
4383      assert(devinfo->gen >= 8);
4384
4385      if (stage == MESA_SHADER_FRAGMENT)
4386         brw_wm_prog_data(prog_data)->has_side_effects = true;
4387
4388      if (nir_intrinsic_align(instr) >= 4) {
4389         assert(nir_src_bit_size(instr->src[0]) == 32);
4390         bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
4391                  fs_reg(),
4392                  get_nir_src(instr->src[1]), /* Address */
4393                  get_nir_src(instr->src[0]), /* Data */
4394                  brw_imm_ud(instr->num_components));
4395      } else {
4396         const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4397         assert(bit_size <= 32);
4398         assert(nir_src_num_components(instr->src[0]) == 1);
4399         brw_reg_type data_type =
4400            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4401         fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4402         bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type));
4403         bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
4404                  fs_reg(),
4405                  get_nir_src(instr->src[1]), /* Address */
4406                  tmp, /* Data */
4407                  brw_imm_ud(nir_src_bit_size(instr->src[0])));
4408      }
4409      break;
4410
4411   case nir_intrinsic_global_atomic_add:
4412      nir_emit_global_atomic(bld, get_op_for_atomic_add(instr, 1), instr);
4413      break;
4414   case nir_intrinsic_global_atomic_imin:
4415      nir_emit_global_atomic(bld, BRW_AOP_IMIN, instr);
4416      break;
4417   case nir_intrinsic_global_atomic_umin:
4418      nir_emit_global_atomic(bld, BRW_AOP_UMIN, instr);
4419      break;
4420   case nir_intrinsic_global_atomic_imax:
4421      nir_emit_global_atomic(bld, BRW_AOP_IMAX, instr);
4422      break;
4423   case nir_intrinsic_global_atomic_umax:
4424      nir_emit_global_atomic(bld, BRW_AOP_UMAX, instr);
4425      break;
4426   case nir_intrinsic_global_atomic_and:
4427      nir_emit_global_atomic(bld, BRW_AOP_AND, instr);
4428      break;
4429   case nir_intrinsic_global_atomic_or:
4430      nir_emit_global_atomic(bld, BRW_AOP_OR, instr);
4431      break;
4432   case nir_intrinsic_global_atomic_xor:
4433      nir_emit_global_atomic(bld, BRW_AOP_XOR, instr);
4434      break;
4435   case nir_intrinsic_global_atomic_exchange:
4436      nir_emit_global_atomic(bld, BRW_AOP_MOV, instr);
4437      break;
4438   case nir_intrinsic_global_atomic_comp_swap:
4439      nir_emit_global_atomic(bld, BRW_AOP_CMPWR, instr);
4440      break;
4441   case nir_intrinsic_global_atomic_fmin:
4442      nir_emit_global_atomic_float(bld, BRW_AOP_FMIN, instr);
4443      break;
4444   case nir_intrinsic_global_atomic_fmax:
4445      nir_emit_global_atomic_float(bld, BRW_AOP_FMAX, instr);
4446      break;
4447   case nir_intrinsic_global_atomic_fcomp_swap:
4448      nir_emit_global_atomic_float(bld, BRW_AOP_FCMPWR, instr);
4449      break;
4450
4451   case nir_intrinsic_load_ssbo: {
4452      assert(devinfo->gen >= 7);
4453
4454      const unsigned bit_size = nir_dest_bit_size(instr->dest);
4455      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4456      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4457         get_nir_ssbo_intrinsic_index(bld, instr);
4458      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4459      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4460
4461      /* Make dest unsigned because that's what the temporary will be */
4462      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4463
4464      /* Read the vector */
4465      if (nir_intrinsic_align(instr) >= 4) {
4466         assert(nir_dest_bit_size(instr->dest) == 32);
4467         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4468         fs_inst *inst =
4469            bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4470                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4471         inst->size_written = instr->num_components * dispatch_width * 4;
4472      } else {
4473         assert(nir_dest_bit_size(instr->dest) <= 32);
4474         assert(nir_dest_num_components(instr->dest) == 1);
4475         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4476
4477         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
4478         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4479                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4480         bld.MOV(dest, read_result);
4481      }
4482      break;
4483   }
4484
4485   case nir_intrinsic_store_ssbo: {
4486      assert(devinfo->gen >= 7);
4487
4488      if (stage == MESA_SHADER_FRAGMENT)
4489         brw_wm_prog_data(prog_data)->has_side_effects = true;
4490
4491      const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4492      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4493      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4494         get_nir_ssbo_intrinsic_index(bld, instr);
4495      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]);
4496      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4497
4498      fs_reg data = get_nir_src(instr->src[0]);
4499      data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4500
4501      assert(nir_intrinsic_write_mask(instr) ==
4502             (1u << instr->num_components) - 1);
4503      if (nir_intrinsic_align(instr) >= 4) {
4504         assert(nir_src_bit_size(instr->src[0]) == 32);
4505         assert(nir_src_num_components(instr->src[0]) <= 4);
4506         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4507         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4508         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4509                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4510      } else {
4511         assert(nir_src_bit_size(instr->src[0]) <= 32);
4512         assert(nir_src_num_components(instr->src[0]) == 1);
4513         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4514
4515         srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
4516         bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
4517
4518         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4519                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4520      }
4521      break;
4522   }
4523
4524   case nir_intrinsic_store_output: {
4525      fs_reg src = get_nir_src(instr->src[0]);
4526
4527      unsigned store_offset = nir_src_as_uint(instr->src[1]);
4528      unsigned num_components = instr->num_components;
4529      unsigned first_component = nir_intrinsic_component(instr);
4530      if (nir_src_bit_size(instr->src[0]) == 64) {
4531         src = shuffle_for_32bit_write(bld, src, 0, num_components);
4532         num_components *= 2;
4533      }
4534
4535      fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4536                                      4 * store_offset), src.type);
4537      for (unsigned j = 0; j < num_components; j++) {
4538         bld.MOV(offset(new_dest, bld, j + first_component),
4539                 offset(src, bld, j));
4540      }
4541      break;
4542   }
4543
4544   case nir_intrinsic_ssbo_atomic_add:
4545      nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr);
4546      break;
4547   case nir_intrinsic_ssbo_atomic_imin:
4548      nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
4549      break;
4550   case nir_intrinsic_ssbo_atomic_umin:
4551      nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
4552      break;
4553   case nir_intrinsic_ssbo_atomic_imax:
4554      nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
4555      break;
4556   case nir_intrinsic_ssbo_atomic_umax:
4557      nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
4558      break;
4559   case nir_intrinsic_ssbo_atomic_and:
4560      nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
4561      break;
4562   case nir_intrinsic_ssbo_atomic_or:
4563      nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
4564      break;
4565   case nir_intrinsic_ssbo_atomic_xor:
4566      nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
4567      break;
4568   case nir_intrinsic_ssbo_atomic_exchange:
4569      nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
4570      break;
4571   case nir_intrinsic_ssbo_atomic_comp_swap:
4572      nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
4573      break;
4574   case nir_intrinsic_ssbo_atomic_fmin:
4575      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr);
4576      break;
4577   case nir_intrinsic_ssbo_atomic_fmax:
4578      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr);
4579      break;
4580   case nir_intrinsic_ssbo_atomic_fcomp_swap:
4581      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr);
4582      break;
4583
4584   case nir_intrinsic_get_buffer_size: {
4585      assert(nir_src_num_components(instr->src[0]) == 1);
4586      unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
4587                            nir_src_as_uint(instr->src[0]) : 0;
4588
4589      /* A resinfo's sampler message is used to get the buffer size.  The
4590       * SIMD8's writeback message consists of four registers and SIMD16's
4591       * writeback message consists of 8 destination registers (two per
4592       * component).  Because we are only interested in the first channel of
4593       * the first returned component, where resinfo returns the buffer size
4594       * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
4595       * the dispatch width.
4596       */
4597      const fs_builder ubld = bld.exec_all().group(8, 0);
4598      fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4599      fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4600
4601      /* Set LOD = 0 */
4602      ubld.MOV(src_payload, brw_imm_d(0));
4603
4604      const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
4605      fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
4606                                src_payload, brw_imm_ud(index));
4607      inst->header_size = 0;
4608      inst->mlen = 1;
4609      inst->size_written = 4 * REG_SIZE;
4610
4611      /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
4612       *
4613       * "Out-of-bounds checking is always performed at a DWord granularity. If
4614       * any part of the DWord is out-of-bounds then the whole DWord is
4615       * considered out-of-bounds."
4616       *
4617       * This implies that types smaller than 4 bytes need to be padded if
4618       * they don't complete the last dword of the buffer. But since we must
4619       * report the original size (e.g. to compute the number of elements of
4620       * an unsized array), we have to undo that padding. The padding needed
4621       * for the buffer was stored in the two low bits of the surface size,
4622       * so here we recover the original buffer_size by reversing the
4623       * surface_size calculation:
4624       *
4625       * surface_size = isl_align(buffer_size, 4) +
4626       *                (isl_align(buffer_size, 4) - buffer_size)
4627       *
4628       * buffer_size = (surface_size & ~3) - (surface_size & 3)
4629       */
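      /* For example, a 6-byte buffer yields surface_size = isl_align(6, 4) +
       * (isl_align(6, 4) - 6) = 8 + 2 = 10, and below we recover
       * buffer_size = (10 & ~3) - (10 & 3) = 8 - 2 = 6.
       */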
4630
4631      fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4632      fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4633      fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4634
4635      ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
4636      ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
4637      ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
4638
4639      bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
4640      break;
4641   }
4642
4643   case nir_intrinsic_load_subgroup_invocation:
4644      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
4645              nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
4646      break;
4647
4648   case nir_intrinsic_load_subgroup_eq_mask:
4649   case nir_intrinsic_load_subgroup_ge_mask:
4650   case nir_intrinsic_load_subgroup_gt_mask:
4651   case nir_intrinsic_load_subgroup_le_mask:
4652   case nir_intrinsic_load_subgroup_lt_mask:
4653      unreachable("not reached");
4654
4655   case nir_intrinsic_vote_any: {
4656      const fs_builder ubld = bld.exec_all().group(1, 0);
4657
4658      /* The any/all predicates do not consider channel enables. To prevent
4659       * dead channels from affecting the result, we initialize the flag with
4660       * the identity value for the logical operation.
4661       */
4662      if (dispatch_width == 32) {
4663         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4664         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4665                         brw_imm_ud(0));
4666      } else {
4667         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
4668      }
4669      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4670
4671      /* For some reason, the any/all predicates don't work properly with
4672       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4673       * doesn't read the correct subset of the flag register and you end up
4674       * getting garbage in the second half.  Work around this by using a pair
4675       * of 1-wide MOVs and scattering the result.
4676       */
4677      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4678      ubld.MOV(res1, brw_imm_d(0));
4679      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
4680                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
4681                                           BRW_PREDICATE_ALIGN1_ANY32H,
4682                    ubld.MOV(res1, brw_imm_d(-1)));
4683
4684      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4685      break;
4686   }
4687   case nir_intrinsic_vote_all: {
4688      const fs_builder ubld = bld.exec_all().group(1, 0);
4689
4690      /* The any/all predicates do not consider channel enables. To prevent
4691       * dead channels from affecting the result, we initialize the flag with
4692       * the identity value for the logical operation.
4693       */
4694      if (dispatch_width == 32) {
4695         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4696         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4697                         brw_imm_ud(0xffffffff));
4698      } else {
4699         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4700      }
4701      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4702
4703      /* For some reason, the any/all predicates don't work properly with
4704       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4705       * doesn't read the correct subset of the flag register and you end up
4706       * getting garbage in the second half.  Work around this by using a pair
4707       * of 1-wide MOVs and scattering the result.
4708       */
4709      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4710      ubld.MOV(res1, brw_imm_d(0));
4711      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
4712                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4713                                           BRW_PREDICATE_ALIGN1_ALL32H,
4714                    ubld.MOV(res1, brw_imm_d(-1)));
4715
4716      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4717      break;
4718   }
4719   case nir_intrinsic_vote_feq:
4720   case nir_intrinsic_vote_ieq: {
4721      fs_reg value = get_nir_src(instr->src[0]);
4722      if (instr->intrinsic == nir_intrinsic_vote_feq) {
4723         const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4724         value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
4725            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
4726      }
4727
4728      fs_reg uniformized = bld.emit_uniformize(value);
4729      const fs_builder ubld = bld.exec_all().group(1, 0);
4730
4731      /* The any/all predicates do not consider channel enables. To prevent
4732       * dead channels from affecting the result, we initialize the flag with
4733       * the identity value for the logical operation.
4734       */
4735      if (dispatch_width == 32) {
4736         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4737         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4738                         brw_imm_ud(0xffffffff));
4739      } else {
4740         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4741      }
4742      bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
4743
4744      /* For some reason, the any/all predicates don't work properly with
4745       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
4746       * doesn't read the correct subset of the flag register and you end up
4747       * getting garbage in the second half.  Work around this by using a pair
4748       * of 1-wide MOVs and scattering the result.
4749       */
4750      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4751      ubld.MOV(res1, brw_imm_d(0));
4752      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
4753                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4754                                           BRW_PREDICATE_ALIGN1_ALL32H,
4755                    ubld.MOV(res1, brw_imm_d(-1)));
4756
4757      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4758      break;
4759   }
4760
4761   case nir_intrinsic_ballot: {
4762      const fs_reg value = retype(get_nir_src(instr->src[0]),
4763                                  BRW_REGISTER_TYPE_UD);
4764      struct brw_reg flag = brw_flag_reg(0, 0);
4765      /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
4766       * as f0.0.  This is a problem for fragment programs as we currently use
4767       * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
4768       * programs yet so this isn't a problem.  When we do, something will
4769       * have to change.
4770       */
4771      if (dispatch_width == 32)
4772         flag.type = BRW_REGISTER_TYPE_UD;
4773
4774      bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));
4775      bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);
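      /* The CMP above sets one flag bit per enabled channel whose value is
       * non-zero; copying the flag register into dest below hands every
       * channel the full ballot mask.
       */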
4776
4777      if (instr->dest.ssa.bit_size > 32) {
4778         dest.type = BRW_REGISTER_TYPE_UQ;
4779      } else {
4780         dest.type = BRW_REGISTER_TYPE_UD;
4781      }
4782      bld.MOV(dest, flag);
4783      break;
4784   }
4785
4786   case nir_intrinsic_read_invocation: {
4787      const fs_reg value = get_nir_src(instr->src[0]);
4788      const fs_reg invocation = get_nir_src(instr->src[1]);
4789      fs_reg tmp = bld.vgrf(value.type);
4790
4791      bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
4792                          bld.emit_uniformize(invocation));
4793
4794      bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
4795      break;
4796   }
4797
4798   case nir_intrinsic_read_first_invocation: {
4799      const fs_reg value = get_nir_src(instr->src[0]);
4800      bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
4801      break;
4802   }
4803
4804   case nir_intrinsic_shuffle: {
4805      const fs_reg value = get_nir_src(instr->src[0]);
4806      const fs_reg index = get_nir_src(instr->src[1]);
4807
4808      bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
4809      break;
4810   }
4811
4812   case nir_intrinsic_first_invocation: {
4813      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4814      bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
4815      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
4816              fs_reg(component(tmp, 0)));
4817      break;
4818   }
4819
4820   case nir_intrinsic_quad_broadcast: {
4821      const fs_reg value = get_nir_src(instr->src[0]);
4822      const unsigned index = nir_src_as_uint(instr->src[1]);
4823
4824      bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
4825               value, brw_imm_ud(index), brw_imm_ud(4));
4826      break;
4827   }
4828
4829   case nir_intrinsic_quad_swap_horizontal: {
4830      const fs_reg value = get_nir_src(instr->src[0]);
4831      const fs_reg tmp = bld.vgrf(value.type);
4832      if (devinfo->gen <= 7) {
4833         /* The hardware doesn't seem to support these crazy regions with
4834          * compressed instructions on gen7 and earlier so we fall back to
4835          * using quad swizzles.  Fortunately, we don't support 64-bit
4836          * anything in Vulkan on gen7.
4837          */
4838         assert(nir_src_bit_size(instr->src[0]) == 32);
4839         const fs_builder ubld = bld.exec_all();
4840         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4841                   brw_imm_ud(BRW_SWIZZLE4(1,0,3,2)));
4843      } else {
4844         const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
4845
4846         const fs_reg src_left = horiz_stride(value, 2);
4847         const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
4848         const fs_reg tmp_left = horiz_stride(tmp, 2);
4849         const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
4850
4851         ubld.MOV(tmp_left, src_right);
4852         ubld.MOV(tmp_right, src_left);
4853
4854      }
4855      bld.MOV(retype(dest, value.type), tmp);
4856      break;
4857   }
4858
4859   case nir_intrinsic_quad_swap_vertical: {
4860      const fs_reg value = get_nir_src(instr->src[0]);
4861      if (nir_src_bit_size(instr->src[0]) == 32) {
4862         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
4863         const fs_reg tmp = bld.vgrf(value.type);
4864         const fs_builder ubld = bld.exec_all();
4865         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4866                   brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
4867         bld.MOV(retype(dest, value.type), tmp);
4868      } else {
4869         /* For larger data types, we have to either emit dispatch_width many
4870          * MOVs or else fall back to doing indirects.
4871          */
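         /* XORing the subgroup invocation with 2 makes channels 0,1,2,3 read
          * from 2,3,0,1, i.e. it swaps the two rows of each 2x2 quad.
          */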
4872         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4873         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4874                      brw_imm_w(0x2));
4875         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
4876      }
4877      break;
4878   }
4879
4880   case nir_intrinsic_quad_swap_diagonal: {
4881      const fs_reg value = get_nir_src(instr->src[0]);
4882      if (nir_src_bit_size(instr->src[0]) == 32) {
4883         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
4884         const fs_reg tmp = bld.vgrf(value.type);
4885         const fs_builder ubld = bld.exec_all();
4886         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4887                   brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
4888         bld.MOV(retype(dest, value.type), tmp);
4889      } else {
4890         /* For larger data types, we have to either emit dispatch_width many
4891          * MOVs or else fall back to doing indirects.
4892          */
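         /* XORing the subgroup invocation with 3 makes channels 0,1,2,3 read
          * from 3,2,1,0, i.e. it swaps along the diagonal of each 2x2 quad.
          */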
4893         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4894         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4895                      brw_imm_w(0x3));
4896         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
4897      }
4898      break;
4899   }
4900
4901   case nir_intrinsic_reduce: {
4902      fs_reg src = get_nir_src(instr->src[0]);
4903      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4904      unsigned cluster_size = nir_intrinsic_cluster_size(instr);
4905      if (cluster_size == 0 || cluster_size > dispatch_width)
4906         cluster_size = dispatch_width;
4907
4908      /* Figure out the source type */
4909      src.type = brw_type_for_nir_type(devinfo,
4910         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4911                        nir_src_bit_size(instr->src[0])));
4912
4913      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4914      opcode brw_op = brw_op_for_nir_reduction_op(redop);
4915      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4916
4917      /* Set up a register for all of our scratching around and initialize it
4918       * to the reduction operation's identity value.
4919       */
4920      fs_reg scan = bld.vgrf(src.type);
4921      bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4922
4923      bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
4924
4925      dest.type = src.type;
4926      if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
4927         /* In this case, the CLUSTER_BROADCAST instruction isn't needed
4928          * because the distance between clusters is at least 2 GRFs, so we
4929          * can skip its weird striding and just do regular MOVs.
4931          */
4932         assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
4933         const unsigned groups =
4934            (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
4935         const unsigned group_size = dispatch_width / groups;
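         /* For example, in SIMD16 with a 64-bit type and cluster_size == 16,
          * groups == (16 * 8) / 64 == 2 and group_size == 8; both 8-wide MOVs
          * below broadcast component 15, the last channel of the only cluster.
          */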
4936         for (unsigned i = 0; i < groups; i++) {
4937            const unsigned cluster = (i * group_size) / cluster_size;
4938            const unsigned comp = cluster * cluster_size + (cluster_size - 1);
4939            bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
4940                                         component(scan, comp));
4941         }
4942      } else {
4943         bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
4944                  brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
4945      }
4946      break;
4947   }
4948
4949   case nir_intrinsic_inclusive_scan:
4950   case nir_intrinsic_exclusive_scan: {
4951      fs_reg src = get_nir_src(instr->src[0]);
4952      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4953
4954      /* Figure out the source type */
4955      src.type = brw_type_for_nir_type(devinfo,
4956         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4957                        nir_src_bit_size(instr->src[0])));
4958
4959      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4960      opcode brw_op = brw_op_for_nir_reduction_op(redop);
4961      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4962
4963      /* Set up a register for all of our scratching around and initialize it
4964       * to the reduction operation's identity value.
4965       */
4966      fs_reg scan = bld.vgrf(src.type);
4967      const fs_builder allbld = bld.exec_all();
4968      allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4969
4970      if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
4971         /* Exclusive scan is a bit harder because we have to do an annoying
4972          * shift of the contents before we can begin.  To make things worse,
4973          * we can't do this with a normal stride; we have to use indirects.
4974          */
4975         fs_reg shifted = bld.vgrf(src.type);
4976         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4977         allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4978                         brw_imm_w(-1));
4979         allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
4980         allbld.group(1, 0).MOV(component(shifted, 0), identity);
4981         scan = shifted;
4982      }
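      /* For the exclusive case, channel i of scan now holds channel i-1's
       * value and channel 0 holds the identity, so the inclusive scan below
       * produces the exclusive-scan result.
       */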
4983
4984      bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
4985
4986      bld.MOV(retype(dest, src.type), scan);
4987      break;
4988   }
4989
4990   case nir_intrinsic_begin_invocation_interlock: {
4991      const fs_builder ubld = bld.group(8, 0);
4992      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4993
4994      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0))
4995         ->size_written = 2 * REG_SIZE;
4996      break;
4997   }
4998
4999   case nir_intrinsic_end_invocation_interlock: {
5000      /* For endInvocationInterlock(), we need to insert a memory fence which
5001       * stalls in the shader until the memory transactions prior to that
5002       * fence are complete.  This ensures that the shader does not end before
5003       * any writes from its critical section have landed.  Otherwise, you can
5004       * end up with a case where the next invocation on that pixel properly
5005       * stalls for the previous FS invocation on its pixel to complete but
5006       * doesn't actually wait for the dataport memory transactions from that
5007       * thread to land before submitting its own.
5008       */
5009      const fs_builder ubld = bld.group(8, 0);
5010      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
5011      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
5012                brw_vec8_grf(0, 0), brw_imm_ud(1))
5013         ->size_written = 2 * REG_SIZE;
5014      break;
5015   }
5016
5017   default:
5018      unreachable("unknown intrinsic");
5019   }
5020}
5021
5022void
5023fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
5024                                 int op, nir_intrinsic_instr *instr)
5025{
5026   if (stage == MESA_SHADER_FRAGMENT)
5027      brw_wm_prog_data(prog_data)->has_side_effects = true;
5028
5029   /* The BTI untyped atomic messages only support 32-bit atomics.  If you
5030    * look at the big table of messages in Vol 7 of the SKL PRM, 64-bit ones
5031    * appear to exist.  However, if you look at Vol 2a, there are no message
5032    * descriptors provided for QWord atomic ops except for A64 messages.
5033    */
5034   assert(nir_dest_bit_size(instr->dest) == 32);
5035
5036   fs_reg dest;
5037   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5038      dest = get_nir_dest(instr->dest);
5039
5040   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5041   srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
5042   srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
5043   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5044   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5045
5046   fs_reg data;
5047   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5048      data = get_nir_src(instr->src[2]);
5049
5050   if (op == BRW_AOP_CMPWR) {
5051      fs_reg tmp = bld.vgrf(data.type, 2);
5052      fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
5053      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5054      data = tmp;
5055   }
5056   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5057
5058   /* Emit the actual atomic operation */
5059
5060   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
5061            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5062}
5063
5064void
5065fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
5066                                       int op, nir_intrinsic_instr *instr)
5067{
5068   if (stage == MESA_SHADER_FRAGMENT)
5069      brw_wm_prog_data(prog_data)->has_side_effects = true;
5070
5071   fs_reg dest;
5072   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5073      dest = get_nir_dest(instr->dest);
5074
5075   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5076   srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
5077   srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
5078   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5079   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5080
5081   fs_reg data = get_nir_src(instr->src[2]);
5082   if (op == BRW_AOP_FCMPWR) {
5083      fs_reg tmp = bld.vgrf(data.type, 2);
5084      fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
5085      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5086      data = tmp;
5087   }
5088   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5089
5090   /* Emit the actual atomic operation */
5091
5092   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
5093            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5094}
5095
5096void
5097fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
5098                                   int op, nir_intrinsic_instr *instr)
5099{
5100   fs_reg dest;
5101   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5102      dest = get_nir_dest(instr->dest);
5103
5104   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5105   srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
5106   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5107   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5108
5109   fs_reg data;
5110   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5111      data = get_nir_src(instr->src[1]);
5112   if (op == BRW_AOP_CMPWR) {
5113      fs_reg tmp = bld.vgrf(data.type, 2);
5114      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5115      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5116      data = tmp;
5117   }
5118   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5119
5120   /* Get the offset */
5121   if (nir_src_is_const(instr->src[0])) {
5122      srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5123         brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
5124   } else {
5125      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
5126      bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5127	      retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
5128	      brw_imm_ud(instr->const_index[0]));
5129   }
5130
5131   /* Emit the actual atomic operation */
5132
5133   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
5134            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5135}
5136
5137void
5138fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
5139                                         int op, nir_intrinsic_instr *instr)
5140{
5141   fs_reg dest;
5142   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5143      dest = get_nir_dest(instr->dest);
5144
5145   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5146   srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
5147   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5148   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5149
5150   fs_reg data = get_nir_src(instr->src[1]);
5151   if (op == BRW_AOP_FCMPWR) {
5152      fs_reg tmp = bld.vgrf(data.type, 2);
5153      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5154      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5155      data = tmp;
5156   }
5157   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5158
5159   /* Get the offset */
5160   if (nir_src_is_const(instr->src[0])) {
5161      srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5162         brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
5163   } else {
5164      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
5165      bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5166	      retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
5167	      brw_imm_ud(instr->const_index[0]));
5168   }
5169
5170   /* Emit the actual atomic operation */
5171
5172   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
5173            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5174}
5175
5176void
5177fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
5178                                   int op, nir_intrinsic_instr *instr)
5179{
5180   if (stage == MESA_SHADER_FRAGMENT)
5181      brw_wm_prog_data(prog_data)->has_side_effects = true;
5182
5183   fs_reg dest;
5184   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5185      dest = get_nir_dest(instr->dest);
5186
5187   fs_reg addr = get_nir_src(instr->src[0]);
5188
5189   fs_reg data;
5190   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5191      data = get_nir_src(instr->src[1]);
5192
5193   if (op == BRW_AOP_CMPWR) {
5194      fs_reg tmp = bld.vgrf(data.type, 2);
5195      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5196      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5197      data = tmp;
5198   }
5199
5200   if (nir_dest_bit_size(instr->dest) == 64) {
5201      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL,
5202               dest, addr, data, brw_imm_ud(op));
5203   } else {
5204      assert(nir_dest_bit_size(instr->dest) == 32);
5205      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
5206               dest, addr, data, brw_imm_ud(op));
5207   }
5208}
5209
5210void
5211fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
5212                                         int op, nir_intrinsic_instr *instr)
5213{
5214   if (stage == MESA_SHADER_FRAGMENT)
5215      brw_wm_prog_data(prog_data)->has_side_effects = true;
5216
5217   assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
5218   fs_reg dest = get_nir_dest(instr->dest);
5219
5220   fs_reg addr = get_nir_src(instr->src[0]);
5221
5222   assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC);
5223   fs_reg data = get_nir_src(instr->src[1]);
5224
5225   if (op == BRW_AOP_FCMPWR) {
5226      fs_reg tmp = bld.vgrf(data.type, 2);
5227      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5228      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5229      data = tmp;
5230   }
5231
5232   bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL,
5233            dest, addr, data, brw_imm_ud(op));
5234}
5235
5236void
5237fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
5238{
5239   unsigned texture = instr->texture_index;
5240   unsigned sampler = instr->sampler_index;
5241
5242   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
5243
5244   srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
5245   srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
5246
5247   int lod_components = 0;
5248
5249   /* The hardware requires a LOD for buffer textures */
5250   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
5251      srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
5252
5253   uint32_t header_bits = 0;
5254   for (unsigned i = 0; i < instr->num_srcs; i++) {
5255      fs_reg src = get_nir_src(instr->src[i].src);
5256      switch (instr->src[i].src_type) {
5257      case nir_tex_src_bias:
5258         srcs[TEX_LOGICAL_SRC_LOD] =
5259            retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5260         break;
5261      case nir_tex_src_comparator:
5262         srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
5263         break;
5264      case nir_tex_src_coord:
5265         switch (instr->op) {
5266         case nir_texop_txf:
5267         case nir_texop_txf_ms:
5268         case nir_texop_txf_ms_mcs:
5269         case nir_texop_samples_identical:
5270            srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
5271            break;
5272         default:
5273            srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
5274            break;
5275         }
5276         break;
5277      case nir_tex_src_ddx:
5278         srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
5279         lod_components = nir_tex_instr_src_size(instr, i);
5280         break;
5281      case nir_tex_src_ddy:
5282         srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
5283         break;
5284      case nir_tex_src_lod:
5285         switch (instr->op) {
5286         case nir_texop_txs:
5287            srcs[TEX_LOGICAL_SRC_LOD] =
5288               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
5289            break;
5290         case nir_texop_txf:
5291            srcs[TEX_LOGICAL_SRC_LOD] =
5292               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
5293            break;
5294         default:
5295            srcs[TEX_LOGICAL_SRC_LOD] =
5296               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5297            break;
5298         }
5299         break;
5300      case nir_tex_src_min_lod:
5301         srcs[TEX_LOGICAL_SRC_MIN_LOD] =
5302            retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
5303         break;
5304      case nir_tex_src_ms_index:
5305         srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
5306         break;
5307
5308      case nir_tex_src_offset: {
5309         uint32_t offset_bits = 0;
5310         if (brw_texture_offset(instr, i, &offset_bits)) {
5311            header_bits |= offset_bits;
5312         } else {
5313            srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
5314               retype(src, BRW_REGISTER_TYPE_D);
5315         }
5316         break;
5317      }
5318
5319      case nir_tex_src_projector:
5320         unreachable("should be lowered");
5321
5322      case nir_tex_src_texture_offset: {
5323         /* Emit code to evaluate the actual indexing expression */
5324         fs_reg tmp = vgrf(glsl_type::uint_type);
5325         bld.ADD(tmp, src, brw_imm_ud(texture));
5326         srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
5327         break;
5328      }
5329
5330      case nir_tex_src_sampler_offset: {
5331         /* Emit code to evaluate the actual indexing expression */
5332         fs_reg tmp = vgrf(glsl_type::uint_type);
5333         bld.ADD(tmp, src, brw_imm_ud(sampler));
5334         srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
5335         break;
5336      }
5337
5338      case nir_tex_src_texture_handle:
5339         assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
5340         srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();
5341         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
5342         break;
5343
5344      case nir_tex_src_sampler_handle:
5345         assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
5346         srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();
5347         srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
5348         break;
5349
5350      case nir_tex_src_ms_mcs:
5351         assert(instr->op == nir_texop_txf_ms);
5352         srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
5353         break;
5354
5355      case nir_tex_src_plane: {
5356         const uint32_t plane = nir_src_as_uint(instr->src[i].src);
5357         const uint32_t texture_index =
5358            instr->texture_index +
5359            stage_prog_data->binding_table.plane_start[plane] -
5360            stage_prog_data->binding_table.texture_start;
5361
5362         srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
5363         break;
5364      }
5365
5366      default:
5367         unreachable("unknown texture source");
5368      }
5369   }
5370
5371   if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
5372       (instr->op == nir_texop_txf_ms ||
5373        instr->op == nir_texop_samples_identical)) {
5374      if (devinfo->gen >= 7 &&
5375          key_tex->compressed_multisample_layout_mask & (1 << texture)) {
5376         srcs[TEX_LOGICAL_SRC_MCS] =
5377            emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
5378                           instr->coord_components,
5379                           srcs[TEX_LOGICAL_SRC_SURFACE],
5380                           srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
5381      } else {
5382         srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
5383      }
5384   }
5385
5386   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
5387   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
5388
5389   enum opcode opcode;
5390   switch (instr->op) {
5391   case nir_texop_tex:
5392      opcode = SHADER_OPCODE_TEX_LOGICAL;
5393      break;
5394   case nir_texop_txb:
5395      opcode = FS_OPCODE_TXB_LOGICAL;
5396      break;
5397   case nir_texop_txl:
5398      opcode = SHADER_OPCODE_TXL_LOGICAL;
5399      break;
5400   case nir_texop_txd:
5401      opcode = SHADER_OPCODE_TXD_LOGICAL;
5402      break;
5403   case nir_texop_txf:
5404      opcode = SHADER_OPCODE_TXF_LOGICAL;
5405      break;
5406   case nir_texop_txf_ms:
5407      if ((key_tex->msaa_16 & (1 << sampler)))
5408         opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
5409      else
5410         opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
5411      break;
5412   case nir_texop_txf_ms_mcs:
5413      opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
5414      break;
5415   case nir_texop_query_levels:
5416   case nir_texop_txs:
5417      opcode = SHADER_OPCODE_TXS_LOGICAL;
5418      break;
5419   case nir_texop_lod:
5420      opcode = SHADER_OPCODE_LOD_LOGICAL;
5421      break;
5422   case nir_texop_tg4:
5423      if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
5424         opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
5425      else
5426         opcode = SHADER_OPCODE_TG4_LOGICAL;
5427      break;
5428   case nir_texop_texture_samples:
5429      opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
5430      break;
5431   case nir_texop_samples_identical: {
5432      fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
5433
5434      /* If mcs is an immediate value, it means there is no MCS.  In that case
5435       * just return false.
5436       */
5437      if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
5438         bld.MOV(dst, brw_imm_ud(0u));
5439      } else if ((key_tex->msaa_16 & (1 << sampler))) {
5440         fs_reg tmp = vgrf(glsl_type::uint_type);
5441         bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
5442                offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
5443         bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
5444      } else {
5445         bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
5446                 BRW_CONDITIONAL_EQ);
5447      }
5448      return;
5449   }
5450   default:
5451      unreachable("unknown texture opcode");
5452   }
5453
5454   if (instr->op == nir_texop_tg4) {
5455      if (instr->component == 1 &&
5456          key_tex->gather_channel_quirk_mask & (1 << texture)) {
5457         /* gather4 sampler is broken for green channel on RG32F --
5458          * we must ask for blue instead.
5459          */
5460         header_bits |= 2 << 16;
5461      } else {
5462         header_bits |= instr->component << 16;
5463      }
5464   }
5465
5466   fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
5467   fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
5468   inst->offset = header_bits;
5469
5470   const unsigned dest_size = nir_tex_instr_dest_size(instr);
5471   if (devinfo->gen >= 9 &&
5472       instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
5473      unsigned write_mask = instr->dest.is_ssa ?
5474                            nir_ssa_def_components_read(&instr->dest.ssa):
5475                            (1 << dest_size) - 1;
5476      assert(write_mask != 0); /* dead code should have been eliminated */
5477      inst->size_written = util_last_bit(write_mask) *
5478                           inst->dst.component_size(inst->exec_size);
5479   } else {
5480      inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
5481   }
5482
5483   if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
5484      inst->shadow_compare = true;
5485
5486   if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
5487      emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
5488
5489   fs_reg nir_dest[4];
5490   for (unsigned i = 0; i < dest_size; i++)
5491      nir_dest[i] = offset(dst, bld, i);
5492
5493   if (instr->op == nir_texop_query_levels) {
5494      /* # levels is in .w */
5495      nir_dest[0] = offset(dst, bld, 3);
5496   } else if (instr->op == nir_texop_txs &&
5497              dest_size >= 3 && devinfo->gen < 7) {
5498      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
5499      fs_reg depth = offset(dst, bld, 2);
5500      nir_dest[2] = vgrf(glsl_type::int_type);
5501      bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
5502   }
5503
5504   bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
5505}
5506
5507void
5508fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
5509{
5510   switch (instr->type) {
5511   case nir_jump_break:
5512      bld.emit(BRW_OPCODE_BREAK);
5513      break;
5514   case nir_jump_continue:
5515      bld.emit(BRW_OPCODE_CONTINUE);
5516      break;
5517   case nir_jump_return:
5518   default:
5519      unreachable("unknown jump");
5520   }
5521}
5522
5523/*
5524 * This helper takes a source register and un/shuffles it into the destination
5525 * register.
5526 *
5527 * If source type size is smaller than destination type size the operation
5528 * needed is a component shuffle. The opposite case would be an unshuffle. If
5529 * source/destination type size is equal a shuffle is done that would be
5530 * equivalent to a simple MOV.
5531 *
5532 * For example, if the source is a 16-bit type and the destination is 32-bit,
5533 * a 3-component .xyz 16-bit vector in SIMD8 would be:
5534 *
5535 *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
5536 *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
5537 *
5538 * This helper will return the following 2 32-bit components with the 16-bit
5539 * values shuffled:
5540 *
5541 *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
5542 *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
5543 *
5544 * For unshuffle, the example would be the opposite, a 64-bit type source
5545 * and a 32-bit destination. A 2-component .xy 64-bit vector in SIMD8
5546 * would be:
5547 *
5548 *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
5549 *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
5550 *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
5551 *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
5552 *
5553 * The returned result would be the following 4 32-bit components unshuffled:
5554 *
5555 *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
5556 *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
5557 *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
5558 *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
5559 *
5560 * - Source and destination register must not be overlapped.
5561 * - component units are measured in terms of the smaller type between
5562 *   source and destination because we are un/shuffling the smaller
5563 *   components from/into the bigger ones.
5564 * - first_component parameter allows skipping source components.
5565 */
5566void
5567shuffle_src_to_dst(const fs_builder &bld,
5568                   const fs_reg &dst,
5569                   const fs_reg &src,
5570                   uint32_t first_component,
5571                   uint32_t components)
5572{
5573   if (type_sz(src.type) == type_sz(dst.type)) {
5574      assert(!regions_overlap(dst,
5575         type_sz(dst.type) * bld.dispatch_width() * components,
5576         offset(src, bld, first_component),
5577         type_sz(src.type) * bld.dispatch_width() * components));
5578      for (unsigned i = 0; i < components; i++) {
5579         bld.MOV(retype(offset(dst, bld, i), src.type),
5580                 offset(src, bld, i + first_component));
5581      }
5582   } else if (type_sz(src.type) < type_sz(dst.type)) {
5583      /* Source is shuffled into destination */
5584      unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
5585      assert(!regions_overlap(dst,
5586         type_sz(dst.type) * bld.dispatch_width() *
5587         DIV_ROUND_UP(components, size_ratio),
5588         offset(src, bld, first_component),
5589         type_sz(src.type) * bld.dispatch_width() * components));
5590
5591      brw_reg_type shuffle_type =
5592         brw_reg_type_from_bit_size(8 * type_sz(src.type),
5593                                    BRW_REGISTER_TYPE_D);
5594      for (unsigned i = 0; i < components; i++) {
5595         fs_reg shuffle_component_i =
5596            subscript(offset(dst, bld, i / size_ratio),
5597                      shuffle_type, i % size_ratio);
5598         bld.MOV(shuffle_component_i,
5599                 retype(offset(src, bld, i + first_component), shuffle_type));
5600      }
5601   } else {
5602      /* Source is unshuffled into destination */
5603      unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
5604      assert(!regions_overlap(dst,
5605         type_sz(dst.type) * bld.dispatch_width() * components,
5606         offset(src, bld, first_component / size_ratio),
5607         type_sz(src.type) * bld.dispatch_width() *
5608         DIV_ROUND_UP(components + (first_component % size_ratio),
5609                      size_ratio)));
5610
5611      brw_reg_type shuffle_type =
5612         brw_reg_type_from_bit_size(8 * type_sz(dst.type),
5613                                    BRW_REGISTER_TYPE_D);
5614      for (unsigned i = 0; i < components; i++) {
5615         fs_reg shuffle_component_i =
5616            subscript(offset(src, bld, (first_component + i) / size_ratio),
5617                      shuffle_type, (first_component + i) % size_ratio);
5618         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
5619                 shuffle_component_i);
5620      }
5621   }
5622}
5623
5624void
5625shuffle_from_32bit_read(const fs_builder &bld,
5626                        const fs_reg &dst,
5627                        const fs_reg &src,
5628                        uint32_t first_component,
5629                        uint32_t components)
5630{
5631   assert(type_sz(src.type) == 4);
5632
5633   /* This function takes components in units of the destination type while
5634    * shuffle_src_to_dst takes components in units of the smallest type
5635    */
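   /* For example, with a 64-bit destination each destination component is
    * assembled from two consecutive 32-bit source components, so
    * first_component and components are doubled below before delegating to
    * shuffle_src_to_dst().
    */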
5636   if (type_sz(dst.type) > 4) {
5637      assert(type_sz(dst.type) == 8);
5638      first_component *= 2;
5639      components *= 2;
5640   }
5641
5642   shuffle_src_to_dst(bld, dst, src, first_component, components);
5643}
5644
5645fs_reg
5646shuffle_for_32bit_write(const fs_builder &bld,
5647                        const fs_reg &src,
5648                        uint32_t first_component,
5649                        uint32_t components)
5650{
5651   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
5652                         DIV_ROUND_UP (components * type_sz(src.type), 4));
5653   /* This function takes components in units of the source type while
5654    * shuffle_src_to_dst takes components in units of the smallest type
5655    */
5656   if (type_sz(src.type) > 4) {
5657      assert(type_sz(src.type) == 8);
5658      first_component *= 2;
5659      components *= 2;
5660   }
5661
5662   shuffle_src_to_dst(bld, dst, src, first_component, components);
5663
5664   return dst;
5665}
5666
5667fs_reg
5668setup_imm_df(const fs_builder &bld, double v)
5669{
5670   const struct gen_device_info *devinfo = bld.shader->devinfo;
5671   assert(devinfo->gen >= 7);
5672
5673   if (devinfo->gen >= 8)
5674      return brw_imm_df(v);
5675
5676   /* gen7.5 does not support DF immediates directly, but the DIM
5677    * instruction allows setting a 64-bit immediate value.
5678    */
5679   if (devinfo->is_haswell) {
5680      const fs_builder ubld = bld.exec_all().group(1, 0);
5681      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
5682      ubld.DIM(dst, brw_imm_df(v));
5683      return component(dst, 0);
5684   }
5685
5686   /* gen7 does not support DF immediates, so we generate a 64-bit constant by
5687    * writing the low 32 bits of the constant to suboffset 0 of a VGRF and
5688    * the high 32 bits to suboffset 4 and then applying a stride of 0.
5689    *
5690    * Alternatively, we could also produce a normal VGRF (without stride 0)
5691    * by writing to all the channels in the VGRF, however, that would hit the
5692    * gen7 bug where we have to split writes that span more than 1 register
5693    * into instructions with a width of 4 (otherwise the write to the second
5694    * register written runs into an execmask hardware bug) which isn't very
5695    * nice.
5696    */
5697   union {
5698      double d;
5699      struct {
5700         uint32_t i1;
5701         uint32_t i2;
5702      };
5703   } di;
5704
5705   di.d = v;
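   /* For example, v == 1.0 (IEEE-754 0x3ff0000000000000) gives i1 == 0x0 and
    * i2 == 0x3ff00000 on a little-endian host, which land in suboffsets 0 and
    * 4 of the VGRF below.
    */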
5706
5707   const fs_builder ubld = bld.exec_all().group(1, 0);
5708   const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
5709   ubld.MOV(tmp, brw_imm_ud(di.i1));
5710   ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
5711
5712   return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
5713}
5714
5715fs_reg
5716setup_imm_b(const fs_builder &bld, int8_t v)
5717{
5718   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
5719   bld.MOV(tmp, brw_imm_w(v));
5720   return tmp;
5721}
5722
5723fs_reg
5724setup_imm_ub(const fs_builder &bld, uint8_t v)
5725{
5726   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
5727   bld.MOV(tmp, brw_imm_uw(v));
5728   return tmp;
5729}
5730