brw_fs_nir.cpp revision 7ec681f3
1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "compiler/glsl/ir.h"
25#include "brw_fs.h"
26#include "brw_nir.h"
27#include "brw_rt.h"
28#include "brw_eu.h"
29#include "nir_search_helpers.h"
30#include "util/u_math.h"
31#include "util/bitscan.h"
32
33using namespace brw;
34
35void
36fs_visitor::emit_nir_code()
37{
38   emit_shader_float_controls_execution_mode();
39
40   /* emit the arrays used for inputs and outputs - load/store intrinsics will
41    * be converted to reads/writes of these arrays
42    */
43   nir_setup_outputs();
44   nir_setup_uniforms();
45   nir_emit_system_values();
46   last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width;
47
48   nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
49
50   bld.emit(SHADER_OPCODE_HALT_TARGET);
51}
52
53void
54fs_visitor::nir_setup_outputs()
55{
56   if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
57      return;
58
59   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };
60
61   /* Calculate the size of output registers in a separate pass, before
62    * allocating them.  With ARB_enhanced_layouts, multiple output variables
63    * may occupy the same slot, but have different type sizes.
64    */
65   nir_foreach_shader_out_variable(var, nir) {
66      const int loc = var->data.driver_location;
67      const unsigned var_vec4s =
68         var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
69                           : type_size_vec4(var->type, true);
70      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
71   }
72
73   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
74      if (vec4s[loc] == 0) {
75         loc++;
76         continue;
77      }
78
79      unsigned reg_size = vec4s[loc];
80
81      /* Check if there are any ranges that start within this range and extend
82       * past it. If so, include them in this allocation.
83       */
84      for (unsigned i = 1; i < reg_size; i++) {
85         assert(i + loc < ARRAY_SIZE(vec4s));
86         reg_size = MAX2(vec4s[i + loc] + i, reg_size);
87      }
88
89      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
90      for (unsigned i = 0; i < reg_size; i++) {
91         assert(loc + i < ARRAY_SIZE(outputs));
92         outputs[loc + i] = offset(reg, bld, 4 * i);
93      }
94
95      loc += reg_size;
96   }
97}
98
99void
100fs_visitor::nir_setup_uniforms()
101{
102   /* Only the first compile gets to set up uniforms. */
103   if (push_constant_loc) {
104      assert(pull_constant_loc);
105      return;
106   }
107
108   uniforms = nir->num_uniforms / 4;
109
110   if ((stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) &&
111       devinfo->verx10 < 125) {
112      /* Add uniforms for builtins after regular NIR uniforms. */
113      assert(uniforms == prog_data->nr_params);
114
115      uint32_t *param;
116      if (nir->info.workgroup_size_variable &&
117          compiler->lower_variable_group_size) {
118         param = brw_stage_prog_data_add_params(prog_data, 3);
119         for (unsigned i = 0; i < 3; i++) {
120            param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
121            group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
122         }
123      }
124
125      /* Subgroup ID must be the last uniform on the list.  This will make
126       * easier later to split between cross thread and per thread
127       * uniforms.
128       */
129      param = brw_stage_prog_data_add_params(prog_data, 1);
130      *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
131      subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
132   }
133}
134
135static bool
136emit_system_values_block(nir_block *block, fs_visitor *v)
137{
138   fs_reg *reg;
139
140   nir_foreach_instr(instr, block) {
141      if (instr->type != nir_instr_type_intrinsic)
142         continue;
143
144      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
145      switch (intrin->intrinsic) {
146      case nir_intrinsic_load_vertex_id:
147      case nir_intrinsic_load_base_vertex:
148         unreachable("should be lowered by nir_lower_system_values().");
149
150      case nir_intrinsic_load_vertex_id_zero_base:
151      case nir_intrinsic_load_is_indexed_draw:
152      case nir_intrinsic_load_first_vertex:
153      case nir_intrinsic_load_instance_id:
154      case nir_intrinsic_load_base_instance:
155      case nir_intrinsic_load_draw_id:
156         unreachable("should be lowered by brw_nir_lower_vs_inputs().");
157
158      case nir_intrinsic_load_invocation_id:
159         if (v->stage == MESA_SHADER_TESS_CTRL)
160            break;
161         assert(v->stage == MESA_SHADER_GEOMETRY);
162         reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
163         if (reg->file == BAD_FILE) {
164            const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
165            fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
166            fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
167            abld.SHR(iid, g1, brw_imm_ud(27u));
168            *reg = iid;
169         }
170         break;
171
172      case nir_intrinsic_load_sample_pos:
173         assert(v->stage == MESA_SHADER_FRAGMENT);
174         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
175         if (reg->file == BAD_FILE)
176            *reg = *v->emit_samplepos_setup();
177         break;
178
179      case nir_intrinsic_load_sample_id:
180         assert(v->stage == MESA_SHADER_FRAGMENT);
181         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
182         if (reg->file == BAD_FILE)
183            *reg = *v->emit_sampleid_setup();
184         break;
185
186      case nir_intrinsic_load_sample_mask_in:
187         assert(v->stage == MESA_SHADER_FRAGMENT);
188         assert(v->devinfo->ver >= 7);
189         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
190         if (reg->file == BAD_FILE)
191            *reg = *v->emit_samplemaskin_setup();
192         break;
193
194      case nir_intrinsic_load_workgroup_id:
195         assert(v->stage == MESA_SHADER_COMPUTE ||
196                v->stage == MESA_SHADER_KERNEL);
197         reg = &v->nir_system_values[SYSTEM_VALUE_WORKGROUP_ID];
198         if (reg->file == BAD_FILE)
199            *reg = *v->emit_cs_work_group_id_setup();
200         break;
201
202      case nir_intrinsic_load_helper_invocation:
203         assert(v->stage == MESA_SHADER_FRAGMENT);
204         reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
205         if (reg->file == BAD_FILE) {
206            const fs_builder abld =
207               v->bld.annotate("gl_HelperInvocation", NULL);
208
209            /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
210             * pixel mask is in g1.7 of the thread payload.
211             *
212             * We move the per-channel pixel enable bit to the low bit of each
213             * channel by shifting the byte containing the pixel mask by the
214             * vector immediate 0x76543210UV.
215             *
216             * The region of <1,8,0> reads only 1 byte (the pixel masks for
217             * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
218             * masks for 2 and 3) in SIMD16.
219             */
220            fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
221
222            for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
223               const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
224               hbld.SHR(offset(shifted, hbld, i),
225                        stride(retype(brw_vec1_grf(1 + i, 7),
226                                      BRW_REGISTER_TYPE_UB),
227                               1, 8, 0),
228                        brw_imm_v(0x76543210));
229            }
230
231            /* A set bit in the pixel mask means the channel is enabled, but
232             * that is the opposite of gl_HelperInvocation so we need to invert
233             * the mask.
234             *
235             * The negate source-modifier bit of logical instructions on Gfx8+
236             * performs 1's complement negation, so we can use that instead of
237             * a NOT instruction.
238             */
239            fs_reg inverted = negate(shifted);
240            if (v->devinfo->ver < 8) {
241               inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
242               abld.NOT(inverted, shifted);
243            }
244
245            /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
246             * with 1 and negating.
247             */
248            fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
249            abld.AND(anded, inverted, brw_imm_uw(1));
250
251            fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
252            abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
253            *reg = dst;
254         }
255         break;
256
257      case nir_intrinsic_load_frag_shading_rate:
258         reg = &v->nir_system_values[SYSTEM_VALUE_FRAG_SHADING_RATE];
259         if (reg->file == BAD_FILE)
260            *reg = *v->emit_shading_rate_setup();
261         break;
262
263      default:
264         break;
265      }
266   }
267
268   return true;
269}
270
271void
272fs_visitor::nir_emit_system_values()
273{
274   nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
275   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
276      nir_system_values[i] = fs_reg();
277   }
278
279   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
280    * never end up using it.
281    */
282   {
283      const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
284      fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
285      reg = abld.vgrf(BRW_REGISTER_TYPE_UW);
286
287      const fs_builder allbld8 = abld.group(8, 0).exec_all();
288      allbld8.MOV(reg, brw_imm_v(0x76543210));
289      if (dispatch_width > 8)
290         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
291      if (dispatch_width > 16) {
292         const fs_builder allbld16 = abld.group(16, 0).exec_all();
293         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
294      }
295   }
296
297   nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
298   nir_foreach_block(block, impl)
299      emit_system_values_block(block, this);
300}
301
302void
303fs_visitor::nir_emit_impl(nir_function_impl *impl)
304{
305   nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
306   for (unsigned i = 0; i < impl->reg_alloc; i++) {
307      nir_locals[i] = fs_reg();
308   }
309
310   foreach_list_typed(nir_register, reg, node, &impl->registers) {
311      unsigned array_elems =
312         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
313      unsigned size = array_elems * reg->num_components;
314      const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B :
315         brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
316      nir_locals[reg->index] = bld.vgrf(reg_type, size);
317   }
318
319   nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
320                             impl->ssa_alloc);
321
322   nir_emit_cf_list(&impl->body);
323}
324
325void
326fs_visitor::nir_emit_cf_list(exec_list *list)
327{
328   exec_list_validate(list);
329   foreach_list_typed(nir_cf_node, node, node, list) {
330      switch (node->type) {
331      case nir_cf_node_if:
332         nir_emit_if(nir_cf_node_as_if(node));
333         break;
334
335      case nir_cf_node_loop:
336         nir_emit_loop(nir_cf_node_as_loop(node));
337         break;
338
339      case nir_cf_node_block:
340         nir_emit_block(nir_cf_node_as_block(node));
341         break;
342
343      default:
344         unreachable("Invalid CFG node block");
345      }
346   }
347}
348
349void
350fs_visitor::nir_emit_if(nir_if *if_stmt)
351{
352   bool invert;
353   fs_reg cond_reg;
354
355   /* If the condition has the form !other_condition, use other_condition as
356    * the source, but invert the predicate on the if instruction.
357    */
358   nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
359   if (cond != NULL && cond->op == nir_op_inot) {
360      invert = true;
361      cond_reg = get_nir_src(cond->src[0].src);
362      cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
363   } else {
364      invert = false;
365      cond_reg = get_nir_src(if_stmt->condition);
366   }
367
368   /* first, put the condition into f0 */
369   fs_inst *inst = bld.MOV(bld.null_reg_d(),
370                           retype(cond_reg, BRW_REGISTER_TYPE_D));
371   inst->conditional_mod = BRW_CONDITIONAL_NZ;
372
373   bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;
374
375   nir_emit_cf_list(&if_stmt->then_list);
376
377   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
378      bld.emit(BRW_OPCODE_ELSE);
379      nir_emit_cf_list(&if_stmt->else_list);
380   }
381
382   bld.emit(BRW_OPCODE_ENDIF);
383
384   if (devinfo->ver < 7)
385      limit_dispatch_width(16, "Non-uniform control flow unsupported "
386                           "in SIMD32 mode.");
387}
388
389void
390fs_visitor::nir_emit_loop(nir_loop *loop)
391{
392   bld.emit(BRW_OPCODE_DO);
393
394   nir_emit_cf_list(&loop->body);
395
396   bld.emit(BRW_OPCODE_WHILE);
397
398   if (devinfo->ver < 7)
399      limit_dispatch_width(16, "Non-uniform control flow unsupported "
400                           "in SIMD32 mode.");
401}
402
403void
404fs_visitor::nir_emit_block(nir_block *block)
405{
406   nir_foreach_instr(instr, block) {
407      nir_emit_instr(instr);
408   }
409}
410
411void
412fs_visitor::nir_emit_instr(nir_instr *instr)
413{
414   const fs_builder abld = bld.annotate(NULL, instr);
415
416   switch (instr->type) {
417   case nir_instr_type_alu:
418      nir_emit_alu(abld, nir_instr_as_alu(instr), true);
419      break;
420
421   case nir_instr_type_deref:
422      unreachable("All derefs should've been lowered");
423      break;
424
425   case nir_instr_type_intrinsic:
426      switch (stage) {
427      case MESA_SHADER_VERTEX:
428         nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
429         break;
430      case MESA_SHADER_TESS_CTRL:
431         nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
432         break;
433      case MESA_SHADER_TESS_EVAL:
434         nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
435         break;
436      case MESA_SHADER_GEOMETRY:
437         nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
438         break;
439      case MESA_SHADER_FRAGMENT:
440         nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
441         break;
442      case MESA_SHADER_COMPUTE:
443      case MESA_SHADER_KERNEL:
444         nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
445         break;
446      case MESA_SHADER_RAYGEN:
447      case MESA_SHADER_ANY_HIT:
448      case MESA_SHADER_CLOSEST_HIT:
449      case MESA_SHADER_MISS:
450      case MESA_SHADER_INTERSECTION:
451      case MESA_SHADER_CALLABLE:
452         nir_emit_bs_intrinsic(abld, nir_instr_as_intrinsic(instr));
453         break;
454      default:
455         unreachable("unsupported shader stage");
456      }
457      break;
458
459   case nir_instr_type_tex:
460      nir_emit_texture(abld, nir_instr_as_tex(instr));
461      break;
462
463   case nir_instr_type_load_const:
464      nir_emit_load_const(abld, nir_instr_as_load_const(instr));
465      break;
466
467   case nir_instr_type_ssa_undef:
468      /* We create a new VGRF for undefs on every use (by handling
469       * them in get_nir_src()), rather than for each definition.
470       * This helps register coalescing eliminate MOVs from undef.
471       */
472      break;
473
474   case nir_instr_type_jump:
475      nir_emit_jump(abld, nir_instr_as_jump(instr));
476      break;
477
478   default:
479      unreachable("unknown instruction type");
480   }
481}
482
483/**
484 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
485 * match instr.
486 */
487bool
488fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
489                                      const fs_reg &result)
490{
491   if (!instr->src[0].src.is_ssa ||
492       !instr->src[0].src.ssa->parent_instr)
493      return false;
494
495   if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
496      return false;
497
498   nir_alu_instr *src0 =
499      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
500
501   if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
502       src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
503      return false;
504
505   unsigned element = nir_src_as_uint(src0->src[1].src);
506
507   /* Element type to extract.*/
508   const brw_reg_type type = brw_int_type(
509      src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
510      src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);
511
512   fs_reg op0 = get_nir_src(src0->src[0].src);
513   op0.type = brw_type_for_nir_type(devinfo,
514      (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
515                     nir_src_bit_size(src0->src[0].src)));
516   op0 = offset(op0, bld, src0->src[0].swizzle[0]);
517
518   bld.MOV(result, subscript(op0, type, element));
519   return true;
520}
521
522bool
523fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
524                                         const fs_reg &result)
525{
526   nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
527   if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
528      return false;
529
530   if (!nir_src_is_const(instr->src[1].src) ||
531       !nir_src_is_const(instr->src[2].src))
532      return false;
533
534   const float value1 = nir_src_as_float(instr->src[1].src);
535   const float value2 = nir_src_as_float(instr->src[2].src);
536   if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
537      return false;
538
539   /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
540   assert(value1 == -value2);
541
542   fs_reg tmp = vgrf(glsl_type::int_type);
543
544   if (devinfo->ver >= 12) {
545      /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
546      fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
547
548      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
549       *
550       *    or(8)  tmp.1<2>W  g1.1<0,1,0>W  0x00003f80W
551       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
552       *
553       * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
554       */
555      if (value1 == -1.0f)
556         g1.negate = true;
557
558      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
559             g1, brw_imm_uw(0x3f80));
560   } else if (devinfo->ver >= 6) {
561      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
562      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
563
564      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
565       *
566       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
567       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
568       *
569       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
570       *
571       * This negation looks like it's safe in practice, because bits 0:4 will
572       * surely be TRIANGLES
573       */
574
575      if (value1 == -1.0f) {
576         g0.negate = true;
577      }
578
579      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
580             g0, brw_imm_uw(0x3f80));
581   } else {
582      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
583      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
584
585      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
586       *
587       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
588       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
589       *
590       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
591       *
592       * This negation looks like it's safe in practice, because bits 0:4 will
593       * surely be TRIANGLES
594       */
595
596      if (value1 == -1.0f) {
597         g1_6.negate = true;
598      }
599
600      bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
601   }
602   bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
603
604   return true;
605}
606
607static void
608emit_find_msb_using_lzd(const fs_builder &bld,
609                        const fs_reg &result,
610                        const fs_reg &src,
611                        bool is_signed)
612{
613   fs_inst *inst;
614   fs_reg temp = src;
615
616   if (is_signed) {
617      /* LZD of an absolute value source almost always does the right
618       * thing.  There are two problem values:
619       *
620       * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
621       *   0.  However, findMSB(int(0x80000000)) == 30.
622       *
623       * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
624       *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
625       *
626       *    For a value of zero or negative one, -1 will be returned.
627       *
628       * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
629       *   findMSB(-(1<<x)) should return x-1.
630       *
631       * For all negative number cases, including 0x80000000 and
632       * 0xffffffff, the correct value is obtained from LZD if instead of
633       * negating the (already negative) value the logical-not is used.  A
634       * conditonal logical-not can be achieved in two instructions.
635       */
636      temp = bld.vgrf(BRW_REGISTER_TYPE_D);
637
638      bld.ASR(temp, src, brw_imm_d(31));
639      bld.XOR(temp, temp, src);
640   }
641
642   bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
643           retype(temp, BRW_REGISTER_TYPE_UD));
644
645   /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
646    * from the LSB side. Subtract the result from 31 to convert the MSB
647    * count into an LSB count.  If no bits are set, LZD will return 32.
648    * 31-32 = -1, which is exactly what findMSB() is supposed to return.
649    */
650   inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
651   inst->src[0].negate = true;
652}
653
654static brw_rnd_mode
655brw_rnd_mode_from_nir_op (const nir_op op) {
656   switch (op) {
657   case nir_op_f2f16_rtz:
658      return BRW_RND_MODE_RTZ;
659   case nir_op_f2f16_rtne:
660      return BRW_RND_MODE_RTNE;
661   default:
662      unreachable("Operation doesn't support rounding mode");
663   }
664}
665
666static brw_rnd_mode
667brw_rnd_mode_from_execution_mode(unsigned execution_mode)
668{
669   if (nir_has_any_rounding_mode_rtne(execution_mode))
670      return BRW_RND_MODE_RTNE;
671   if (nir_has_any_rounding_mode_rtz(execution_mode))
672      return BRW_RND_MODE_RTZ;
673   return BRW_RND_MODE_UNSPECIFIED;
674}
675
676fs_reg
677fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
678                                                nir_alu_instr *instr,
679                                                fs_reg *op,
680                                                bool need_dest)
681{
682   fs_reg result =
683      need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud();
684
685   result.type = brw_type_for_nir_type(devinfo,
686      (nir_alu_type)(nir_op_infos[instr->op].output_type |
687                     nir_dest_bit_size(instr->dest.dest)));
688
689   assert(!instr->dest.saturate);
690
691   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
692      /* We don't lower to source modifiers so they should not exist. */
693      assert(!instr->src[i].abs);
694      assert(!instr->src[i].negate);
695
696      op[i] = get_nir_src(instr->src[i].src);
697      op[i].type = brw_type_for_nir_type(devinfo,
698         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
699                        nir_src_bit_size(instr->src[i].src)));
700   }
701
702   /* Move and vecN instrutions may still be vectored.  Return the raw,
703    * vectored source and destination so that fs_visitor::nir_emit_alu can
704    * handle it.  Other callers should not have to handle these kinds of
705    * instructions.
706    */
707   switch (instr->op) {
708   case nir_op_mov:
709   case nir_op_vec2:
710   case nir_op_vec3:
711   case nir_op_vec4:
712   case nir_op_vec8:
713   case nir_op_vec16:
714      return result;
715   default:
716      break;
717   }
718
719   /* At this point, we have dealt with any instruction that operates on
720    * more than a single channel.  Therefore, we can just adjust the source
721    * and destination registers for that channel and emit the instruction.
722    */
723   unsigned channel = 0;
724   if (nir_op_infos[instr->op].output_size == 0) {
725      /* Since NIR is doing the scalarizing for us, we should only ever see
726       * vectorized operations with a single channel.
727       */
728      assert(util_bitcount(instr->dest.write_mask) == 1);
729      channel = ffs(instr->dest.write_mask) - 1;
730
731      result = offset(result, bld, channel);
732   }
733
734   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
735      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
736      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
737   }
738
739   return result;
740}
741
742void
743fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
744                                 fs_reg *op)
745{
746   for (unsigned i = 0; i < 2; i++) {
747      nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
748
749      if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
750         /* The source of the inot is now the source of instr. */
751         prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);
752
753         assert(!op[i].negate);
754         op[i].negate = true;
755      } else {
756         op[i] = resolve_source_modifiers(op[i]);
757      }
758   }
759}
760
761bool
762fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
763                                  fs_reg result,
764                                  nir_alu_instr *instr)
765{
766   if (devinfo->ver < 6 || devinfo->ver >= 12)
767      return false;
768
769   nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
770
771   if (inot_instr == NULL || inot_instr->op != nir_op_inot)
772      return false;
773
774   /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
775    * of valid size-changing combinations is a bit more complex.
776    *
777    * The source restriction is just because I was lazy about generating the
778    * constant below.
779    */
780   if (nir_dest_bit_size(instr->dest.dest) != 32 ||
781       nir_src_bit_size(inot_instr->src[0].src) != 32)
782      return false;
783
784   /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
785    * this is float(1 + a).
786    */
787   fs_reg op;
788
789   prepare_alu_destination_and_sources(bld, inot_instr, &op, false);
790
791   /* Ignore the saturate modifier, if there is one.  The result of the
792    * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
793    */
794   bld.ADD(result, op, brw_imm_d(1));
795
796   return true;
797}
798
799/**
800 * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
801 *
802 * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
803 * the source of \c instr that is a \c nir_op_fsign.
804 */
805void
806fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr,
807                       fs_reg result, fs_reg *op, unsigned fsign_src)
808{
809   fs_inst *inst;
810
811   assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
812   assert(fsign_src < nir_op_infos[instr->op].num_inputs);
813
814   if (instr->op != nir_op_fsign) {
815      const nir_alu_instr *const fsign_instr =
816         nir_src_as_alu_instr(instr->src[fsign_src].src);
817
818      /* op[fsign_src] has the nominal result of the fsign, and op[1 -
819       * fsign_src] has the other multiply source.  This must be rearranged so
820       * that op[0] is the source of the fsign op[1] is the other multiply
821       * source.
822       */
823      if (fsign_src != 0)
824         op[1] = op[0];
825
826      op[0] = get_nir_src(fsign_instr->src[0].src);
827
828      const nir_alu_type t =
829         (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
830                        nir_src_bit_size(fsign_instr->src[0].src));
831
832      op[0].type = brw_type_for_nir_type(devinfo, t);
833
834      unsigned channel = 0;
835      if (nir_op_infos[instr->op].output_size == 0) {
836         /* Since NIR is doing the scalarizing for us, we should only ever see
837          * vectorized operations with a single channel.
838          */
839         assert(util_bitcount(instr->dest.write_mask) == 1);
840         channel = ffs(instr->dest.write_mask) - 1;
841      }
842
843      op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
844   }
845
846   if (type_sz(op[0].type) == 2) {
847      /* AND(val, 0x8000) gives the sign bit.
848       *
849       * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
850       */
851      fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
852      bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
853
854      op[0].type = BRW_REGISTER_TYPE_UW;
855      result.type = BRW_REGISTER_TYPE_UW;
856      bld.AND(result, op[0], brw_imm_uw(0x8000u));
857
858      if (instr->op == nir_op_fsign)
859         inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
860      else {
861         /* Use XOR here to get the result sign correct. */
862         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
863      }
864
865      inst->predicate = BRW_PREDICATE_NORMAL;
866   } else if (type_sz(op[0].type) == 4) {
867      /* AND(val, 0x80000000) gives the sign bit.
868       *
869       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
870       * zero.
871       */
872      bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
873
874      op[0].type = BRW_REGISTER_TYPE_UD;
875      result.type = BRW_REGISTER_TYPE_UD;
876      bld.AND(result, op[0], brw_imm_ud(0x80000000u));
877
878      if (instr->op == nir_op_fsign)
879         inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
880      else {
881         /* Use XOR here to get the result sign correct. */
882         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
883      }
884
885      inst->predicate = BRW_PREDICATE_NORMAL;
886   } else {
887      /* For doubles we do the same but we need to consider:
888       *
889       * - 2-src instructions can't operate with 64-bit immediates
890       * - The sign is encoded in the high 32-bit of each DF
891       * - We need to produce a DF result.
892       */
893
894      fs_reg zero = vgrf(glsl_type::double_type);
895      bld.MOV(zero, setup_imm_df(bld, 0.0));
896      bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
897
898      bld.MOV(result, zero);
899
900      fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
901      bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
902              brw_imm_ud(0x80000000u));
903
904      if (instr->op == nir_op_fsign) {
905         set_predicate(BRW_PREDICATE_NORMAL,
906                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
907      } else {
908         /* This could be done better in some cases.  If the scale is an
909          * immediate with the low 32-bits all 0, emitting a separate XOR and
910          * OR would allow an algebraic optimization to remove the OR.  There
911          * are currently zero instances of fsign(double(x))*IMM in shader-db
912          * or any test suite, so it is hard to care at this time.
913          */
914         fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
915         inst = bld.XOR(result_int64, result_int64,
916                        retype(op[1], BRW_REGISTER_TYPE_UQ));
917      }
918   }
919}
920
921/**
922 * Deteremine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
923 *
924 * Checks the operands of a \c nir_op_fmul to determine whether or not
925 * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
926 *
927 * \param instr  The multiplication instruction
928 *
929 * \param fsign_src The source of \c instr that may or may not be a
930 *                  \c nir_op_fsign
931 */
932static bool
933can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
934{
935   assert(instr->op == nir_op_fmul);
936
937   nir_alu_instr *const fsign_instr =
938      nir_src_as_alu_instr(instr->src[fsign_src].src);
939
940   /* Rules:
941    *
942    * 1. instr->src[fsign_src] must be a nir_op_fsign.
943    * 2. The nir_op_fsign can only be used by this multiplication.
944    * 3. The source that is the nir_op_fsign does not have source modifiers.
945    *    \c emit_fsign only examines the source modifiers of the source of the
946    *    \c nir_op_fsign.
947    *
948    * The nir_op_fsign must also not have the saturate modifier, but steps
949    * have already been taken (in nir_opt_algebraic) to ensure that.
950    */
951   return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
952          is_used_once(fsign_instr);
953}
954
955void
956fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
957                         bool need_dest)
958{
959   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
960   fs_inst *inst;
961   unsigned execution_mode =
962      bld.shader->nir->info.float_controls_execution_mode;
963
964   fs_reg op[NIR_MAX_VEC_COMPONENTS];
965   fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest);
966
967#ifndef NDEBUG
968   /* Everything except raw moves, some type conversions, iabs, and ineg
969    * should have 8-bit sources lowered by nir_lower_bit_size in
970    * brw_preprocess_nir or by brw_nir_lower_conversions in
971    * brw_postprocess_nir.
972    */
973   switch (instr->op) {
974   case nir_op_mov:
975   case nir_op_vec2:
976   case nir_op_vec3:
977   case nir_op_vec4:
978   case nir_op_vec8:
979   case nir_op_vec16:
980   case nir_op_i2f16:
981   case nir_op_i2f32:
982   case nir_op_i2i16:
983   case nir_op_i2i32:
984   case nir_op_u2f16:
985   case nir_op_u2f32:
986   case nir_op_u2u16:
987   case nir_op_u2u32:
988   case nir_op_iabs:
989   case nir_op_ineg:
990   case nir_op_pack_32_4x8_split:
991      break;
992
993   default:
994      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
995         assert(type_sz(op[i].type) > 1);
996      }
997   }
998#endif
999
1000   switch (instr->op) {
1001   case nir_op_mov:
1002   case nir_op_vec2:
1003   case nir_op_vec3:
1004   case nir_op_vec4:
1005   case nir_op_vec8:
1006   case nir_op_vec16: {
1007      fs_reg temp = result;
1008      bool need_extra_copy = false;
1009      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1010         if (!instr->src[i].src.is_ssa &&
1011             instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
1012            need_extra_copy = true;
1013            temp = bld.vgrf(result.type, 4);
1014            break;
1015         }
1016      }
1017
1018      for (unsigned i = 0; i < 4; i++) {
1019         if (!(instr->dest.write_mask & (1 << i)))
1020            continue;
1021
1022         if (instr->op == nir_op_mov) {
1023            bld.MOV(offset(temp, bld, i),
1024                           offset(op[0], bld, instr->src[0].swizzle[i]));
1025         } else {
1026            bld.MOV(offset(temp, bld, i),
1027                           offset(op[i], bld, instr->src[i].swizzle[0]));
1028         }
1029      }
1030
1031      /* In this case the source and destination registers were the same,
1032       * so we need to insert an extra set of moves in order to deal with
1033       * any swizzling.
1034       */
1035      if (need_extra_copy) {
1036         for (unsigned i = 0; i < 4; i++) {
1037            if (!(instr->dest.write_mask & (1 << i)))
1038               continue;
1039
1040            bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1041         }
1042      }
1043      return;
1044   }
1045
1046   case nir_op_i2f32:
1047   case nir_op_u2f32:
1048      if (optimize_extract_to_float(instr, result))
1049         return;
1050      inst = bld.MOV(result, op[0]);
1051      break;
1052
1053   case nir_op_f2f16_rtne:
1054   case nir_op_f2f16_rtz:
1055   case nir_op_f2f16: {
1056      brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED;
1057
1058      if (nir_op_f2f16 == instr->op)
1059         rnd = brw_rnd_mode_from_execution_mode(execution_mode);
1060      else
1061         rnd = brw_rnd_mode_from_nir_op(instr->op);
1062
1063      if (BRW_RND_MODE_UNSPECIFIED != rnd)
1064         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd));
1065
1066      /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
1067       * on the HW gen, it is a special hw opcode or just a MOV, and
1068       * brw_F32TO16 (at brw_eu_emit) would do the work to chose.
1069       *
1070       * But if we want to use that opcode, we need to provide support on
1071       * different optimizations and lowerings. As right now HF support is
1072       * only for gfx8+, it will be better to use directly the MOV, and use
1073       * BRW_OPCODE_F32TO16 when/if we work for HF support on gfx7.
1074       */
1075      assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1076      inst = bld.MOV(result, op[0]);
1077      break;
1078   }
1079
1080   case nir_op_b2i8:
1081   case nir_op_b2i16:
1082   case nir_op_b2i32:
1083   case nir_op_b2i64:
1084   case nir_op_b2f16:
1085   case nir_op_b2f32:
1086   case nir_op_b2f64:
1087      if (try_emit_b2fi_of_inot(bld, result, instr))
1088         break;
1089      op[0].type = BRW_REGISTER_TYPE_D;
1090      op[0].negate = !op[0].negate;
1091      FALLTHROUGH;
1092   case nir_op_i2f64:
1093   case nir_op_i2i64:
1094   case nir_op_u2f64:
1095   case nir_op_u2u64:
1096   case nir_op_f2f64:
1097   case nir_op_f2i64:
1098   case nir_op_f2u64:
1099   case nir_op_i2i32:
1100   case nir_op_u2u32:
1101   case nir_op_f2i32:
1102   case nir_op_f2u32:
1103   case nir_op_i2f16:
1104   case nir_op_u2f16:
1105   case nir_op_f2i16:
1106   case nir_op_f2u16:
1107   case nir_op_f2i8:
1108   case nir_op_f2u8:
1109      if (result.type == BRW_REGISTER_TYPE_B ||
1110          result.type == BRW_REGISTER_TYPE_UB ||
1111          result.type == BRW_REGISTER_TYPE_HF)
1112         assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1113
1114      if (op[0].type == BRW_REGISTER_TYPE_B ||
1115          op[0].type == BRW_REGISTER_TYPE_UB ||
1116          op[0].type == BRW_REGISTER_TYPE_HF)
1117         assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
1118
1119      inst = bld.MOV(result, op[0]);
1120      break;
1121
1122   case nir_op_i2i8:
1123   case nir_op_u2u8:
1124      assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1125      FALLTHROUGH;
1126   case nir_op_i2i16:
1127   case nir_op_u2u16: {
1128      /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
1129       * Emitting the instructions one by one results in two MOV instructions
1130       * that won't be propagated.  By handling both instructions here, a
1131       * single MOV is emitted.
1132       */
1133      nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
1134      if (extract_instr != NULL) {
1135         if (extract_instr->op == nir_op_extract_u8 ||
1136             extract_instr->op == nir_op_extract_i8) {
1137            prepare_alu_destination_and_sources(bld, extract_instr, op, false);
1138
1139            const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
1140            const brw_reg_type type =
1141               brw_int_type(1, extract_instr->op == nir_op_extract_i8);
1142
1143            op[0] = subscript(op[0], type, byte);
1144         } else if (extract_instr->op == nir_op_extract_u16 ||
1145                    extract_instr->op == nir_op_extract_i16) {
1146            prepare_alu_destination_and_sources(bld, extract_instr, op, false);
1147
1148            const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
1149            const brw_reg_type type =
1150               brw_int_type(2, extract_instr->op == nir_op_extract_i16);
1151
1152            op[0] = subscript(op[0], type, word);
1153         }
1154      }
1155
1156      inst = bld.MOV(result, op[0]);
1157      break;
1158   }
1159
1160   case nir_op_fsat:
1161      inst = bld.MOV(result, op[0]);
1162      inst->saturate = true;
1163      break;
1164
1165   case nir_op_fneg:
1166   case nir_op_ineg:
1167      op[0].negate = true;
1168      inst = bld.MOV(result, op[0]);
1169      break;
1170
1171   case nir_op_fabs:
1172   case nir_op_iabs:
1173      op[0].negate = false;
1174      op[0].abs = true;
1175      inst = bld.MOV(result, op[0]);
1176      break;
1177
1178   case nir_op_f2f32:
1179      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1180         brw_rnd_mode rnd =
1181            brw_rnd_mode_from_execution_mode(execution_mode);
1182         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1183                  brw_imm_d(rnd));
1184      }
1185
1186      if (op[0].type == BRW_REGISTER_TYPE_HF)
1187         assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
1188
1189      inst = bld.MOV(result, op[0]);
1190      break;
1191
1192   case nir_op_fsign:
1193      emit_fsign(bld, instr, result, op, 0);
1194      break;
1195
1196   case nir_op_frcp:
1197      inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
1198      break;
1199
1200   case nir_op_fexp2:
1201      inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
1202      break;
1203
1204   case nir_op_flog2:
1205      inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
1206      break;
1207
1208   case nir_op_fsin:
1209      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
1210      break;
1211
1212   case nir_op_fcos:
1213      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
1214      break;
1215
1216   case nir_op_fddx:
1217      if (fs_key->high_quality_derivatives) {
1218         inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1219      } else {
1220         inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1221      }
1222      break;
1223   case nir_op_fddx_fine:
1224      inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1225      break;
1226   case nir_op_fddx_coarse:
1227      inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1228      break;
1229   case nir_op_fddy:
1230      if (fs_key->high_quality_derivatives) {
1231         inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1232      } else {
1233         inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1234      }
1235      break;
1236   case nir_op_fddy_fine:
1237      inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1238      break;
1239   case nir_op_fddy_coarse:
1240      inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1241      break;
1242
1243   case nir_op_fadd:
1244      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1245         brw_rnd_mode rnd =
1246            brw_rnd_mode_from_execution_mode(execution_mode);
1247         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1248                  brw_imm_d(rnd));
1249      }
1250      FALLTHROUGH;
1251   case nir_op_iadd:
1252      inst = bld.ADD(result, op[0], op[1]);
1253      break;
1254
1255   case nir_op_iadd3:
1256      inst = bld.ADD3(result, op[0], op[1], op[2]);
1257      break;
1258
1259   case nir_op_iadd_sat:
1260   case nir_op_uadd_sat:
1261      inst = bld.ADD(result, op[0], op[1]);
1262      inst->saturate = true;
1263      break;
1264
1265   case nir_op_isub_sat:
1266      bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
1267      break;
1268
1269   case nir_op_usub_sat:
1270      bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
1271      break;
1272
1273   case nir_op_irhadd:
1274   case nir_op_urhadd:
1275      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1276      inst = bld.AVG(result, op[0], op[1]);
1277      break;
1278
1279   case nir_op_ihadd:
1280   case nir_op_uhadd: {
1281      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1282      fs_reg tmp = bld.vgrf(result.type);
1283
1284      if (devinfo->ver >= 8) {
1285         op[0] = resolve_source_modifiers(op[0]);
1286         op[1] = resolve_source_modifiers(op[1]);
1287      }
1288
1289      /* AVG(x, y) - ((x ^ y) & 1) */
1290      bld.XOR(tmp, op[0], op[1]);
1291      bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type));
1292      bld.AVG(result, op[0], op[1]);
1293      inst = bld.ADD(result, result, tmp);
1294      inst->src[1].negate = true;
1295      break;
1296   }
1297
1298   case nir_op_fmul:
1299      for (unsigned i = 0; i < 2; i++) {
1300         if (can_fuse_fmul_fsign(instr, i)) {
1301            emit_fsign(bld, instr, result, op, i);
1302            return;
1303         }
1304      }
1305
1306      /* We emit the rounding mode after the previous fsign optimization since
1307       * it won't result in a MUL, but will try to negate the value by other
1308       * means.
1309       */
1310      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1311         brw_rnd_mode rnd =
1312            brw_rnd_mode_from_execution_mode(execution_mode);
1313         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1314                  brw_imm_d(rnd));
1315      }
1316
1317      inst = bld.MUL(result, op[0], op[1]);
1318      break;
1319
1320   case nir_op_imul_2x32_64:
1321   case nir_op_umul_2x32_64:
1322      bld.MUL(result, op[0], op[1]);
1323      break;
1324
1325   case nir_op_imul_32x16:
1326   case nir_op_umul_32x16: {
1327      const bool ud = instr->op == nir_op_umul_32x16;
1328
1329      assert(nir_dest_bit_size(instr->dest.dest) == 32);
1330
1331      /* Before Gfx7, the order of the 32-bit source and the 16-bit source was
1332       * swapped.  The extension isn't enabled on those platforms, so don't
1333       * pretend to support the differences.
1334       */
1335      assert(devinfo->ver >= 7);
1336
1337      if (op[1].file == IMM)
1338         op[1] = ud ? brw_imm_uw(op[1].ud) : brw_imm_w(op[1].d);
1339      else {
1340         const enum brw_reg_type word_type =
1341            ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W;
1342
1343         op[1] = subscript(op[1], word_type, 0);
1344      }
1345
1346      const enum brw_reg_type dword_type =
1347         ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
1348
1349      bld.MUL(result, retype(op[0], dword_type), op[1]);
1350      break;
1351   }
1352
1353   case nir_op_imul:
1354      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1355      bld.MUL(result, op[0], op[1]);
1356      break;
1357
1358   case nir_op_imul_high:
1359   case nir_op_umul_high:
1360      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1361      bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
1362      break;
1363
1364   case nir_op_idiv:
1365   case nir_op_udiv:
1366      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1367      bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1368      break;
1369
1370   case nir_op_uadd_carry:
1371      unreachable("Should have been lowered by carry_to_arith().");
1372
1373   case nir_op_usub_borrow:
1374      unreachable("Should have been lowered by borrow_to_arith().");
1375
1376   case nir_op_umod:
1377   case nir_op_irem:
1378      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1379       * appears that our hardware just does the right thing for signed
1380       * remainder.
1381       */
1382      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1383      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1384      break;
1385
1386   case nir_op_imod: {
1387      /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1388      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1389
1390      /* Math instructions don't support conditional mod */
1391      inst = bld.MOV(bld.null_reg_d(), result);
1392      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1393
1394      /* Now, we need to determine if signs of the sources are different.
1395       * When we XOR the sources, the top bit is 0 if they are the same and 1
1396       * if they are different.  We can then use a conditional modifier to
1397       * turn that into a predicate.  This leads us to an XOR.l instruction.
1398       *
1399       * Technically, according to the PRM, you're not allowed to use .l on a
1400       * XOR instruction.  However, emperical experiments and Curro's reading
1401       * of the simulator source both indicate that it's safe.
1402       */
1403      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
1404      inst = bld.XOR(tmp, op[0], op[1]);
1405      inst->predicate = BRW_PREDICATE_NORMAL;
1406      inst->conditional_mod = BRW_CONDITIONAL_L;
1407
1408      /* If the result of the initial remainder operation is non-zero and the
1409       * two sources have different signs, add in a copy of op[1] to get the
1410       * final integer modulus value.
1411       */
1412      inst = bld.ADD(result, result, op[1]);
1413      inst->predicate = BRW_PREDICATE_NORMAL;
1414      break;
1415   }
1416
1417   case nir_op_flt32:
1418   case nir_op_fge32:
1419   case nir_op_feq32:
1420   case nir_op_fneu32: {
1421      fs_reg dest = result;
1422
1423      const uint32_t bit_size =  nir_src_bit_size(instr->src[0].src);
1424      if (bit_size != 32)
1425         dest = bld.vgrf(op[0].type, 1);
1426
1427      bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));
1428
1429      if (bit_size > 32) {
1430         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1431      } else if(bit_size < 32) {
1432         /* When we convert the result to 32-bit we need to be careful and do
1433          * it as a signed conversion to get sign extension (for 32-bit true)
1434          */
1435         const brw_reg_type src_type =
1436            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1437
1438         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1439      }
1440      break;
1441   }
1442
1443   case nir_op_ilt32:
1444   case nir_op_ult32:
1445   case nir_op_ige32:
1446   case nir_op_uge32:
1447   case nir_op_ieq32:
1448   case nir_op_ine32: {
1449      fs_reg dest = result;
1450
1451      const uint32_t bit_size = type_sz(op[0].type) * 8;
1452      if (bit_size != 32)
1453         dest = bld.vgrf(op[0].type, 1);
1454
1455      bld.CMP(dest, op[0], op[1],
1456              brw_cmod_for_nir_comparison(instr->op));
1457
1458      if (bit_size > 32) {
1459         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1460      } else if (bit_size < 32) {
1461         /* When we convert the result to 32-bit we need to be careful and do
1462          * it as a signed conversion to get sign extension (for 32-bit true)
1463          */
1464         const brw_reg_type src_type =
1465            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1466
1467         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1468      }
1469      break;
1470   }
1471
1472   case nir_op_inot:
1473      if (devinfo->ver >= 8) {
1474         nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1475
1476         if (inot_src_instr != NULL &&
1477             (inot_src_instr->op == nir_op_ior ||
1478              inot_src_instr->op == nir_op_ixor ||
1479              inot_src_instr->op == nir_op_iand)) {
1480            /* The sources of the source logical instruction are now the
1481             * sources of the instruction that will be generated.
1482             */
1483            prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);
1484            resolve_inot_sources(bld, inot_src_instr, op);
1485
1486            /* Smash all of the sources and destination to be signed.  This
1487             * doesn't matter for the operation of the instruction, but cmod
1488             * propagation fails on unsigned sources with negation (due to
1489             * fs_inst::can_do_cmod returning false).
1490             */
1491            result.type =
1492               brw_type_for_nir_type(devinfo,
1493                                     (nir_alu_type)(nir_type_int |
1494                                                    nir_dest_bit_size(instr->dest.dest)));
1495            op[0].type =
1496               brw_type_for_nir_type(devinfo,
1497                                     (nir_alu_type)(nir_type_int |
1498                                                    nir_src_bit_size(inot_src_instr->src[0].src)));
1499            op[1].type =
1500               brw_type_for_nir_type(devinfo,
1501                                     (nir_alu_type)(nir_type_int |
1502                                                    nir_src_bit_size(inot_src_instr->src[1].src)));
1503
1504            /* For XOR, only invert one of the sources.  Arbitrarily choose
1505             * the first source.
1506             */
1507            op[0].negate = !op[0].negate;
1508            if (inot_src_instr->op != nir_op_ixor)
1509               op[1].negate = !op[1].negate;
1510
1511            switch (inot_src_instr->op) {
1512            case nir_op_ior:
1513               bld.AND(result, op[0], op[1]);
1514               return;
1515
1516            case nir_op_iand:
1517               bld.OR(result, op[0], op[1]);
1518               return;
1519
1520            case nir_op_ixor:
1521               bld.XOR(result, op[0], op[1]);
1522               return;
1523
1524            default:
1525               unreachable("impossible opcode");
1526            }
1527         }
1528         op[0] = resolve_source_modifiers(op[0]);
1529      }
1530      bld.NOT(result, op[0]);
1531      break;
1532   case nir_op_ixor:
1533      if (devinfo->ver >= 8) {
1534         resolve_inot_sources(bld, instr, op);
1535      }
1536      bld.XOR(result, op[0], op[1]);
1537      break;
1538   case nir_op_ior:
1539      if (devinfo->ver >= 8) {
1540         resolve_inot_sources(bld, instr, op);
1541      }
1542      bld.OR(result, op[0], op[1]);
1543      break;
1544   case nir_op_iand:
1545      if (devinfo->ver >= 8) {
1546         resolve_inot_sources(bld, instr, op);
1547      }
1548      bld.AND(result, op[0], op[1]);
1549      break;
1550
1551   case nir_op_fdot2:
1552   case nir_op_fdot3:
1553   case nir_op_fdot4:
1554   case nir_op_b32all_fequal2:
1555   case nir_op_b32all_iequal2:
1556   case nir_op_b32all_fequal3:
1557   case nir_op_b32all_iequal3:
1558   case nir_op_b32all_fequal4:
1559   case nir_op_b32all_iequal4:
1560   case nir_op_b32any_fnequal2:
1561   case nir_op_b32any_inequal2:
1562   case nir_op_b32any_fnequal3:
1563   case nir_op_b32any_inequal3:
1564   case nir_op_b32any_fnequal4:
1565   case nir_op_b32any_inequal4:
1566      unreachable("Lowered by nir_lower_alu_reductions");
1567
1568   case nir_op_ldexp:
1569      unreachable("not reached: should be handled by ldexp_to_arith()");
1570
1571   case nir_op_fsqrt:
1572      inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1573      break;
1574
1575   case nir_op_frsq:
1576      inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1577      break;
1578
1579   case nir_op_i2b32:
1580   case nir_op_f2b32: {
1581      uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1582      if (bit_size == 64) {
1583         /* two-argument instructions can't take 64-bit immediates */
1584         fs_reg zero;
1585         fs_reg tmp;
1586
1587         if (instr->op == nir_op_f2b32) {
1588            zero = vgrf(glsl_type::double_type);
1589            tmp = vgrf(glsl_type::double_type);
1590            bld.MOV(zero, setup_imm_df(bld, 0.0));
1591         } else {
1592            zero = vgrf(glsl_type::int64_t_type);
1593            tmp = vgrf(glsl_type::int64_t_type);
1594            bld.MOV(zero, brw_imm_q(0));
1595         }
1596
1597         /* A SIMD16 execution needs to be split in two instructions, so use
1598          * a vgrf instead of the flag register as dst so instruction splitting
1599          * works
1600          */
1601         bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1602         bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1603      } else {
1604         fs_reg zero;
1605         if (bit_size == 32) {
1606            zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0);
1607         } else {
1608            assert(bit_size == 16);
1609            zero = instr->op == nir_op_f2b32 ?
1610               retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
1611         }
1612         bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
1613      }
1614      break;
1615   }
1616
1617   case nir_op_ftrunc:
1618      inst = bld.RNDZ(result, op[0]);
1619      if (devinfo->ver < 6) {
1620         set_condmod(BRW_CONDITIONAL_R, inst);
1621         set_predicate(BRW_PREDICATE_NORMAL,
1622                       bld.ADD(result, result, brw_imm_f(1.0f)));
1623         inst = bld.MOV(result, result); /* for potential saturation */
1624      }
1625      break;
1626
1627   case nir_op_fceil: {
1628      op[0].negate = !op[0].negate;
1629      fs_reg temp = vgrf(glsl_type::float_type);
1630      bld.RNDD(temp, op[0]);
1631      temp.negate = true;
1632      inst = bld.MOV(result, temp);
1633      break;
1634   }
1635   case nir_op_ffloor:
1636      inst = bld.RNDD(result, op[0]);
1637      break;
1638   case nir_op_ffract:
1639      inst = bld.FRC(result, op[0]);
1640      break;
1641   case nir_op_fround_even:
1642      inst = bld.RNDE(result, op[0]);
1643      if (devinfo->ver < 6) {
1644         set_condmod(BRW_CONDITIONAL_R, inst);
1645         set_predicate(BRW_PREDICATE_NORMAL,
1646                       bld.ADD(result, result, brw_imm_f(1.0f)));
1647         inst = bld.MOV(result, result); /* for potential saturation */
1648      }
1649      break;
1650
1651   case nir_op_fquantize2f16: {
1652      fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1653      fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1654      fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1655
1656      /* The destination stride must be at least as big as the source stride. */
1657      tmp16.type = BRW_REGISTER_TYPE_W;
1658      tmp16.stride = 2;
1659
1660      /* Check for denormal */
1661      fs_reg abs_src0 = op[0];
1662      abs_src0.abs = true;
1663      bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1664              BRW_CONDITIONAL_L);
1665      /* Get the appropriately signed zero */
1666      bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1667              retype(op[0], BRW_REGISTER_TYPE_UD),
1668              brw_imm_ud(0x80000000));
1669      /* Do the actual F32 -> F16 -> F32 conversion */
1670      bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1671      bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1672      /* Select that or zero based on normal status */
1673      inst = bld.SEL(result, zero, tmp32);
1674      inst->predicate = BRW_PREDICATE_NORMAL;
1675      break;
1676   }
1677
1678   case nir_op_imin:
1679   case nir_op_umin:
1680   case nir_op_fmin:
1681      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1682      break;
1683
1684   case nir_op_imax:
1685   case nir_op_umax:
1686   case nir_op_fmax:
1687      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1688      break;
1689
1690   case nir_op_pack_snorm_2x16:
1691   case nir_op_pack_snorm_4x8:
1692   case nir_op_pack_unorm_2x16:
1693   case nir_op_pack_unorm_4x8:
1694   case nir_op_unpack_snorm_2x16:
1695   case nir_op_unpack_snorm_4x8:
1696   case nir_op_unpack_unorm_2x16:
1697   case nir_op_unpack_unorm_4x8:
1698   case nir_op_unpack_half_2x16:
1699   case nir_op_pack_half_2x16:
1700      unreachable("not reached: should be handled by lower_packing_builtins");
1701
1702   case nir_op_unpack_half_2x16_split_x_flush_to_zero:
1703      assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
1704      FALLTHROUGH;
1705   case nir_op_unpack_half_2x16_split_x:
1706      inst = bld.emit(BRW_OPCODE_F16TO32, result,
1707                      subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1708      break;
1709
1710   case nir_op_unpack_half_2x16_split_y_flush_to_zero:
1711      assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
1712      FALLTHROUGH;
1713   case nir_op_unpack_half_2x16_split_y:
1714      inst = bld.emit(BRW_OPCODE_F16TO32, result,
1715                      subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1716      break;
1717
1718   case nir_op_pack_64_2x32_split:
1719   case nir_op_pack_32_2x16_split:
1720      bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1721      break;
1722
1723   case nir_op_pack_32_4x8_split:
1724      bld.emit(FS_OPCODE_PACK, result, op, 4);
1725      break;
1726
1727   case nir_op_unpack_64_2x32_split_x:
1728   case nir_op_unpack_64_2x32_split_y: {
1729      if (instr->op == nir_op_unpack_64_2x32_split_x)
1730         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1731      else
1732         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1733      break;
1734   }
1735
1736   case nir_op_unpack_32_2x16_split_x:
1737   case nir_op_unpack_32_2x16_split_y: {
1738      if (instr->op == nir_op_unpack_32_2x16_split_x)
1739         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1740      else
1741         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1742      break;
1743   }
1744
1745   case nir_op_fpow:
1746      inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1747      break;
1748
1749   case nir_op_bitfield_reverse:
1750      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1751      bld.BFREV(result, op[0]);
1752      break;
1753
1754   case nir_op_bit_count:
1755      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1756      bld.CBIT(result, op[0]);
1757      break;
1758
1759   case nir_op_ufind_msb: {
1760      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1761      emit_find_msb_using_lzd(bld, result, op[0], false);
1762      break;
1763   }
1764
1765   case nir_op_uclz:
1766      assert(nir_dest_bit_size(instr->dest.dest) == 32);
1767      bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1768      break;
1769
1770   case nir_op_ifind_msb: {
1771      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1772
1773      if (devinfo->ver < 7) {
1774         emit_find_msb_using_lzd(bld, result, op[0], true);
1775      } else {
1776         bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1777
1778         /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1779          * count from the LSB side. If FBH didn't return an error
1780          * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1781          * count into an LSB count.
1782          */
1783         bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1784
1785         inst = bld.ADD(result, result, brw_imm_d(31));
1786         inst->predicate = BRW_PREDICATE_NORMAL;
1787         inst->src[0].negate = true;
1788      }
1789      break;
1790   }
1791
1792   case nir_op_find_lsb:
1793      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1794
1795      if (devinfo->ver < 7) {
1796         fs_reg temp = vgrf(glsl_type::int_type);
1797
1798         /* (x & -x) generates a value that consists of only the LSB of x.
1799          * For all powers of 2, findMSB(y) == findLSB(y).
1800          */
1801         fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1802         fs_reg negated_src = src;
1803
1804         /* One must be negated, and the other must be non-negated.  It
1805          * doesn't matter which is which.
1806          */
1807         negated_src.negate = true;
1808         src.negate = false;
1809
1810         bld.AND(temp, src, negated_src);
1811         emit_find_msb_using_lzd(bld, result, temp, false);
1812      } else {
1813         bld.FBL(result, op[0]);
1814      }
1815      break;
1816
1817   case nir_op_ubitfield_extract:
1818   case nir_op_ibitfield_extract:
1819      unreachable("should have been lowered");
1820   case nir_op_ubfe:
1821   case nir_op_ibfe:
1822      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1823      bld.BFE(result, op[2], op[1], op[0]);
1824      break;
1825   case nir_op_bfm:
1826      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1827      bld.BFI1(result, op[0], op[1]);
1828      break;
1829   case nir_op_bfi:
1830      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1831      bld.BFI2(result, op[0], op[1], op[2]);
1832      break;
1833
1834   case nir_op_bitfield_insert:
1835      unreachable("not reached: should have been lowered");
1836
1837   /* For all shift operations:
1838    *
1839    * Gen4 - Gen7: After application of source modifiers, the low 5-bits of
1840    * src1 are used an unsigned value for the shift count.
1841    *
1842    * Gen8: As with earlier platforms, but for Q and UQ types on src0, the low
1843    * 6-bit of src1 are used.
1844    *
1845    * Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for
1846    * W or UW src0).
1847    *
1848    * The implication is that the following instruction will produce a
1849    * different result on Gen9+ than on previous platforms:
1850    *
1851    *    shr(8)    g4<1>UW    g12<8,8,1>UW    0x0010UW
1852    *
1853    * where Gen9+ will shift by zero, and earlier platforms will shift by 16.
1854    *
1855    * This does not seem to be the case.  Experimentally, it has been
1856    * determined that shifts of 16-bit values on Gen8 behave properly.  Shifts
1857    * of 8-bit values on both Gen8 and Gen9 do not.  Gen11+ lowers 8-bit
1858    * values, so those platforms were not tested.  No features expose access
1859    * to 8- or 16-bit types on Gen7 or earlier, so those platforms were not
1860    * tested either.  See
1861    * https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76.
1862    *
1863    * This is part of the reason 8-bit values are lowered to 16-bit on all
1864    * platforms.
1865    */
1866   case nir_op_ishl:
1867      bld.SHL(result, op[0], op[1]);
1868      break;
1869   case nir_op_ishr:
1870      bld.ASR(result, op[0], op[1]);
1871      break;
1872   case nir_op_ushr:
1873      bld.SHR(result, op[0], op[1]);
1874      break;
1875
1876   case nir_op_urol:
1877      bld.ROL(result, op[0], op[1]);
1878      break;
1879   case nir_op_uror:
1880      bld.ROR(result, op[0], op[1]);
1881      break;
1882
1883   case nir_op_pack_half_2x16_split:
1884      bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1885      break;
1886
1887   case nir_op_sdot_4x8_iadd:
1888   case nir_op_sdot_4x8_iadd_sat:
1889      inst = bld.DP4A(result,
1890                      retype(op[2], BRW_REGISTER_TYPE_D),
1891                      retype(op[0], BRW_REGISTER_TYPE_D),
1892                      retype(op[1], BRW_REGISTER_TYPE_D));
1893
1894      if (instr->op == nir_op_sdot_4x8_iadd_sat)
1895         inst->saturate = true;
1896      break;
1897
1898   case nir_op_udot_4x8_uadd:
1899   case nir_op_udot_4x8_uadd_sat:
1900      inst = bld.DP4A(result,
1901                      retype(op[2], BRW_REGISTER_TYPE_UD),
1902                      retype(op[0], BRW_REGISTER_TYPE_UD),
1903                      retype(op[1], BRW_REGISTER_TYPE_UD));
1904
1905      if (instr->op == nir_op_udot_4x8_uadd_sat)
1906         inst->saturate = true;
1907      break;
1908
1909   case nir_op_sudot_4x8_iadd:
1910   case nir_op_sudot_4x8_iadd_sat:
1911      inst = bld.DP4A(result,
1912                      retype(op[2], BRW_REGISTER_TYPE_D),
1913                      retype(op[0], BRW_REGISTER_TYPE_D),
1914                      retype(op[1], BRW_REGISTER_TYPE_UD));
1915
1916      if (instr->op == nir_op_sudot_4x8_iadd_sat)
1917         inst->saturate = true;
1918      break;
1919
1920   case nir_op_ffma:
1921      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1922         brw_rnd_mode rnd =
1923            brw_rnd_mode_from_execution_mode(execution_mode);
1924         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1925                  brw_imm_d(rnd));
1926      }
1927
1928      inst = bld.MAD(result, op[2], op[1], op[0]);
1929      break;
1930
1931   case nir_op_flrp:
1932      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
1933         brw_rnd_mode rnd =
1934            brw_rnd_mode_from_execution_mode(execution_mode);
1935         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1936                  brw_imm_d(rnd));
1937      }
1938
1939      inst = bld.LRP(result, op[0], op[1], op[2]);
1940      break;
1941
1942   case nir_op_b32csel:
1943      if (optimize_frontfacing_ternary(instr, result))
1944         return;
1945
1946      bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1947      inst = bld.SEL(result, op[1], op[2]);
1948      inst->predicate = BRW_PREDICATE_NORMAL;
1949      break;
1950
1951   case nir_op_extract_u8:
1952   case nir_op_extract_i8: {
1953      unsigned byte = nir_src_as_uint(instr->src[1].src);
1954
1955      /* The PRMs say:
1956       *
1957       *    BDW+
1958       *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1959       *    Use two instructions and a word or DWord intermediate integer type.
1960       */
1961      if (nir_dest_bit_size(instr->dest.dest) == 64) {
1962         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1963
1964         if (instr->op == nir_op_extract_i8) {
1965            /* If we need to sign extend, extract to a word first */
1966            fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
1967            bld.MOV(w_temp, subscript(op[0], type, byte));
1968            bld.MOV(result, w_temp);
1969         } else if (byte & 1) {
1970            /* Extract the high byte from the word containing the desired byte
1971             * offset.
1972             */
1973            bld.SHR(result,
1974                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1975                    brw_imm_uw(8));
1976         } else {
1977            /* Otherwise use an AND with 0xff and a word type */
1978            bld.AND(result,
1979                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1980                    brw_imm_uw(0xff));
1981         }
1982      } else {
1983         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1984         bld.MOV(result, subscript(op[0], type, byte));
1985      }
1986      break;
1987   }
1988
1989   case nir_op_extract_u16:
1990   case nir_op_extract_i16: {
1991      const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1992      unsigned word = nir_src_as_uint(instr->src[1].src);
1993      bld.MOV(result, subscript(op[0], type, word));
1994      break;
1995   }
1996
1997   default:
1998      unreachable("unhandled instruction");
1999   }
2000
2001   /* If we need to do a boolean resolve, replace the result with -(x & 1)
2002    * to sign extend the low bit to 0/~0
2003    */
2004   if (devinfo->ver <= 5 &&
2005       !result.is_null() &&
2006       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
2007      fs_reg masked = vgrf(glsl_type::int_type);
2008      bld.AND(masked, result, brw_imm_d(1));
2009      masked.negate = true;
2010      bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
2011   }
2012}
2013
2014void
2015fs_visitor::nir_emit_load_const(const fs_builder &bld,
2016                                nir_load_const_instr *instr)
2017{
2018   const brw_reg_type reg_type =
2019      brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
2020   fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
2021
2022   switch (instr->def.bit_size) {
2023   case 8:
2024      for (unsigned i = 0; i < instr->def.num_components; i++)
2025         bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8));
2026      break;
2027
2028   case 16:
2029      for (unsigned i = 0; i < instr->def.num_components; i++)
2030         bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16));
2031      break;
2032
2033   case 32:
2034      for (unsigned i = 0; i < instr->def.num_components; i++)
2035         bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32));
2036      break;
2037
2038   case 64:
2039      assert(devinfo->ver >= 7);
2040      if (devinfo->ver == 7) {
2041         /* We don't get 64-bit integer types until gfx8 */
2042         for (unsigned i = 0; i < instr->def.num_components; i++) {
2043            bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
2044                    setup_imm_df(bld, instr->value[i].f64));
2045         }
2046      } else {
2047         for (unsigned i = 0; i < instr->def.num_components; i++)
2048            bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64));
2049      }
2050      break;
2051
2052   default:
2053      unreachable("Invalid bit size");
2054   }
2055
2056   nir_ssa_values[instr->def.index] = reg;
2057}
2058
2059fs_reg
2060fs_visitor::get_nir_src(const nir_src &src)
2061{
2062   fs_reg reg;
2063   if (src.is_ssa) {
2064      if (nir_src_is_undef(src)) {
2065         const brw_reg_type reg_type =
2066            brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
2067         reg = bld.vgrf(reg_type, src.ssa->num_components);
2068      } else {
2069         reg = nir_ssa_values[src.ssa->index];
2070      }
2071   } else {
2072      /* We don't handle indirects on locals */
2073      assert(src.reg.indirect == NULL);
2074      reg = offset(nir_locals[src.reg.reg->index], bld,
2075                   src.reg.base_offset * src.reg.reg->num_components);
2076   }
2077
2078   if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) {
2079      /* The only 64-bit type available on gfx7 is DF, so use that. */
2080      reg.type = BRW_REGISTER_TYPE_DF;
2081   } else {
2082      /* To avoid floating-point denorm flushing problems, set the type by
2083       * default to an integer type - instructions that need floating point
2084       * semantics will set this to F if they need to
2085       */
2086      reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
2087                                            BRW_REGISTER_TYPE_D);
2088   }
2089
2090   return reg;
2091}
2092
2093/**
2094 * Return an IMM for constants; otherwise call get_nir_src() as normal.
2095 *
2096 * This function should not be called on any value which may be 64 bits.
2097 * We could theoretically support 64-bit on gfx8+ but we choose not to
2098 * because it wouldn't work in general (no gfx7 support) and there are
2099 * enough restrictions in 64-bit immediates that you can't take the return
2100 * value and treat it the same as the result of get_nir_src().
2101 */
2102fs_reg
2103fs_visitor::get_nir_src_imm(const nir_src &src)
2104{
2105   assert(nir_src_bit_size(src) == 32);
2106   return nir_src_is_const(src) ?
2107          fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src);
2108}
2109
2110fs_reg
2111fs_visitor::get_nir_dest(const nir_dest &dest)
2112{
2113   if (dest.is_ssa) {
2114      const brw_reg_type reg_type =
2115         brw_reg_type_from_bit_size(dest.ssa.bit_size,
2116                                    dest.ssa.bit_size == 8 ?
2117                                    BRW_REGISTER_TYPE_D :
2118                                    BRW_REGISTER_TYPE_F);
2119      nir_ssa_values[dest.ssa.index] =
2120         bld.vgrf(reg_type, dest.ssa.num_components);
2121      bld.UNDEF(nir_ssa_values[dest.ssa.index]);
2122      return nir_ssa_values[dest.ssa.index];
2123   } else {
2124      /* We don't handle indirects on locals */
2125      assert(dest.reg.indirect == NULL);
2126      return offset(nir_locals[dest.reg.reg->index], bld,
2127                    dest.reg.base_offset * dest.reg.reg->num_components);
2128   }
2129}
2130
2131void
2132fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
2133                         unsigned wr_mask)
2134{
2135   for (unsigned i = 0; i < 4; i++) {
2136      if (!((wr_mask >> i) & 1))
2137         continue;
2138
2139      fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
2140      new_inst->dst = offset(new_inst->dst, bld, i);
2141      for (unsigned j = 0; j < new_inst->sources; j++)
2142         if (new_inst->src[j].file == VGRF)
2143            new_inst->src[j] = offset(new_inst->src[j], bld, i);
2144
2145      bld.emit(new_inst);
2146   }
2147}
2148
2149static fs_inst *
2150emit_pixel_interpolater_send(const fs_builder &bld,
2151                             enum opcode opcode,
2152                             const fs_reg &dst,
2153                             const fs_reg &src,
2154                             const fs_reg &desc,
2155                             glsl_interp_mode interpolation)
2156{
2157   struct brw_wm_prog_data *wm_prog_data =
2158      brw_wm_prog_data(bld.shader->stage_prog_data);
2159
2160   fs_inst *inst = bld.emit(opcode, dst, src, desc);
2161   /* 2 floats per slot returned */
2162   inst->size_written = 2 * dst.component_size(inst->exec_size);
2163   inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
2164
2165   wm_prog_data->pulls_bary = true;
2166
2167   return inst;
2168}
2169
2170/**
2171 * Computes 1 << x, given a D/UD register containing some value x.
2172 */
2173static fs_reg
2174intexp2(const fs_builder &bld, const fs_reg &x)
2175{
2176   assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
2177
2178   fs_reg result = bld.vgrf(x.type, 1);
2179   fs_reg one = bld.vgrf(x.type, 1);
2180
2181   bld.MOV(one, retype(brw_imm_d(1), one.type));
2182   bld.SHL(result, one, x);
2183   return result;
2184}
2185
2186void
2187fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
2188{
2189   assert(stage == MESA_SHADER_GEOMETRY);
2190
2191   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2192
2193   if (gs_compile->control_data_header_size_bits == 0)
2194      return;
2195
2196   /* We can only do EndPrimitive() functionality when the control data
2197    * consists of cut bits.  Fortunately, the only time it isn't is when the
2198    * output type is points, in which case EndPrimitive() is a no-op.
2199    */
2200   if (gs_prog_data->control_data_format !=
2201       GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2202      return;
2203   }
2204
2205   /* Cut bits use one bit per vertex. */
2206   assert(gs_compile->control_data_bits_per_vertex == 1);
2207
2208   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
2209   vertex_count.type = BRW_REGISTER_TYPE_UD;
2210
2211   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2212    * vertex n, 0 otherwise.  So all we need to do here is mark bit
2213    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2214    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2215    * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2216    *
2217    * Note that if EndPrimitive() is called before emitting any vertices, this
2218    * will cause us to set bit 31 of the control_data_bits register to 1.
2219    * That's fine because:
2220    *
2221    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2222    *   output, so the hardware will ignore cut bit 31.
2223    *
2224    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2225    *   last vertex, so setting cut bit 31 has no effect (since the primitive
2226    *   is automatically ended when the GS terminates).
2227    *
2228    * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2229    *   control_data_bits register to 0 when the first vertex is emitted.
2230    */
2231
2232   const fs_builder abld = bld.annotate("end primitive");
2233
2234   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2235   fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2236   abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2237   fs_reg mask = intexp2(abld, prev_count);
2238   /* Note: we're relying on the fact that the GEN SHL instruction only pays
2239    * attention to the lower 5 bits of its second source argument, so on this
2240    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2241    * ((vertex_count - 1) % 32).
2242    */
2243   abld.OR(this->control_data_bits, this->control_data_bits, mask);
2244}
2245
2246void
2247fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
2248{
2249   assert(stage == MESA_SHADER_GEOMETRY);
2250   assert(gs_compile->control_data_bits_per_vertex != 0);
2251
2252   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2253
2254   const fs_builder abld = bld.annotate("emit control data bits");
2255   const fs_builder fwa_bld = bld.exec_all();
2256
2257   /* We use a single UD register to accumulate control data bits (32 bits
2258    * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
2259    * at a time.
2260    *
2261    * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
2262    * We have select a 128-bit group via the Global and Per-Slot Offsets, then
2263    * use the Channel Mask phase to enable/disable which DWord within that
2264    * group to write.  (Remember, different SIMD8 channels may have emitted
2265    * different numbers of vertices, so we may need per-slot offsets.)
2266    *
2267    * Channel masking presents an annoying problem: we may have to replicate
2268    * the data up to 4 times:
2269    *
2270    * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
2271    *
2272    * To avoid penalizing shaders that emit a small number of vertices, we
2273    * can avoid these sometimes: if the size of the control data header is
2274    * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
2275    * land in the same 128-bit group, so we can skip per-slot offsets.
2276    *
2277    * Similarly, if the control data header is <= 32 bits, there is only one
2278    * DWord, so we can skip channel masks.
2279    */
2280   enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
2281
2282   fs_reg channel_mask, per_slot_offset;
2283
2284   if (gs_compile->control_data_header_size_bits > 32) {
2285      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
2286      channel_mask = vgrf(glsl_type::uint_type);
2287   }
2288
2289   if (gs_compile->control_data_header_size_bits > 128) {
2290      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
2291      per_slot_offset = vgrf(glsl_type::uint_type);
2292   }
2293
2294   /* Figure out which DWord we're trying to write to using the formula:
2295    *
2296    *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
2297    *
2298    * Since bits_per_vertex is a power of two, and is known at compile
2299    * time, this can be optimized to:
2300    *
2301    *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
2302    */
2303   if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
2304      fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2305      fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2306      abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2307      unsigned log2_bits_per_vertex =
2308         util_last_bit(gs_compile->control_data_bits_per_vertex);
2309      abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
2310
2311      if (per_slot_offset.file != BAD_FILE) {
2312         /* Set the per-slot offset to dword_index / 4, so that we'll write to
2313          * the appropriate OWord within the control data header.
2314          */
2315         abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
2316      }
2317
2318      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
2319       * write to the appropriate DWORD within the OWORD.
2320       */
2321      fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2322      fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
2323      channel_mask = intexp2(fwa_bld, channel);
2324      /* Then the channel masks need to be in bits 23:16. */
2325      fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
2326   }
2327
2328   /* Store the control data bits in the message payload and send it. */
2329   unsigned mlen = 2;
2330   if (channel_mask.file != BAD_FILE)
2331      mlen += 4; /* channel masks, plus 3 extra copies of the data */
2332   if (per_slot_offset.file != BAD_FILE)
2333      mlen++;
2334
2335   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
2336   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
2337   unsigned i = 0;
2338   sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
2339   if (per_slot_offset.file != BAD_FILE)
2340      sources[i++] = per_slot_offset;
2341   if (channel_mask.file != BAD_FILE)
2342      sources[i++] = channel_mask;
2343   while (i < mlen) {
2344      sources[i++] = this->control_data_bits;
2345   }
2346
2347   abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
2348   fs_inst *inst = abld.emit(opcode, reg_undef, payload);
2349   inst->mlen = mlen;
2350   /* We need to increment Global Offset by 256-bits to make room for
2351    * Broadwell's extra "Vertex Count" payload at the beginning of the
2352    * URB entry.  Since this is an OWord message, Global Offset is counted
2353    * in 128-bit units, so we must set it to 2.
2354    */
2355   if (gs_prog_data->static_vertex_count == -1)
2356      inst->offset = 2;
2357}
2358
2359void
2360fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
2361                                            unsigned stream_id)
2362{
2363   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2364
2365   /* Note: we are calling this *before* increasing vertex_count, so
2366    * this->vertex_count == vertex_count - 1 in the formula above.
2367    */
2368
2369   /* Stream mode uses 2 bits per vertex */
2370   assert(gs_compile->control_data_bits_per_vertex == 2);
2371
2372   /* Must be a valid stream */
2373   assert(stream_id < MAX_VERTEX_STREAMS);
2374
2375   /* Control data bits are initialized to 0 so we don't have to set any
2376    * bits when sending vertices to stream 0.
2377    */
2378   if (stream_id == 0)
2379      return;
2380
2381   const fs_builder abld = bld.annotate("set stream control data bits", NULL);
2382
2383   /* reg::sid = stream_id */
2384   fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2385   abld.MOV(sid, brw_imm_ud(stream_id));
2386
2387   /* reg:shift_count = 2 * (vertex_count - 1) */
2388   fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2389   abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
2390
2391   /* Note: we're relying on the fact that the GEN SHL instruction only pays
2392    * attention to the lower 5 bits of its second source argument, so on this
2393    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2394    * stream_id << ((2 * (vertex_count - 1)) % 32).
2395    */
2396   fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2397   abld.SHL(mask, sid, shift_count);
2398   abld.OR(this->control_data_bits, this->control_data_bits, mask);
2399}
2400
2401void
2402fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
2403                           unsigned stream_id)
2404{
2405   assert(stage == MESA_SHADER_GEOMETRY);
2406
2407   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2408
2409   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
2410   vertex_count.type = BRW_REGISTER_TYPE_UD;
2411
2412   /* Haswell and later hardware ignores the "Render Stream Select" bits
2413    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
2414    * and instead sends all primitives down the pipeline for rasterization.
2415    * If the SOL stage is enabled, "Render Stream Select" is honored and
2416    * primitives bound to non-zero streams are discarded after stream output.
2417    *
2418    * Since the only purpose of primives sent to non-zero streams is to
2419    * be recorded by transform feedback, we can simply discard all geometry
2420    * bound to these streams when transform feedback is disabled.
2421    */
2422   if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
2423      return;
2424
2425   /* If we're outputting 32 control data bits or less, then we can wait
2426    * until the shader is over to output them all.  Otherwise we need to
2427    * output them as we go.  Now is the time to do it, since we're about to
2428    * output the vertex_count'th vertex, so it's guaranteed that the
2429    * control data bits associated with the (vertex_count - 1)th vertex are
2430    * correct.
2431    */
2432   if (gs_compile->control_data_header_size_bits > 32) {
2433      const fs_builder abld =
2434         bld.annotate("emit vertex: emit control data bits");
2435
2436      /* Only emit control data bits if we've finished accumulating a batch
2437       * of 32 bits.  This is the case when:
2438       *
2439       *     (vertex_count * bits_per_vertex) % 32 == 0
2440       *
2441       * (in other words, when the last 5 bits of vertex_count *
2442       * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
2443       * integer n (which is always the case, since bits_per_vertex is
2444       * always 1 or 2), this is equivalent to requiring that the last 5-n
2445       * bits of vertex_count are 0:
2446       *
2447       *     vertex_count & (2^(5-n) - 1) == 0
2448       *
2449       * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
2450       * equivalent to:
2451       *
2452       *     vertex_count & (32 / bits_per_vertex - 1) == 0
2453       *
2454       * TODO: If vertex_count is an immediate, we could do some of this math
2455       *       at compile time...
2456       */
2457      fs_inst *inst =
2458         abld.AND(bld.null_reg_d(), vertex_count,
2459                  brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
2460      inst->conditional_mod = BRW_CONDITIONAL_Z;
2461
2462      abld.IF(BRW_PREDICATE_NORMAL);
2463      /* If vertex_count is 0, then no control data bits have been
2464       * accumulated yet, so we can skip emitting them.
2465       */
2466      abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
2467               BRW_CONDITIONAL_NEQ);
2468      abld.IF(BRW_PREDICATE_NORMAL);
2469      emit_gs_control_data_bits(vertex_count);
2470      abld.emit(BRW_OPCODE_ENDIF);
2471
2472      /* Reset control_data_bits to 0 so we can start accumulating a new
2473       * batch.
2474       *
2475       * Note: in the case where vertex_count == 0, this neutralizes the
2476       * effect of any call to EndPrimitive() that the shader may have
2477       * made before outputting its first vertex.
2478       */
2479      inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
2480      inst->force_writemask_all = true;
2481      abld.emit(BRW_OPCODE_ENDIF);
2482   }
2483
2484   emit_urb_writes(vertex_count);
2485
2486   /* In stream mode we have to set control data bits for all vertices
2487    * unless we have disabled control data bits completely (which we do
2488    * do for GL_POINTS outputs that don't use streams).
2489    */
2490   if (gs_compile->control_data_header_size_bits > 0 &&
2491       gs_prog_data->control_data_format ==
2492          GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
2493      set_gs_stream_control_data_bits(vertex_count, stream_id);
2494   }
2495}
2496
2497void
2498fs_visitor::emit_gs_input_load(const fs_reg &dst,
2499                               const nir_src &vertex_src,
2500                               unsigned base_offset,
2501                               const nir_src &offset_src,
2502                               unsigned num_components,
2503                               unsigned first_component)
2504{
2505   assert(type_sz(dst.type) == 4);
2506   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2507   const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
2508
2509   /* TODO: figure out push input layout for invocations == 1 */
2510   if (gs_prog_data->invocations == 1 &&
2511       nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
2512       4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
2513      int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
2514                       nir_src_as_uint(vertex_src) * push_reg_count;
2515      for (unsigned i = 0; i < num_components; i++) {
2516         bld.MOV(offset(dst, bld, i),
2517                 fs_reg(ATTR, imm_offset + i + first_component, dst.type));
2518      }
2519      return;
2520   }
2521
2522   /* Resort to the pull model.  Ensure the VUE handles are provided. */
2523   assert(gs_prog_data->base.include_vue_handles);
2524
2525   unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
2526   fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2527
2528   if (gs_prog_data->invocations == 1) {
2529      if (nir_src_is_const(vertex_src)) {
2530         /* The vertex index is constant; just select the proper URB handle. */
2531         icp_handle =
2532            retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0),
2533                   BRW_REGISTER_TYPE_UD);
2534      } else {
2535         /* The vertex index is non-constant.  We need to use indirect
2536          * addressing to fetch the proper URB handle.
2537          *
2538          * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2539          * indicating that channel <n> should read the handle from
2540          * DWord <n>.  We convert that to bytes by multiplying by 4.
2541          *
2542          * Next, we convert the vertex index to bytes by multiplying
2543          * by 32 (shifting by 5), and add the two together.  This is
2544          * the final indirect byte offset.
2545          */
2546         fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2547         fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2548         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2549         fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2550
2551         /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2552         bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2553         /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2554         bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2555         /* Convert vertex_index to bytes (multiply by 32) */
2556         bld.SHL(vertex_offset_bytes,
2557                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2558                 brw_imm_ud(5u));
2559         bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2560
2561         /* Use first_icp_handle as the base offset.  There is one register
2562          * of URB handles per vertex, so inform the register allocator that
2563          * we might read up to nir->info.gs.vertices_in registers.
2564          */
2565         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2566                  retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2567                  fs_reg(icp_offset_bytes),
2568                  brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
2569      }
2570   } else {
2571      assert(gs_prog_data->invocations > 1);
2572
2573      if (nir_src_is_const(vertex_src)) {
2574         unsigned vertex = nir_src_as_uint(vertex_src);
2575         assert(devinfo->ver >= 9 || vertex <= 5);
2576         bld.MOV(icp_handle,
2577                 retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8),
2578                        BRW_REGISTER_TYPE_UD));
2579      } else {
2580         /* The vertex index is non-constant.  We need to use indirect
2581          * addressing to fetch the proper URB handle.
2582          *
2583          */
2584         fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2585
2586         /* Convert vertex_index to bytes (multiply by 4) */
2587         bld.SHL(icp_offset_bytes,
2588                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2589                 brw_imm_ud(2u));
2590
2591         /* Use first_icp_handle as the base offset.  There is one DWord
2592          * of URB handles per vertex, so inform the register allocator that
2593          * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
2594          */
2595         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2596                  retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2597                  fs_reg(icp_offset_bytes),
2598                  brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
2599                             REG_SIZE));
2600      }
2601   }
2602
2603   fs_inst *inst;
2604   fs_reg indirect_offset = get_nir_src(offset_src);
2605
2606   if (nir_src_is_const(offset_src)) {
2607      /* Constant indexing - use global offset. */
2608      if (first_component != 0) {
2609         unsigned read_components = num_components + first_component;
2610         fs_reg tmp = bld.vgrf(dst.type, read_components);
2611         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2612         inst->size_written = read_components *
2613                              tmp.component_size(inst->exec_size);
2614         for (unsigned i = 0; i < num_components; i++) {
2615            bld.MOV(offset(dst, bld, i),
2616                    offset(tmp, bld, i + first_component));
2617         }
2618      } else {
2619         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2620         inst->size_written = num_components *
2621                              dst.component_size(inst->exec_size);
2622      }
2623      inst->offset = base_offset + nir_src_as_uint(offset_src);
2624      inst->mlen = 1;
2625   } else {
2626      /* Indirect indexing - use per-slot offsets as well. */
2627      const fs_reg srcs[] = { icp_handle, indirect_offset };
2628      unsigned read_components = num_components + first_component;
2629      fs_reg tmp = bld.vgrf(dst.type, read_components);
2630      fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2631      bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2632      if (first_component != 0) {
2633         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2634                         payload);
2635         inst->size_written = read_components *
2636                              tmp.component_size(inst->exec_size);
2637         for (unsigned i = 0; i < num_components; i++) {
2638            bld.MOV(offset(dst, bld, i),
2639                    offset(tmp, bld, i + first_component));
2640         }
2641      } else {
2642         inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload);
2643         inst->size_written = num_components *
2644                              dst.component_size(inst->exec_size);
2645      }
2646      inst->offset = base_offset;
2647      inst->mlen = 2;
2648   }
2649}
2650
2651fs_reg
2652fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2653{
2654   nir_src *offset_src = nir_get_io_offset_src(instr);
2655
2656   if (nir_src_is_const(*offset_src)) {
2657      /* The only constant offset we should find is 0.  brw_nir.c's
2658       * add_const_offset_to_base() will fold other constant offsets
2659       * into instr->const_index[0].
2660       */
2661      assert(nir_src_as_uint(*offset_src) == 0);
2662      return fs_reg();
2663   }
2664
2665   return get_nir_src(*offset_src);
2666}
2667
2668void
2669fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2670                                  nir_intrinsic_instr *instr)
2671{
2672   assert(stage == MESA_SHADER_VERTEX);
2673
2674   fs_reg dest;
2675   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2676      dest = get_nir_dest(instr->dest);
2677
2678   switch (instr->intrinsic) {
2679   case nir_intrinsic_load_vertex_id:
2680   case nir_intrinsic_load_base_vertex:
2681      unreachable("should be lowered by nir_lower_system_values()");
2682
2683   case nir_intrinsic_load_input: {
2684      assert(nir_dest_bit_size(instr->dest) == 32);
2685      fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
2686      src = offset(src, bld, nir_intrinsic_component(instr));
2687      src = offset(src, bld, nir_src_as_uint(instr->src[0]));
2688
2689      for (unsigned i = 0; i < instr->num_components; i++)
2690         bld.MOV(offset(dest, bld, i), offset(src, bld, i));
2691      break;
2692   }
2693
2694   case nir_intrinsic_load_vertex_id_zero_base:
2695   case nir_intrinsic_load_instance_id:
2696   case nir_intrinsic_load_base_instance:
2697   case nir_intrinsic_load_draw_id:
2698   case nir_intrinsic_load_first_vertex:
2699   case nir_intrinsic_load_is_indexed_draw:
2700      unreachable("lowered by brw_nir_lower_vs_inputs");
2701
2702   default:
2703      nir_emit_intrinsic(bld, instr);
2704      break;
2705   }
2706}
2707
2708fs_reg
2709fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld,
2710                                            nir_intrinsic_instr *instr)
2711{
2712   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2713   const nir_src &vertex_src = instr->src[0];
2714   nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src);
2715   fs_reg icp_handle;
2716
2717   if (nir_src_is_const(vertex_src)) {
2718      /* Emit a MOV to resolve <0,1,0> regioning. */
2719      icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2720      unsigned vertex = nir_src_as_uint(vertex_src);
2721      bld.MOV(icp_handle,
2722              retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7),
2723                     BRW_REGISTER_TYPE_UD));
2724   } else if (tcs_prog_data->instances == 1 && vertex_intrin &&
2725              vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) {
2726      /* For the common case of only 1 instance, an array index of
2727       * gl_InvocationID means reading g1.  Skip all the indirect work.
2728       */
2729      icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2730   } else {
2731      /* The vertex index is non-constant.  We need to use indirect
2732       * addressing to fetch the proper URB handle.
2733       */
2734      icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2735
2736      /* Each ICP handle is a single DWord (4 bytes) */
2737      fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2738      bld.SHL(vertex_offset_bytes,
2739              retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2740              brw_imm_ud(2u));
2741
2742      /* Start at g1.  We might read up to 4 registers. */
2743      bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2744               retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
2745               brw_imm_ud(4 * REG_SIZE));
2746   }
2747
2748   return icp_handle;
2749}
2750
2751fs_reg
2752fs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld,
2753                                           nir_intrinsic_instr *instr)
2754{
2755   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2756   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2757   const nir_src &vertex_src = instr->src[0];
2758
2759   unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 3 : 2;
2760
2761   if (nir_src_is_const(vertex_src)) {
2762      return fs_reg(retype(brw_vec8_grf(first_icp_handle +
2763                                        nir_src_as_uint(vertex_src), 0),
2764                           BRW_REGISTER_TYPE_UD));
2765   }
2766
2767   /* The vertex index is non-constant.  We need to use indirect
2768    * addressing to fetch the proper URB handle.
2769    *
2770    * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2771    * indicating that channel <n> should read the handle from
2772    * DWord <n>.  We convert that to bytes by multiplying by 4.
2773    *
2774    * Next, we convert the vertex index to bytes by multiplying
2775    * by 32 (shifting by 5), and add the two together.  This is
2776    * the final indirect byte offset.
2777    */
2778   fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2779   fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2780   fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2781   fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2782   fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2783
2784   /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2785   bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2786   /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2787   bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2788   /* Convert vertex_index to bytes (multiply by 32) */
2789   bld.SHL(vertex_offset_bytes,
2790           retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2791           brw_imm_ud(5u));
2792   bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2793
2794   /* Use first_icp_handle as the base offset.  There is one register
2795    * of URB handles per vertex, so inform the register allocator that
2796    * we might read up to nir->info.gs.vertices_in registers.
2797    */
2798   bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2799            retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
2800            icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE));
2801
2802   return icp_handle;
2803}
2804
2805struct brw_reg
2806fs_visitor::get_tcs_output_urb_handle()
2807{
2808   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
2809
2810   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
2811      return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
2812   } else {
2813      assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
2814      return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
2815   }
2816}
2817
2818void
2819fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
2820                                   nir_intrinsic_instr *instr)
2821{
2822   assert(stage == MESA_SHADER_TESS_CTRL);
2823   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2824   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2825   struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
2826
2827   bool eight_patch =
2828      vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH;
2829
2830   fs_reg dst;
2831   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2832      dst = get_nir_dest(instr->dest);
2833
2834   switch (instr->intrinsic) {
2835   case nir_intrinsic_load_primitive_id:
2836      bld.MOV(dst, fs_reg(eight_patch ? brw_vec8_grf(2, 0)
2837                                      : brw_vec1_grf(0, 1)));
2838      break;
2839   case nir_intrinsic_load_invocation_id:
2840      bld.MOV(retype(dst, invocation_id.type), invocation_id);
2841      break;
2842   case nir_intrinsic_load_patch_vertices_in:
2843      bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
2844              brw_imm_d(tcs_key->input_vertices));
2845      break;
2846
2847   case nir_intrinsic_control_barrier: {
2848      if (tcs_prog_data->instances == 1)
2849         break;
2850
2851      fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2852      fs_reg m0_2 = component(m0, 2);
2853
2854      const fs_builder chanbld = bld.exec_all().group(1, 0);
2855
2856      /* Zero the message header */
2857      bld.exec_all().MOV(m0, brw_imm_ud(0u));
2858
2859      if (devinfo->verx10 >= 125) {
2860         /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */
2861         fs_reg m0_10ub = component(retype(m0, BRW_REGISTER_TYPE_UB), 10);
2862         fs_reg r0_11ub =
2863            stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UB), 11),
2864                   0, 1, 0);
2865         bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub);
2866      } else if (devinfo->ver >= 11) {
2867         chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2868                     brw_imm_ud(INTEL_MASK(30, 24)));
2869
2870         /* Set the Barrier Count and the enable bit */
2871         chanbld.OR(m0_2, m0_2,
2872                    brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
2873      } else {
2874         /* Copy "Barrier ID" from r0.2, bits 16:13 */
2875         chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
2876                     brw_imm_ud(INTEL_MASK(16, 13)));
2877
2878         /* Shift it up to bits 27:24. */
2879         chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
2880
2881         /* Set the Barrier Count and the enable bit */
2882         chanbld.OR(m0_2, m0_2,
2883                    brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
2884      }
2885
2886      bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
2887      break;
2888   }
2889
2890   case nir_intrinsic_load_input:
2891      unreachable("nir_lower_io should never give us these.");
2892      break;
2893
2894   case nir_intrinsic_load_per_vertex_input: {
2895      assert(nir_dest_bit_size(instr->dest) == 32);
2896      fs_reg indirect_offset = get_indirect_offset(instr);
2897      unsigned imm_offset = instr->const_index[0];
2898      fs_inst *inst;
2899
2900      fs_reg icp_handle =
2901         eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr)
2902                     : get_tcs_single_patch_icp_handle(bld, instr);
2903
2904      /* We can only read two double components with each URB read, so
2905       * we send two read messages in that case, each one loading up to
2906       * two double components.
2907       */
2908      unsigned num_components = instr->num_components;
2909      unsigned first_component = nir_intrinsic_component(instr);
2910
2911      if (indirect_offset.file == BAD_FILE) {
2912         /* Constant indexing - use global offset. */
2913         if (first_component != 0) {
2914            unsigned read_components = num_components + first_component;
2915            fs_reg tmp = bld.vgrf(dst.type, read_components);
2916            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
2917            for (unsigned i = 0; i < num_components; i++) {
2918               bld.MOV(offset(dst, bld, i),
2919                       offset(tmp, bld, i + first_component));
2920            }
2921         } else {
2922            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
2923         }
2924         inst->offset = imm_offset;
2925         inst->mlen = 1;
2926      } else {
2927         /* Indirect indexing - use per-slot offsets as well. */
2928         const fs_reg srcs[] = { icp_handle, indirect_offset };
2929         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
2930         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
2931         if (first_component != 0) {
2932            unsigned read_components = num_components + first_component;
2933            fs_reg tmp = bld.vgrf(dst.type, read_components);
2934            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
2935                            payload);
2936            for (unsigned i = 0; i < num_components; i++) {
2937               bld.MOV(offset(dst, bld, i),
2938                       offset(tmp, bld, i + first_component));
2939            }
2940         } else {
2941            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
2942                            payload);
2943         }
2944         inst->offset = imm_offset;
2945         inst->mlen = 2;
2946      }
2947      inst->size_written = (num_components + first_component) *
2948                           inst->dst.component_size(inst->exec_size);
2949
2950      /* Copy the temporary to the destination to deal with writemasking.
2951       *
2952       * Also attempt to deal with gl_PointSize being in the .w component.
2953       */
2954      if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
2955         assert(type_sz(dst.type) == 4);
2956         inst->dst = bld.vgrf(dst.type, 4);
2957         inst->size_written = 4 * REG_SIZE;
2958         bld.MOV(dst, offset(inst->dst, bld, 3));
2959      }
2960      break;
2961   }
2962
2963   case nir_intrinsic_load_output:
2964   case nir_intrinsic_load_per_vertex_output: {
2965      assert(nir_dest_bit_size(instr->dest) == 32);
2966      fs_reg indirect_offset = get_indirect_offset(instr);
2967      unsigned imm_offset = instr->const_index[0];
2968      unsigned first_component = nir_intrinsic_component(instr);
2969
2970      struct brw_reg output_handles = get_tcs_output_urb_handle();
2971
2972      fs_inst *inst;
2973      if (indirect_offset.file == BAD_FILE) {
2974         /* This MOV replicates the output handle to all enabled channels
2975          * is SINGLE_PATCH mode.
2976          */
2977         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2978         bld.MOV(patch_handle, output_handles);
2979
2980         {
2981            if (first_component != 0) {
2982               unsigned read_components =
2983                  instr->num_components + first_component;
2984               fs_reg tmp = bld.vgrf(dst.type, read_components);
2985               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
2986                               patch_handle);
2987               inst->size_written = read_components * REG_SIZE;
2988               for (unsigned i = 0; i < instr->num_components; i++) {
2989                  bld.MOV(offset(dst, bld, i),
2990                          offset(tmp, bld, i + first_component));
2991               }
2992            } else {
2993               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
2994                               patch_handle);
2995               inst->size_written = instr->num_components * REG_SIZE;
2996            }
2997            inst->offset = imm_offset;
2998            inst->mlen = 1;
2999         }
3000      } else {
3001         /* Indirect indexing - use per-slot offsets as well. */
3002         const fs_reg srcs[] = { output_handles, indirect_offset };
3003         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3004         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
3005         if (first_component != 0) {
3006            unsigned read_components =
3007               instr->num_components + first_component;
3008            fs_reg tmp = bld.vgrf(dst.type, read_components);
3009            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
3010                            payload);
3011            inst->size_written = read_components * REG_SIZE;
3012            for (unsigned i = 0; i < instr->num_components; i++) {
3013               bld.MOV(offset(dst, bld, i),
3014                       offset(tmp, bld, i + first_component));
3015            }
3016         } else {
3017            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
3018                            payload);
3019            inst->size_written = instr->num_components * REG_SIZE;
3020         }
3021         inst->offset = imm_offset;
3022         inst->mlen = 2;
3023      }
3024      break;
3025   }
3026
3027   case nir_intrinsic_store_output:
3028   case nir_intrinsic_store_per_vertex_output: {
3029      assert(nir_src_bit_size(instr->src[0]) == 32);
3030      fs_reg value = get_nir_src(instr->src[0]);
3031      fs_reg indirect_offset = get_indirect_offset(instr);
3032      unsigned imm_offset = instr->const_index[0];
3033      unsigned mask = instr->const_index[1];
3034      unsigned header_regs = 0;
3035      struct brw_reg output_handles = get_tcs_output_urb_handle();
3036
3037      fs_reg srcs[7];
3038      srcs[header_regs++] = output_handles;
3039
3040      if (indirect_offset.file != BAD_FILE) {
3041         srcs[header_regs++] = indirect_offset;
3042      }
3043
3044      if (mask == 0)
3045         break;
3046
3047      unsigned num_components = util_last_bit(mask);
3048      enum opcode opcode;
3049
3050      /* We can only pack two 64-bit components in a single message, so send
3051       * 2 messages if we have more components
3052       */
3053      unsigned first_component = nir_intrinsic_component(instr);
3054      mask = mask << first_component;
3055
3056      if (mask != WRITEMASK_XYZW) {
3057         srcs[header_regs++] = brw_imm_ud(mask << 16);
3058         opcode = indirect_offset.file != BAD_FILE ?
3059            SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
3060            SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
3061      } else {
3062         opcode = indirect_offset.file != BAD_FILE ?
3063            SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
3064            SHADER_OPCODE_URB_WRITE_SIMD8;
3065      }
3066
3067      for (unsigned i = 0; i < num_components; i++) {
3068         if (!(mask & (1 << (i + first_component))))
3069            continue;
3070
3071         srcs[header_regs + i + first_component] = offset(value, bld, i);
3072      }
3073
3074      unsigned mlen = header_regs + num_components + first_component;
3075      fs_reg payload =
3076         bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
3077      bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);
3078
3079      fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
3080      inst->offset = imm_offset;
3081      inst->mlen = mlen;
3082      break;
3083   }
3084
3085   default:
3086      nir_emit_intrinsic(bld, instr);
3087      break;
3088   }
3089}
3090
3091void
3092fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
3093                                   nir_intrinsic_instr *instr)
3094{
3095   assert(stage == MESA_SHADER_TESS_EVAL);
3096   struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
3097
3098   fs_reg dest;
3099   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3100      dest = get_nir_dest(instr->dest);
3101
3102   switch (instr->intrinsic) {
3103   case nir_intrinsic_load_primitive_id:
3104      bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
3105      break;
3106   case nir_intrinsic_load_tess_coord:
3107      /* gl_TessCoord is part of the payload in g1-3 */
3108      for (unsigned i = 0; i < 3; i++) {
3109         bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
3110      }
3111      break;
3112
3113   case nir_intrinsic_load_input:
3114   case nir_intrinsic_load_per_vertex_input: {
3115      assert(nir_dest_bit_size(instr->dest) == 32);
3116      fs_reg indirect_offset = get_indirect_offset(instr);
3117      unsigned imm_offset = instr->const_index[0];
3118      unsigned first_component = nir_intrinsic_component(instr);
3119
3120      fs_inst *inst;
3121      if (indirect_offset.file == BAD_FILE) {
3122         /* Arbitrarily only push up to 32 vec4 slots worth of data,
3123          * which is 16 registers (since each holds 2 vec4 slots).
3124          */
3125         const unsigned max_push_slots = 32;
3126         if (imm_offset < max_push_slots) {
3127            fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
3128            for (int i = 0; i < instr->num_components; i++) {
3129               unsigned comp = 4 * (imm_offset % 2) + i + first_component;
3130               bld.MOV(offset(dest, bld, i), component(src, comp));
3131            }
3132
3133            tes_prog_data->base.urb_read_length =
3134               MAX2(tes_prog_data->base.urb_read_length,
3135                    (imm_offset / 2) + 1);
3136         } else {
3137            /* Replicate the patch handle to all enabled channels */
3138            const fs_reg srcs[] = {
3139               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
3140            };
3141            fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
3142            bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
3143
3144            if (first_component != 0) {
3145               unsigned read_components =
3146                  instr->num_components + first_component;
3147               fs_reg tmp = bld.vgrf(dest.type, read_components);
3148               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
3149                               patch_handle);
3150               inst->size_written = read_components * REG_SIZE;
3151               for (unsigned i = 0; i < instr->num_components; i++) {
3152                  bld.MOV(offset(dest, bld, i),
3153                          offset(tmp, bld, i + first_component));
3154               }
3155            } else {
3156               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
3157                               patch_handle);
3158               inst->size_written = instr->num_components * REG_SIZE;
3159            }
3160            inst->mlen = 1;
3161            inst->offset = imm_offset;
3162         }
3163      } else {
3164         /* Indirect indexing - use per-slot offsets as well. */
3165
3166         /* We can only read two double components with each URB read, so
3167          * we send two read messages in that case, each one loading up to
3168          * two double components.
3169          */
3170         unsigned num_components = instr->num_components;
3171         const fs_reg srcs[] = {
3172            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
3173            indirect_offset
3174         };
3175         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3176         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
3177
3178         if (first_component != 0) {
3179            unsigned read_components =
3180                num_components + first_component;
3181            fs_reg tmp = bld.vgrf(dest.type, read_components);
3182            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
3183                            payload);
3184            for (unsigned i = 0; i < num_components; i++) {
3185               bld.MOV(offset(dest, bld, i),
3186                       offset(tmp, bld, i + first_component));
3187            }
3188         } else {
3189            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
3190                            payload);
3191         }
3192         inst->mlen = 2;
3193         inst->offset = imm_offset;
3194         inst->size_written = (num_components + first_component) *
3195                              inst->dst.component_size(inst->exec_size);
3196      }
3197      break;
3198   }
3199   default:
3200      nir_emit_intrinsic(bld, instr);
3201      break;
3202   }
3203}
3204
3205void
3206fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
3207                                  nir_intrinsic_instr *instr)
3208{
3209   assert(stage == MESA_SHADER_GEOMETRY);
3210   fs_reg indirect_offset;
3211
3212   fs_reg dest;
3213   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3214      dest = get_nir_dest(instr->dest);
3215
3216   switch (instr->intrinsic) {
3217   case nir_intrinsic_load_primitive_id:
3218      assert(stage == MESA_SHADER_GEOMETRY);
3219      assert(brw_gs_prog_data(prog_data)->include_primitive_id);
3220      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
3221              retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
3222      break;
3223
3224   case nir_intrinsic_load_input:
3225      unreachable("load_input intrinsics are invalid for the GS stage");
3226
3227   case nir_intrinsic_load_per_vertex_input:
3228      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
3229                         instr->src[1], instr->num_components,
3230                         nir_intrinsic_component(instr));
3231      break;
3232
3233   case nir_intrinsic_emit_vertex_with_counter:
3234      emit_gs_vertex(instr->src[0], instr->const_index[0]);
3235      break;
3236
3237   case nir_intrinsic_end_primitive_with_counter:
3238      emit_gs_end_primitive(instr->src[0]);
3239      break;
3240
3241   case nir_intrinsic_set_vertex_and_primitive_count:
3242      bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
3243      break;
3244
3245   case nir_intrinsic_load_invocation_id: {
3246      fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
3247      assert(val.file != BAD_FILE);
3248      dest.type = val.type;
3249      bld.MOV(dest, val);
3250      break;
3251   }
3252
3253   default:
3254      nir_emit_intrinsic(bld, instr);
3255      break;
3256   }
3257}
3258
3259/**
3260 * Fetch the current render target layer index.
3261 */
3262static fs_reg
3263fetch_render_target_array_index(const fs_builder &bld)
3264{
3265   if (bld.shader->devinfo->ver >= 12) {
3266      /* The render target array index is provided in the thread payload as
3267       * bits 26:16 of r1.1.
3268       */
3269      const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3270      bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3),
3271              brw_imm_uw(0x7ff));
3272      return idx;
3273   } else if (bld.shader->devinfo->ver >= 6) {
3274      /* The render target array index is provided in the thread payload as
3275       * bits 26:16 of r0.0.
3276       */
3277      const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3278      bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
3279              brw_imm_uw(0x7ff));
3280      return idx;
3281   } else {
3282      /* Pre-SNB we only ever render into the first layer of the framebuffer
3283       * since layered rendering is not implemented.
3284       */
3285      return brw_imm_ud(0);
3286   }
3287}
3288
3289/**
3290 * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3291 * framebuffer at the current fragment coordinates and sample index.
3292 */
3293fs_inst *
3294fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
3295                                      unsigned target)
3296{
3297   const struct intel_device_info *devinfo = bld.shader->devinfo;
3298
3299   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3300   const brw_wm_prog_key *wm_key =
3301      reinterpret_cast<const brw_wm_prog_key *>(key);
3302   assert(!wm_key->coherent_fb_fetch);
3303   const struct brw_wm_prog_data *wm_prog_data =
3304      brw_wm_prog_data(stage_prog_data);
3305
3306   /* Calculate the surface index relative to the start of the texture binding
3307    * table block, since that's what the texturing messages expect.
3308    */
3309   const unsigned surface = target +
3310      wm_prog_data->binding_table.render_target_read_start -
3311      wm_prog_data->base.binding_table.texture_start;
3312
3313   /* Calculate the fragment coordinates. */
3314   const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
3315   bld.MOV(offset(coords, bld, 0), pixel_x);
3316   bld.MOV(offset(coords, bld, 1), pixel_y);
3317   bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3318
3319   /* Calculate the sample index and MCS payload when multisampling.  Luckily
3320    * the MCS fetch message behaves deterministically for UMS surfaces, so it
3321    * shouldn't be necessary to recompile based on whether the framebuffer is
3322    * CMS or UMS.
3323    */
3324   if (wm_key->multisample_fbo &&
3325       nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3326      nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
3327
3328   const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
3329   const fs_reg mcs = wm_key->multisample_fbo ?
3330      emit_mcs_fetch(coords, 3, brw_imm_ud(surface), fs_reg()) : fs_reg();
3331
3332   /* Use either a normal or a CMS texel fetch message depending on whether
3333    * the framebuffer is single or multisample.  On SKL+ use the wide CMS
3334    * message just in case the framebuffer uses 16x multisampling, it should
3335    * be equivalent to the normal CMS fetch for lower multisampling modes.
3336    */
3337   const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
3338                     devinfo->ver >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
3339                     SHADER_OPCODE_TXF_CMS_LOGICAL;
3340
3341   /* Emit the instruction. */
3342   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3343   srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
3344   srcs[TEX_LOGICAL_SRC_LOD]              = brw_imm_ud(0);
3345   srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
3346   srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
3347   srcs[TEX_LOGICAL_SRC_SURFACE]          = brw_imm_ud(surface);
3348   srcs[TEX_LOGICAL_SRC_SAMPLER]          = brw_imm_ud(0);
3349   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
3350   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS]  = brw_imm_ud(0);
3351
3352   fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3353   inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3354
3355   return inst;
3356}
3357
3358/**
3359 * Actual coherent framebuffer read implemented using the native render target
3360 * read message.  Requires SKL+.
3361 */
3362static fs_inst *
3363emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3364{
3365   assert(bld.shader->devinfo->ver >= 9);
3366   fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3367   inst->target = target;
3368   inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3369
3370   return inst;
3371}
3372
3373static fs_reg
3374alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3375{
3376   if (n && regs[0].file != BAD_FILE) {
3377      return regs[0];
3378
3379   } else {
3380      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3381
3382      for (unsigned i = 0; i < n; i++)
3383         regs[i] = tmp;
3384
3385      return tmp;
3386   }
3387}
3388
3389static fs_reg
3390alloc_frag_output(fs_visitor *v, unsigned location)
3391{
3392   assert(v->stage == MESA_SHADER_FRAGMENT);
3393   const brw_wm_prog_key *const key =
3394      reinterpret_cast<const brw_wm_prog_key *>(v->key);
3395   const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3396   const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3397
3398   if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3399      return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3400
3401   else if (l == FRAG_RESULT_COLOR)
3402      return alloc_temporary(v->bld, 4, v->outputs,
3403                             MAX2(key->nr_color_regions, 1));
3404
3405   else if (l == FRAG_RESULT_DEPTH)
3406      return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3407
3408   else if (l == FRAG_RESULT_STENCIL)
3409      return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3410
3411   else if (l == FRAG_RESULT_SAMPLE_MASK)
3412      return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3413
3414   else if (l >= FRAG_RESULT_DATA0 &&
3415            l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3416      return alloc_temporary(v->bld, 4,
3417                             &v->outputs[l - FRAG_RESULT_DATA0], 1);
3418
3419   else
3420      unreachable("Invalid location");
3421}
3422
3423void
3424fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3425                                  nir_intrinsic_instr *instr)
3426{
3427   assert(stage == MESA_SHADER_FRAGMENT);
3428
3429   fs_reg dest;
3430   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3431      dest = get_nir_dest(instr->dest);
3432
3433   switch (instr->intrinsic) {
3434   case nir_intrinsic_load_front_face:
3435      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3436              *emit_frontfacing_interpolation());
3437      break;
3438
3439   case nir_intrinsic_load_sample_pos: {
3440      fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3441      assert(sample_pos.file != BAD_FILE);
3442      dest.type = sample_pos.type;
3443      bld.MOV(dest, sample_pos);
3444      bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3445      break;
3446   }
3447
3448   case nir_intrinsic_load_layer_id:
3449      dest.type = BRW_REGISTER_TYPE_UD;
3450      bld.MOV(dest, fetch_render_target_array_index(bld));
3451      break;
3452
3453   case nir_intrinsic_is_helper_invocation: {
3454      /* Unlike the regular gl_HelperInvocation, that is defined at dispatch,
3455       * the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes into
3456       * consideration demoted invocations.  That information is stored in
3457       * f0.1.
3458       */
3459      dest.type = BRW_REGISTER_TYPE_UD;
3460
3461      bld.MOV(dest, brw_imm_ud(0));
3462
3463      fs_inst *mov = bld.MOV(dest, brw_imm_ud(~0));
3464      mov->predicate = BRW_PREDICATE_NORMAL;
3465      mov->predicate_inverse = true;
3466      mov->flag_subreg = sample_mask_flag_subreg(this);
3467      break;
3468   }
3469
3470   case nir_intrinsic_load_helper_invocation:
3471   case nir_intrinsic_load_sample_mask_in:
3472   case nir_intrinsic_load_sample_id:
3473   case nir_intrinsic_load_frag_shading_rate: {
3474      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3475      fs_reg val = nir_system_values[sv];
3476      assert(val.file != BAD_FILE);
3477      dest.type = val.type;
3478      bld.MOV(dest, val);
3479      break;
3480   }
3481
3482   case nir_intrinsic_store_output: {
3483      const fs_reg src = get_nir_src(instr->src[0]);
3484      const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3485      const unsigned location = nir_intrinsic_base(instr) +
3486         SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
3487      const fs_reg new_dest = retype(alloc_frag_output(this, location),
3488                                     src.type);
3489
3490      for (unsigned j = 0; j < instr->num_components; j++)
3491         bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3492                 offset(src, bld, j));
3493
3494      break;
3495   }
3496
3497   case nir_intrinsic_load_output: {
3498      const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3499                                   BRW_NIR_FRAG_OUTPUT_LOCATION);
3500      assert(l >= FRAG_RESULT_DATA0);
3501      const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3502      const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3503      const fs_reg tmp = bld.vgrf(dest.type, 4);
3504
3505      if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3506         emit_coherent_fb_read(bld, tmp, target);
3507      else
3508         emit_non_coherent_fb_read(bld, tmp, target);
3509
3510      for (unsigned j = 0; j < instr->num_components; j++) {
3511         bld.MOV(offset(dest, bld, j),
3512                 offset(tmp, bld, nir_intrinsic_component(instr) + j));
3513      }
3514
3515      break;
3516   }
3517
3518   case nir_intrinsic_demote:
3519   case nir_intrinsic_discard:
3520   case nir_intrinsic_terminate:
3521   case nir_intrinsic_demote_if:
3522   case nir_intrinsic_discard_if:
3523   case nir_intrinsic_terminate_if: {
3524      /* We track our discarded pixels in f0.1/f1.0.  By predicating on it, we
3525       * can update just the flag bits that aren't yet discarded.  If there's
3526       * no condition, we emit a CMP of g0 != g0, so all currently executing
3527       * channels will get turned off.
3528       */
3529      fs_inst *cmp = NULL;
3530      if (instr->intrinsic == nir_intrinsic_demote_if ||
3531          instr->intrinsic == nir_intrinsic_discard_if ||
3532          instr->intrinsic == nir_intrinsic_terminate_if) {
3533         nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]);
3534
3535         if (alu != NULL &&
3536             alu->op != nir_op_bcsel &&
3537             (devinfo->ver > 5 ||
3538              (alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE ||
3539              alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 ||
3540              alu->op == nir_op_flt32 || alu->op == nir_op_fge32 ||
3541              alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 ||
3542              alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 ||
3543              alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) {
3544            /* Re-emit the instruction that generated the Boolean value, but
3545             * do not store it.  Since this instruction will be conditional,
3546             * other instructions that want to use the real Boolean value may
3547             * get garbage.  This was a problem for piglit's fs-discard-exit-2
3548             * test.
3549             *
3550             * Ideally we'd detect that the instruction cannot have a
3551             * conditional modifier before emitting the instructions.  Alas,
3552             * that is nigh impossible.  Instead, we're going to assume the
3553             * instruction (or last instruction) generated can have a
3554             * conditional modifier.  If it cannot, fallback to the old-style
3555             * compare, and hope dead code elimination will clean up the
3556             * extra instructions generated.
3557             */
3558            nir_emit_alu(bld, alu, false);
3559
3560            cmp = (fs_inst *) instructions.get_tail();
3561            if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) {
3562               if (cmp->can_do_cmod())
3563                  cmp->conditional_mod = BRW_CONDITIONAL_Z;
3564               else
3565                  cmp = NULL;
3566            } else {
3567               /* The old sequence that would have been generated is,
3568                * basically, bool_result == false.  This is equivalent to
3569                * !bool_result, so negate the old modifier.
3570                */
3571               cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
3572            }
3573         }
3574
3575         if (cmp == NULL) {
3576            cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3577                          brw_imm_d(0), BRW_CONDITIONAL_Z);
3578         }
3579      } else {
3580         fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3581                                       BRW_REGISTER_TYPE_UW));
3582         cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3583      }
3584
3585      cmp->predicate = BRW_PREDICATE_NORMAL;
3586      cmp->flag_subreg = sample_mask_flag_subreg(this);
3587
3588      fs_inst *jump = bld.emit(BRW_OPCODE_HALT);
3589      jump->flag_subreg = sample_mask_flag_subreg(this);
3590      jump->predicate_inverse = true;
3591
3592      if (instr->intrinsic == nir_intrinsic_terminate ||
3593          instr->intrinsic == nir_intrinsic_terminate_if) {
3594         jump->predicate = BRW_PREDICATE_NORMAL;
3595      } else {
3596         /* Only jump when the whole quad is demoted.  For historical
3597          * reasons this is also used for discard.
3598          */
3599         jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
3600      }
3601
3602      if (devinfo->ver < 7)
3603         limit_dispatch_width(
3604            16, "Fragment discard/demote not implemented in SIMD32 mode.\n");
3605      break;
3606   }
3607
3608   case nir_intrinsic_load_input: {
3609      /* load_input is only used for flat inputs */
3610      assert(nir_dest_bit_size(instr->dest) == 32);
3611      unsigned base = nir_intrinsic_base(instr);
3612      unsigned comp = nir_intrinsic_component(instr);
3613      unsigned num_components = instr->num_components;
3614
3615      /* Special case fields in the VUE header */
3616      if (base == VARYING_SLOT_LAYER)
3617         comp = 1;
3618      else if (base == VARYING_SLOT_VIEWPORT)
3619         comp = 2;
3620
3621      for (unsigned int i = 0; i < num_components; i++) {
3622         bld.MOV(offset(dest, bld, i),
3623                 retype(component(interp_reg(base, comp + i), 3), dest.type));
3624      }
3625      break;
3626   }
3627
3628   case nir_intrinsic_load_fs_input_interp_deltas: {
3629      assert(stage == MESA_SHADER_FRAGMENT);
3630      assert(nir_src_as_uint(instr->src[0]) == 0);
3631      fs_reg interp = interp_reg(nir_intrinsic_base(instr),
3632                                 nir_intrinsic_component(instr));
3633      dest.type = BRW_REGISTER_TYPE_F;
3634      bld.MOV(offset(dest, bld, 0), component(interp, 3));
3635      bld.MOV(offset(dest, bld, 1), component(interp, 1));
3636      bld.MOV(offset(dest, bld, 2), component(interp, 0));
3637      break;
3638   }
3639
3640   case nir_intrinsic_load_barycentric_pixel:
3641   case nir_intrinsic_load_barycentric_centroid:
3642   case nir_intrinsic_load_barycentric_sample: {
3643      /* Use the delta_xy values computed from the payload */
3644      const glsl_interp_mode interp_mode =
3645         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3646      enum brw_barycentric_mode bary =
3647         brw_barycentric_mode(interp_mode, instr->intrinsic);
3648      const fs_reg srcs[] = { offset(this->delta_xy[bary], bld, 0),
3649                              offset(this->delta_xy[bary], bld, 1) };
3650      bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3651      break;
3652   }
3653
3654   case nir_intrinsic_load_barycentric_at_sample: {
3655      const glsl_interp_mode interpolation =
3656         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3657
3658      if (nir_src_is_const(instr->src[0])) {
3659         unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;
3660
3661         emit_pixel_interpolater_send(bld,
3662                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3663                                      dest,
3664                                      fs_reg(), /* src */
3665                                      brw_imm_ud(msg_data),
3666                                      interpolation);
3667      } else {
3668         const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3669                                          BRW_REGISTER_TYPE_UD);
3670
3671         if (nir_src_is_dynamically_uniform(instr->src[0])) {
3672            const fs_reg sample_id = bld.emit_uniformize(sample_src);
3673            const fs_reg msg_data = vgrf(glsl_type::uint_type);
3674            bld.exec_all().group(1, 0)
3675               .SHL(msg_data, sample_id, brw_imm_ud(4u));
3676            emit_pixel_interpolater_send(bld,
3677                                         FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3678                                         dest,
3679                                         fs_reg(), /* src */
3680                                         component(msg_data, 0),
3681                                         interpolation);
3682         } else {
3683            /* Make a loop that sends a message to the pixel interpolater
3684             * for the sample number in each live channel. If there are
3685             * multiple channels with the same sample number then these
3686             * will be handled simultaneously with a single interation of
3687             * the loop.
3688             */
3689            bld.emit(BRW_OPCODE_DO);
3690
3691            /* Get the next live sample number into sample_id_reg */
3692            const fs_reg sample_id = bld.emit_uniformize(sample_src);
3693
3694            /* Set the flag register so that we can perform the send
3695             * message on all channels that have the same sample number
3696             */
3697            bld.CMP(bld.null_reg_ud(),
3698                    sample_src, sample_id,
3699                    BRW_CONDITIONAL_EQ);
3700            const fs_reg msg_data = vgrf(glsl_type::uint_type);
3701            bld.exec_all().group(1, 0)
3702               .SHL(msg_data, sample_id, brw_imm_ud(4u));
3703            fs_inst *inst =
3704               emit_pixel_interpolater_send(bld,
3705                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3706                                            dest,
3707                                            fs_reg(), /* src */
3708                                            component(msg_data, 0),
3709                                            interpolation);
3710            set_predicate(BRW_PREDICATE_NORMAL, inst);
3711
3712            /* Continue the loop if there are any live channels left */
3713            set_predicate_inv(BRW_PREDICATE_NORMAL,
3714                              true, /* inverse */
3715                              bld.emit(BRW_OPCODE_WHILE));
3716         }
3717      }
3718      break;
3719   }
3720
3721   case nir_intrinsic_load_barycentric_at_offset: {
3722      const glsl_interp_mode interpolation =
3723         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3724
3725      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3726
3727      if (const_offset) {
3728         assert(nir_src_bit_size(instr->src[0]) == 32);
3729         unsigned off_x = const_offset[0].u32 & 0xf;
3730         unsigned off_y = const_offset[1].u32 & 0xf;
3731
3732         emit_pixel_interpolater_send(bld,
3733                                      FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3734                                      dest,
3735                                      fs_reg(), /* src */
3736                                      brw_imm_ud(off_x | (off_y << 4)),
3737                                      interpolation);
3738      } else {
3739         fs_reg src = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_D);
3740         const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3741         emit_pixel_interpolater_send(bld,
3742                                      opcode,
3743                                      dest,
3744                                      src,
3745                                      brw_imm_ud(0u),
3746                                      interpolation);
3747      }
3748      break;
3749   }
3750
3751   case nir_intrinsic_load_frag_coord:
3752      emit_fragcoord_interpolation(dest);
3753      break;
3754
3755   case nir_intrinsic_load_interpolated_input: {
3756      assert(instr->src[0].ssa &&
3757             instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3758      nir_intrinsic_instr *bary_intrinsic =
3759         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3760      nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3761      enum glsl_interp_mode interp_mode =
3762         (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3763      fs_reg dst_xy;
3764
3765      if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3766          bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3767         /* Use the result of the PI message. */
3768         dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3769      } else {
3770         /* Use the delta_xy values computed from the payload */
3771         enum brw_barycentric_mode bary =
3772            brw_barycentric_mode(interp_mode, bary_intrin);
3773         dst_xy = this->delta_xy[bary];
3774      }
3775
3776      for (unsigned int i = 0; i < instr->num_components; i++) {
3777         fs_reg interp =
3778            component(interp_reg(nir_intrinsic_base(instr),
3779                                 nir_intrinsic_component(instr) + i), 0);
3780         interp.type = BRW_REGISTER_TYPE_F;
3781         dest.type = BRW_REGISTER_TYPE_F;
3782
3783         if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3784            fs_reg tmp = vgrf(glsl_type::float_type);
3785            bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3786            bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3787         } else {
3788            bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3789         }
3790      }
3791      break;
3792   }
3793
3794   default:
3795      nir_emit_intrinsic(bld, instr);
3796      break;
3797   }
3798}
3799
3800void
3801fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3802                                  nir_intrinsic_instr *instr)
3803{
3804   assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
3805   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3806
3807   fs_reg dest;
3808   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3809      dest = get_nir_dest(instr->dest);
3810
3811   switch (instr->intrinsic) {
3812   case nir_intrinsic_control_barrier:
3813      /* The whole workgroup fits in a single HW thread, so all the
3814       * invocations are already executed lock-step.  Instead of an actual
3815       * barrier just emit a scheduling fence, that will generate no code.
3816       */
3817      if (!nir->info.workgroup_size_variable &&
3818          workgroup_size() <= dispatch_width) {
3819         bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE);
3820         break;
3821      }
3822
3823      emit_barrier();
3824      cs_prog_data->uses_barrier = true;
3825      break;
3826
3827   case nir_intrinsic_load_subgroup_id:
3828      if (devinfo->verx10 >= 125)
3829         bld.AND(retype(dest, BRW_REGISTER_TYPE_UD),
3830                 retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
3831                 brw_imm_ud(INTEL_MASK(7, 0)));
3832      else
3833         bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);
3834      break;
3835
3836   case nir_intrinsic_load_local_invocation_id:
3837   case nir_intrinsic_load_workgroup_id: {
3838      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3839      fs_reg val = nir_system_values[sv];
3840      assert(val.file != BAD_FILE);
3841      dest.type = val.type;
3842      for (unsigned i = 0; i < 3; i++)
3843         bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3844      break;
3845   }
3846
3847   case nir_intrinsic_load_num_workgroups: {
3848      assert(nir_dest_bit_size(instr->dest) == 32);
3849      const unsigned surface =
3850         cs_prog_data->binding_table.work_groups_start;
3851
3852      cs_prog_data->uses_num_work_groups = true;
3853
3854      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3855      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface);
3856      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3857      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */
3858      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0);
3859      srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
3860      fs_inst *inst =
3861         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3862                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3863      inst->size_written = 3 * dispatch_width * 4;
3864      break;
3865   }
3866
3867   case nir_intrinsic_shared_atomic_add:
3868   case nir_intrinsic_shared_atomic_imin:
3869   case nir_intrinsic_shared_atomic_umin:
3870   case nir_intrinsic_shared_atomic_imax:
3871   case nir_intrinsic_shared_atomic_umax:
3872   case nir_intrinsic_shared_atomic_and:
3873   case nir_intrinsic_shared_atomic_or:
3874   case nir_intrinsic_shared_atomic_xor:
3875   case nir_intrinsic_shared_atomic_exchange:
3876   case nir_intrinsic_shared_atomic_comp_swap:
3877      nir_emit_shared_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
3878      break;
3879   case nir_intrinsic_shared_atomic_fmin:
3880   case nir_intrinsic_shared_atomic_fmax:
3881   case nir_intrinsic_shared_atomic_fcomp_swap:
3882      nir_emit_shared_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
3883      break;
3884
3885   case nir_intrinsic_load_shared: {
3886      assert(devinfo->ver >= 7);
3887      assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
3888
3889      const unsigned bit_size = nir_dest_bit_size(instr->dest);
3890      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3891      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
3892      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]);
3893      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3894      srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
3895
3896      /* Make dest unsigned because that's what the temporary will be */
3897      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3898
3899      /* Read the vector */
3900      assert(nir_dest_bit_size(instr->dest) <= 32);
3901      assert(nir_intrinsic_align(instr) > 0);
3902      if (nir_dest_bit_size(instr->dest) == 32 &&
3903          nir_intrinsic_align(instr) >= 4) {
3904         assert(nir_dest_num_components(instr->dest) <= 4);
3905         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3906         fs_inst *inst =
3907            bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3908                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3909         inst->size_written = instr->num_components * dispatch_width * 4;
3910      } else {
3911         assert(nir_dest_num_components(instr->dest) == 1);
3912         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3913
3914         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
3915         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
3916                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
3917         bld.MOV(dest, subscript(read_result, dest.type, 0));
3918      }
3919      break;
3920   }
3921
3922   case nir_intrinsic_store_shared: {
3923      assert(devinfo->ver >= 7);
3924      assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
3925
3926      const unsigned bit_size = nir_src_bit_size(instr->src[0]);
3927      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3928      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
3929      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3930      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3931      /* No point in masking with sample mask, here we're handling compute
3932       * intrinsics.
3933       */
3934      srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
3935
3936      fs_reg data = get_nir_src(instr->src[0]);
3937      data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3938
3939      assert(nir_src_bit_size(instr->src[0]) <= 32);
3940      assert(nir_intrinsic_write_mask(instr) ==
3941             (1u << instr->num_components) - 1);
3942      assert(nir_intrinsic_align(instr) > 0);
3943      if (nir_src_bit_size(instr->src[0]) == 32 &&
3944          nir_intrinsic_align(instr) >= 4) {
3945         assert(nir_src_num_components(instr->src[0]) <= 4);
3946         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
3947         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3948         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
3949                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3950      } else {
3951         assert(nir_src_num_components(instr->src[0]) == 1);
3952         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3953
3954         srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
3955         bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
3956
3957         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
3958                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3959      }
3960      break;
3961   }
3962
3963   case nir_intrinsic_load_workgroup_size: {
3964      assert(compiler->lower_variable_group_size);
3965      assert(nir->info.workgroup_size_variable);
3966      for (unsigned i = 0; i < 3; i++) {
3967         bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
3968            group_size[i]);
3969      }
3970      break;
3971   }
3972
3973   default:
3974      nir_emit_intrinsic(bld, instr);
3975      break;
3976   }
3977}
3978
3979void
3980fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld,
3981                                  nir_intrinsic_instr *instr)
3982{
3983   assert(brw_shader_stage_is_bindless(stage));
3984
3985   fs_reg dest;
3986   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3987      dest = get_nir_dest(instr->dest);
3988
3989   switch (instr->intrinsic) {
3990   case nir_intrinsic_load_btd_global_arg_addr_intel:
3991      bld.MOV(dest, retype(brw_vec1_grf(2, 0), dest.type));
3992      break;
3993
3994   case nir_intrinsic_load_btd_local_arg_addr_intel:
3995      bld.MOV(dest, retype(brw_vec1_grf(2, 2), dest.type));
3996      break;
3997
3998   case nir_intrinsic_trace_ray_initial_intel:
3999      bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
4000               bld.null_reg_ud(),
4001               brw_imm_ud(BRW_RT_BVH_LEVEL_WORLD),
4002               brw_imm_ud(GEN_RT_TRACE_RAY_INITAL));
4003      break;
4004
4005   case nir_intrinsic_trace_ray_commit_intel:
4006      bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
4007               bld.null_reg_ud(),
4008               brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),
4009               brw_imm_ud(GEN_RT_TRACE_RAY_COMMIT));
4010      break;
4011
4012   case nir_intrinsic_trace_ray_continue_intel:
4013      bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL,
4014               bld.null_reg_ud(),
4015               brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT),
4016               brw_imm_ud(GEN_RT_TRACE_RAY_CONTINUE));
4017      break;
4018
4019   default:
4020      nir_emit_intrinsic(bld, instr);
4021      break;
4022   }
4023}
4024
4025static fs_reg
4026brw_nir_reduction_op_identity(const fs_builder &bld,
4027                              nir_op op, brw_reg_type type)
4028{
4029   nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
4030   switch (type_sz(type)) {
4031   case 1:
4032      if (type == BRW_REGISTER_TYPE_UB) {
4033         return brw_imm_uw(value.u8);
4034      } else {
4035         assert(type == BRW_REGISTER_TYPE_B);
4036         return brw_imm_w(value.i8);
4037      }
4038   case 2:
4039      return retype(brw_imm_uw(value.u16), type);
4040   case 4:
4041      return retype(brw_imm_ud(value.u32), type);
4042   case 8:
4043      if (type == BRW_REGISTER_TYPE_DF)
4044         return setup_imm_df(bld, value.f64);
4045      else
4046         return retype(brw_imm_u64(value.u64), type);
4047   default:
4048      unreachable("Invalid type size");
4049   }
4050}
4051
4052static opcode
4053brw_op_for_nir_reduction_op(nir_op op)
4054{
4055   switch (op) {
4056   case nir_op_iadd: return BRW_OPCODE_ADD;
4057   case nir_op_fadd: return BRW_OPCODE_ADD;
4058   case nir_op_imul: return BRW_OPCODE_MUL;
4059   case nir_op_fmul: return BRW_OPCODE_MUL;
4060   case nir_op_imin: return BRW_OPCODE_SEL;
4061   case nir_op_umin: return BRW_OPCODE_SEL;
4062   case nir_op_fmin: return BRW_OPCODE_SEL;
4063   case nir_op_imax: return BRW_OPCODE_SEL;
4064   case nir_op_umax: return BRW_OPCODE_SEL;
4065   case nir_op_fmax: return BRW_OPCODE_SEL;
4066   case nir_op_iand: return BRW_OPCODE_AND;
4067   case nir_op_ior:  return BRW_OPCODE_OR;
4068   case nir_op_ixor: return BRW_OPCODE_XOR;
4069   default:
4070      unreachable("Invalid reduction operation");
4071   }
4072}
4073
4074static brw_conditional_mod
4075brw_cond_mod_for_nir_reduction_op(nir_op op)
4076{
4077   switch (op) {
4078   case nir_op_iadd: return BRW_CONDITIONAL_NONE;
4079   case nir_op_fadd: return BRW_CONDITIONAL_NONE;
4080   case nir_op_imul: return BRW_CONDITIONAL_NONE;
4081   case nir_op_fmul: return BRW_CONDITIONAL_NONE;
4082   case nir_op_imin: return BRW_CONDITIONAL_L;
4083   case nir_op_umin: return BRW_CONDITIONAL_L;
4084   case nir_op_fmin: return BRW_CONDITIONAL_L;
4085   case nir_op_imax: return BRW_CONDITIONAL_GE;
4086   case nir_op_umax: return BRW_CONDITIONAL_GE;
4087   case nir_op_fmax: return BRW_CONDITIONAL_GE;
4088   case nir_op_iand: return BRW_CONDITIONAL_NONE;
4089   case nir_op_ior:  return BRW_CONDITIONAL_NONE;
4090   case nir_op_ixor: return BRW_CONDITIONAL_NONE;
4091   default:
4092      unreachable("Invalid reduction operation");
4093   }
4094}
4095
4096fs_reg
4097fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld,
4098                                          nir_intrinsic_instr *instr)
4099{
4100   fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD);
4101   fs_reg surf_index = image;
4102
4103   if (stage_prog_data->binding_table.image_start > 0) {
4104      if (image.file == BRW_IMMEDIATE_VALUE) {
4105         surf_index =
4106            brw_imm_ud(image.d + stage_prog_data->binding_table.image_start);
4107      } else {
4108         surf_index = vgrf(glsl_type::uint_type);
4109         bld.ADD(surf_index, image,
4110                 brw_imm_d(stage_prog_data->binding_table.image_start));
4111      }
4112   }
4113
4114   return bld.emit_uniformize(surf_index);
4115}
4116
4117fs_reg
4118fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
4119                                         nir_intrinsic_instr *instr)
4120{
4121   /* SSBO stores are weird in that their index is in src[1] */
4122   const bool is_store =
4123      instr->intrinsic == nir_intrinsic_store_ssbo ||
4124      instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
4125   const unsigned src = is_store ? 1 : 0;
4126
4127   if (nir_src_is_const(instr->src[src])) {
4128      unsigned index = stage_prog_data->binding_table.ssbo_start +
4129                       nir_src_as_uint(instr->src[src]);
4130      return brw_imm_ud(index);
4131   } else {
4132      fs_reg surf_index = vgrf(glsl_type::uint_type);
4133      bld.ADD(surf_index, get_nir_src(instr->src[src]),
4134              brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4135      return bld.emit_uniformize(surf_index);
4136   }
4137}
4138
4139/**
4140 * The offsets we get from NIR act as if each SIMD channel has it's own blob
4141 * of contiguous space.  However, if we actually place each SIMD channel in
4142 * it's own space, we end up with terrible cache performance because each SIMD
4143 * channel accesses a different cache line even when they're all accessing the
4144 * same byte offset.  To deal with this problem, we swizzle the address using
4145 * a simple algorithm which ensures that any time a SIMD message reads or
4146 * writes the same address, it's all in the same cache line.  We have to keep
4147 * the bottom two bits fixed so that we can read/write up to a dword at a time
4148 * and the individual element is contiguous.  We do this by splitting the
4149 * address as follows:
4150 *
4151 *    31                             4-6           2          0
4152 *    +-------------------------------+------------+----------+
4153 *    |        Hi address bits        | chan index | addr low |
4154 *    +-------------------------------+------------+----------+
4155 *
4156 * In other words, the bottom two address bits stay, and the top 30 get
4157 * shifted up so that we can stick the SIMD channel index in the middle.  This
4158 * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit
4159 * at the same logical offset, the scratch read/write instruction acts on
4160 * continuous elements and we get good cache locality.
4161 */
4162fs_reg
4163fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld,
4164                                     const fs_reg &nir_addr,
4165                                     bool in_dwords)
4166{
4167   const fs_reg &chan_index =
4168      nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
4169   const unsigned chan_index_bits = ffs(dispatch_width) - 1;
4170
4171   fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD);
4172   if (in_dwords) {
4173      /* In this case, we know the address is aligned to a DWORD and we want
4174       * the final address in DWORDs.
4175       */
4176      bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2));
4177      bld.OR(addr, addr, chan_index);
4178   } else {
4179      /* This case substantially more annoying because we have to pay
4180       * attention to those pesky two bottom bits.
4181       */
4182      fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD);
4183      bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u));
4184      bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits));
4185      fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD);
4186      bld.SHL(chan_addr, chan_index, brw_imm_ud(2));
4187      bld.AND(addr, nir_addr, brw_imm_ud(0x3u));
4188      bld.OR(addr, addr, addr_hi);
4189      bld.OR(addr, addr, chan_addr);
4190   }
4191   return addr;
4192}
4193
/**
 * Choose a block size, in dwords, for a transfer of \p dwords dwords.
 *
 * Returns 32 when at least 32 dwords are requested, 16 when at least 16 are
 * requested, and 8 otherwise.  The chosen block must not exceed \p dwords,
 * so callers have to pass at least 8; this is asserted.
 */
static unsigned
choose_oword_block_size_dwords(unsigned dwords)
{
   const unsigned block = dwords >= 32 ? 32 :
                          dwords >= 16 ? 16 :
                                         8;
   assert(block <= dwords);
   return block;
}
4208
4209static void
4210increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v)
4211{
4212   if (bld.shader->devinfo->has_64bit_int) {
4213      bld.ADD(address, address, brw_imm_ud(v));
4214   } else {
4215      fs_reg low = retype(address, BRW_REGISTER_TYPE_UD);
4216      fs_reg high = offset(low, bld, 1);
4217
4218      /* Add low and if that overflows, add carry to high. */
4219      bld.ADD(low, low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O;
4220      bld.ADD(high, high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL;
4221   }
4222}
4223
4224static fs_reg
4225emit_fence(const fs_builder &bld, enum opcode opcode,
4226           uint8_t sfid, bool commit_enable, uint8_t bti)
4227{
4228   assert(opcode == SHADER_OPCODE_INTERLOCK ||
4229          opcode == SHADER_OPCODE_MEMORY_FENCE);
4230
4231   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
4232   fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0),
4233                             brw_imm_ud(commit_enable),
4234                             brw_imm_ud(bti));
4235   fence->sfid = sfid;
4236   return dst;
4237}
4238
4239void
4240fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
4241{
4242   fs_reg dest;
4243   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
4244      dest = get_nir_dest(instr->dest);
4245
4246   switch (instr->intrinsic) {
4247   case nir_intrinsic_image_load:
4248   case nir_intrinsic_image_store:
4249   case nir_intrinsic_image_atomic_add:
4250   case nir_intrinsic_image_atomic_imin:
4251   case nir_intrinsic_image_atomic_umin:
4252   case nir_intrinsic_image_atomic_imax:
4253   case nir_intrinsic_image_atomic_umax:
4254   case nir_intrinsic_image_atomic_and:
4255   case nir_intrinsic_image_atomic_or:
4256   case nir_intrinsic_image_atomic_xor:
4257   case nir_intrinsic_image_atomic_exchange:
4258   case nir_intrinsic_image_atomic_comp_swap:
4259   case nir_intrinsic_bindless_image_load:
4260   case nir_intrinsic_bindless_image_store:
4261   case nir_intrinsic_bindless_image_atomic_add:
4262   case nir_intrinsic_bindless_image_atomic_imin:
4263   case nir_intrinsic_bindless_image_atomic_umin:
4264   case nir_intrinsic_bindless_image_atomic_imax:
4265   case nir_intrinsic_bindless_image_atomic_umax:
4266   case nir_intrinsic_bindless_image_atomic_and:
4267   case nir_intrinsic_bindless_image_atomic_or:
4268   case nir_intrinsic_bindless_image_atomic_xor:
4269   case nir_intrinsic_bindless_image_atomic_exchange:
4270   case nir_intrinsic_bindless_image_atomic_comp_swap: {
4271      /* Get some metadata from the image intrinsic. */
4272      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
4273
4274      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4275
4276      switch (instr->intrinsic) {
4277      case nir_intrinsic_image_load:
4278      case nir_intrinsic_image_store:
4279      case nir_intrinsic_image_atomic_add:
4280      case nir_intrinsic_image_atomic_imin:
4281      case nir_intrinsic_image_atomic_umin:
4282      case nir_intrinsic_image_atomic_imax:
4283      case nir_intrinsic_image_atomic_umax:
4284      case nir_intrinsic_image_atomic_and:
4285      case nir_intrinsic_image_atomic_or:
4286      case nir_intrinsic_image_atomic_xor:
4287      case nir_intrinsic_image_atomic_exchange:
4288      case nir_intrinsic_image_atomic_comp_swap:
4289         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4290            get_nir_image_intrinsic_image(bld, instr);
4291         break;
4292
4293      default:
4294         /* Bindless */
4295         srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
4296            bld.emit_uniformize(get_nir_src(instr->src[0]));
4297         break;
4298      }
4299
4300      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4301      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
4302         brw_imm_ud(nir_image_intrinsic_coord_components(instr));
4303
4304      /* Emit an image load, store or atomic op. */
4305      if (instr->intrinsic == nir_intrinsic_image_load ||
4306          instr->intrinsic == nir_intrinsic_bindless_image_load) {
4307         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4308         srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
4309         fs_inst *inst =
4310            bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
4311                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4312         inst->size_written = instr->num_components * dispatch_width * 4;
4313      } else if (instr->intrinsic == nir_intrinsic_image_store ||
4314                 instr->intrinsic == nir_intrinsic_bindless_image_store) {
4315         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4316         srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);
4317         srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
4318         bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
4319                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4320      } else {
4321         unsigned num_srcs = info->num_srcs;
4322         int op = brw_aop_for_nir_intrinsic(instr);
4323         if (op == BRW_AOP_INC || op == BRW_AOP_DEC) {
4324            assert(num_srcs == 4);
4325            num_srcs = 3;
4326         }
4327
4328         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4329
4330         fs_reg data;
4331         if (num_srcs >= 4)
4332            data = get_nir_src(instr->src[3]);
4333         if (num_srcs >= 5) {
4334            fs_reg tmp = bld.vgrf(data.type, 2);
4335            fs_reg sources[2] = { data, get_nir_src(instr->src[4]) };
4336            bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4337            data = tmp;
4338         }
4339         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4340         srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
4341
4342         bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
4343                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4344      }
4345      break;
4346   }
4347
4348   case nir_intrinsic_image_size:
4349   case nir_intrinsic_bindless_image_size: {
4350      /* Cube image sizes should have previously been lowered to a 2D array */
4351      assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
4352
4353      /* Unlike the [un]typed load and store opcodes, the TXS that this turns
4354       * into will handle the binding table index for us in the geneerator.
4355       * Incidentally, this means that we can handle bindless with exactly the
4356       * same code.
4357       */
4358      fs_reg image = retype(get_nir_src_imm(instr->src[0]),
4359                            BRW_REGISTER_TYPE_UD);
4360      image = bld.emit_uniformize(image);
4361
4362      assert(nir_src_as_uint(instr->src[1]) == 0);
4363
4364      fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4365      if (instr->intrinsic == nir_intrinsic_image_size)
4366         srcs[TEX_LOGICAL_SRC_SURFACE] = image;
4367      else
4368         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
4369      srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
4370      srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
4371      srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
4372
4373      /* Since the image size is always uniform, we can just emit a SIMD8
4374       * query instruction and splat the result out.
4375       */
4376      const fs_builder ubld = bld.exec_all().group(8, 0);
4377
4378      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4379      fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
4380                                tmp, srcs, ARRAY_SIZE(srcs));
4381      inst->size_written = 4 * REG_SIZE;
4382
4383      for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
4384         bld.MOV(offset(retype(dest, tmp.type), bld, c),
4385                 component(offset(tmp, ubld, c), 0));
4386      }
4387      break;
4388   }
4389
4390   case nir_intrinsic_image_load_raw_intel: {
4391      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4392      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4393         get_nir_image_intrinsic_image(bld, instr);
4394      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4395      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4396      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4397      srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
4398
4399      fs_inst *inst =
4400         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4401                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4402      inst->size_written = instr->num_components * dispatch_width * 4;
4403      break;
4404   }
4405
4406   case nir_intrinsic_image_store_raw_intel: {
4407      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4408      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4409         get_nir_image_intrinsic_image(bld, instr);
4410      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4411      srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]);
4412      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4413      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4414      srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
4415
4416      bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4417               fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4418      break;
4419   }
4420
4421   case nir_intrinsic_scoped_barrier:
4422      assert(nir_intrinsic_execution_scope(instr) == NIR_SCOPE_NONE);
4423      FALLTHROUGH;
4424   case nir_intrinsic_group_memory_barrier:
4425   case nir_intrinsic_memory_barrier_shared:
4426   case nir_intrinsic_memory_barrier_buffer:
4427   case nir_intrinsic_memory_barrier_image:
4428   case nir_intrinsic_memory_barrier:
4429   case nir_intrinsic_begin_invocation_interlock:
4430   case nir_intrinsic_end_invocation_interlock: {
4431      bool ugm_fence, slm_fence, tgm_fence, urb_fence;
4432      const enum opcode opcode =
4433         instr->intrinsic == nir_intrinsic_begin_invocation_interlock ?
4434         SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE;
4435
4436      switch (instr->intrinsic) {
4437      case nir_intrinsic_scoped_barrier: {
4438         nir_variable_mode modes = nir_intrinsic_memory_modes(instr);
4439         ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global);
4440         slm_fence = modes & nir_var_mem_shared;
4441         tgm_fence = modes & nir_var_mem_ssbo;
4442         urb_fence = modes & nir_var_shader_out;
4443         break;
4444      }
4445
4446      case nir_intrinsic_begin_invocation_interlock:
4447      case nir_intrinsic_end_invocation_interlock:
4448         /* For beginInvocationInterlockARB(), we will generate a memory fence
4449          * but with a different opcode so that generator can pick SENDC
4450          * instead of SEND.
4451          *
4452          * For endInvocationInterlockARB(), we need to insert a memory fence which
4453          * stalls in the shader until the memory transactions prior to that
4454          * fence are complete.  This ensures that the shader does not end before
4455          * any writes from its critical section have landed.  Otherwise, you can
4456          * end up with a case where the next invocation on that pixel properly
4457          * stalls for previous FS invocation on its pixel to complete but
4458          * doesn't actually wait for the dataport memory transactions from that
4459          * thread to land before submitting its own.
4460          *
4461          * Handling them here will allow the logic for IVB render cache (see
4462          * below) to be reused.
4463          */
4464         assert(stage == MESA_SHADER_FRAGMENT);
4465         ugm_fence = tgm_fence = true;
4466         slm_fence = urb_fence = false;
4467         break;
4468
4469      default:
4470         ugm_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared &&
4471                     instr->intrinsic != nir_intrinsic_memory_barrier_image;
4472         slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
4473                     instr->intrinsic == nir_intrinsic_memory_barrier ||
4474                     instr->intrinsic == nir_intrinsic_memory_barrier_shared;
4475         tgm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
4476                     instr->intrinsic == nir_intrinsic_memory_barrier ||
4477                     instr->intrinsic == nir_intrinsic_memory_barrier_image;
4478         urb_fence = instr->intrinsic == nir_intrinsic_memory_barrier;
4479         break;
4480      }
4481
4482      if (nir->info.shared_size > 0) {
4483         assert(gl_shader_stage_uses_workgroup(stage));
4484      } else {
4485         slm_fence = false;
4486      }
4487
4488      /* If the workgroup fits in a single HW thread, the messages for SLM are
4489       * processed in-order and the shader itself is already synchronized so
4490       * the memory fence is not necessary.
4491       *
4492       * TODO: Check if applies for many HW threads sharing same Data Port.
4493       */
4494      if (!nir->info.workgroup_size_variable &&
4495          slm_fence && workgroup_size() <= dispatch_width)
4496         slm_fence = false;
4497
4498      if (stage != MESA_SHADER_TESS_CTRL)
4499         urb_fence = false;
4500
4501      unsigned fence_regs_count = 0;
4502      fs_reg fence_regs[3] = {};
4503
4504      const fs_builder ubld = bld.group(8, 0);
4505
4506      if (devinfo->has_lsc) {
4507         assert(devinfo->verx10 >= 125);
4508         if (ugm_fence) {
4509            fence_regs[fence_regs_count++] =
4510               emit_fence(ubld, opcode, GFX12_SFID_UGM,
4511                          true /* commit_enable */,
4512                          0 /* bti; ignored for LSC */);
4513         }
4514
4515         if (tgm_fence) {
4516            fence_regs[fence_regs_count++] =
4517               emit_fence(ubld, opcode, GFX12_SFID_TGM,
4518                          true /* commit_enable */,
4519                          0 /* bti; ignored for LSC */);
4520         }
4521
4522         if (slm_fence) {
4523            assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
4524            fence_regs[fence_regs_count++] =
4525               emit_fence(ubld, opcode, GFX12_SFID_SLM,
4526                          true /* commit_enable */,
4527                          0 /* BTI; ignored for LSC */);
4528         }
4529
4530         if (urb_fence) {
4531            assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
4532            fence_regs[fence_regs_count++] =
4533               emit_fence(ubld, opcode, BRW_SFID_URB,
4534                          true /* commit_enable */,
4535                          0 /* BTI; ignored for LSC */);
4536         }
4537      } else if (devinfo->ver >= 11) {
4538         if (tgm_fence || ugm_fence || urb_fence) {
4539            fence_regs[fence_regs_count++] =
4540               emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE,
4541                          true /* commit_enable HSD ES # 1404612949 */,
4542                          0 /* BTI = 0 means data cache */);
4543         }
4544
4545         if (slm_fence) {
4546            assert(opcode == SHADER_OPCODE_MEMORY_FENCE);
4547            fence_regs[fence_regs_count++] =
4548               emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE,
4549                          true /* commit_enable HSD ES # 1404612949 */,
4550                          GFX7_BTI_SLM);
4551         }
4552      } else {
4553         /* Prior to Icelake, they're all lumped into a single cache except on
4554          * Ivy Bridge and Bay Trail where typed messages actually go through
4555          * the render cache.  There, we need both fences because we may
4556          * access storage images as either typed or untyped.
4557          */
4558         const bool render_fence = tgm_fence && devinfo->verx10 == 70;
4559
4560         const bool commit_enable = render_fence ||
4561            instr->intrinsic == nir_intrinsic_end_invocation_interlock;
4562
4563         if (tgm_fence || ugm_fence || slm_fence || urb_fence) {
4564            fence_regs[fence_regs_count++] =
4565               emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE,
4566                          commit_enable, 0 /* BTI */);
4567         }
4568
4569         if (render_fence) {
4570            fence_regs[fence_regs_count++] =
4571               emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE,
4572                          commit_enable, /* bti */ 0);
4573         }
4574      }
4575
4576      assert(fence_regs_count <= ARRAY_SIZE(fence_regs));
4577
4578      /* There are three cases where we want to insert a stall:
4579       *
4580       *  1. If we're a nir_intrinsic_end_invocation_interlock.  This is
4581       *     required to ensure that the shader EOT doesn't happen until
4582       *     after the fence returns.  Otherwise, we might end up with the
4583       *     next shader invocation for that pixel not respecting our fence
4584       *     because it may happen on a different HW thread.
4585       *
4586       *  2. If we have multiple fences.  This is required to ensure that
4587       *     they all complete and nothing gets weirdly out-of-order.
4588       *
4589       *  3. If we have no fences.  In this case, we need at least a
4590       *     scheduling barrier to keep the compiler from moving things
4591       *     around in an invalid way.
4592       */
4593      if (instr->intrinsic == nir_intrinsic_end_invocation_interlock ||
4594          fence_regs_count != 1) {
4595         ubld.exec_all().group(1, 0).emit(
4596            FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(),
4597            fence_regs, fence_regs_count);
4598      }
4599
4600      break;
4601   }
4602
4603   case nir_intrinsic_memory_barrier_tcs_patch:
4604      break;
4605
4606   case nir_intrinsic_shader_clock: {
4607      /* We cannot do anything if there is an event, so ignore it for now */
4608      const fs_reg shader_clock = get_timestamp(bld);
4609      const fs_reg srcs[] = { component(shader_clock, 0),
4610                              component(shader_clock, 1) };
4611      bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4612      break;
4613   }
4614
4615   case nir_intrinsic_image_samples:
4616      /* The driver does not support multi-sampled images. */
4617      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
4618      break;
4619
4620   case nir_intrinsic_load_reloc_const_intel: {
4621      uint32_t id = nir_intrinsic_param_idx(instr);
4622      bld.emit(SHADER_OPCODE_MOV_RELOC_IMM,
4623               dest, brw_imm_ud(id));
4624      break;
4625   }
4626
4627   case nir_intrinsic_load_uniform: {
4628      /* Offsets are in bytes but they should always aligned to
4629       * the type size
4630       */
4631      assert(instr->const_index[0] % 4 == 0 ||
4632             instr->const_index[0] % type_sz(dest.type) == 0);
4633
4634      fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
4635
4636      if (nir_src_is_const(instr->src[0])) {
4637         unsigned load_offset = nir_src_as_uint(instr->src[0]);
4638         assert(load_offset % type_sz(dest.type) == 0);
4639         /* For 16-bit types we add the module of the const_index[0]
4640          * offset to access to not 32-bit aligned element
4641          */
4642         src.offset = load_offset + instr->const_index[0] % 4;
4643
4644         for (unsigned j = 0; j < instr->num_components; j++) {
4645            bld.MOV(offset(dest, bld, j), offset(src, bld, j));
4646         }
4647      } else {
4648         fs_reg indirect = retype(get_nir_src(instr->src[0]),
4649                                  BRW_REGISTER_TYPE_UD);
4650
4651         /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4652          * go past the end of the uniform.  In order to keep the n'th
4653          * component from running past, we subtract off the size of all but
4654          * one component of the vector.
4655          */
4656         assert(instr->const_index[1] >=
4657                instr->num_components * (int) type_sz(dest.type));
4658         unsigned read_size = instr->const_index[1] -
4659            (instr->num_components - 1) * type_sz(dest.type);
4660
4661         bool supports_64bit_indirects =
4662            !devinfo->is_cherryview && !intel_device_info_is_9lp(devinfo);
4663
4664         if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
4665            for (unsigned j = 0; j < instr->num_components; j++) {
4666               bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4667                        offset(dest, bld, j), offset(src, bld, j),
4668                        indirect, brw_imm_ud(read_size));
4669            }
4670         } else {
4671            const unsigned num_mov_indirects =
4672               type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
4673            /* We read a little bit less per MOV INDIRECT, as they are now
4674             * 32-bits ones instead of 64-bit. Fix read_size then.
4675             */
4676            const unsigned read_size_32bit = read_size -
4677                (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
4678            for (unsigned j = 0; j < instr->num_components; j++) {
4679               for (unsigned i = 0; i < num_mov_indirects; i++) {
4680                  bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4681                           subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
4682                           subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
4683                           indirect, brw_imm_ud(read_size_32bit));
4684               }
4685            }
4686         }
4687      }
4688      break;
4689   }
4690
4691   case nir_intrinsic_load_ubo: {
4692      fs_reg surf_index;
4693      if (nir_src_is_const(instr->src[0])) {
4694         const unsigned index = stage_prog_data->binding_table.ubo_start +
4695                                nir_src_as_uint(instr->src[0]);
4696         surf_index = brw_imm_ud(index);
4697      } else {
4698         /* The block index is not a constant. Evaluate the index expression
4699          * per-channel and add the base UBO index; we have to select a value
4700          * from any live channel.
4701          */
4702         surf_index = vgrf(glsl_type::uint_type);
4703         bld.ADD(surf_index, get_nir_src(instr->src[0]),
4704                 brw_imm_ud(stage_prog_data->binding_table.ubo_start));
4705         surf_index = bld.emit_uniformize(surf_index);
4706      }
4707
4708      if (!nir_src_is_const(instr->src[1])) {
4709         fs_reg base_offset = retype(get_nir_src(instr->src[1]),
4710                                     BRW_REGISTER_TYPE_UD);
4711
4712         for (int i = 0; i < instr->num_components; i++)
4713            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
4714                                       base_offset, i * type_sz(dest.type),
4715                                       nir_dest_bit_size(instr->dest) / 8);
4716
4717         prog_data->has_ubo_pull = true;
4718      } else {
4719         /* Even if we are loading doubles, a pull constant load will load
4720          * a 32-bit vec4, so should only reserve vgrf space for that. If we
4721          * need to load a full dvec4 we will have to emit 2 loads. This is
4722          * similar to demote_pull_constants(), except that in that case we
4723          * see individual accesses to each component of the vector and then
4724          * we let CSE deal with duplicate loads. Here we see a vector access
4725          * and we have to split it if necessary.
4726          */
4727         const unsigned type_size = type_sz(dest.type);
4728         const unsigned load_offset = nir_src_as_uint(instr->src[1]);
4729
4730         /* See if we've selected this as a push constant candidate */
4731         if (nir_src_is_const(instr->src[0])) {
4732            const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
4733            const unsigned offset_256b = load_offset / 32;
4734
4735            fs_reg push_reg;
4736            for (int i = 0; i < 4; i++) {
4737               const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
4738               if (range->block == ubo_block &&
4739                   offset_256b >= range->start &&
4740                   offset_256b < range->start + range->length) {
4741
4742                  push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
4743                  push_reg.offset = load_offset - 32 * range->start;
4744                  break;
4745               }
4746            }
4747
4748            if (push_reg.file != BAD_FILE) {
4749               for (unsigned i = 0; i < instr->num_components; i++) {
4750                  bld.MOV(offset(dest, bld, i),
4751                          byte_offset(push_reg, i * type_size));
4752               }
4753               break;
4754            }
4755         }
4756
4757         prog_data->has_ubo_pull = true;
4758
4759         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
4760         const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
4761         const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4762
4763         for (unsigned c = 0; c < instr->num_components;) {
4764            const unsigned base = load_offset + c * type_size;
4765            /* Number of usable components in the next block-aligned load. */
4766            const unsigned count = MIN2(instr->num_components - c,
4767                                        (block_sz - base % block_sz) / type_size);
4768
4769            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
4770                      packed_consts, surf_index,
4771                      brw_imm_ud(base & ~(block_sz - 1)));
4772
4773            const fs_reg consts =
4774               retype(byte_offset(packed_consts, base & (block_sz - 1)),
4775                      dest.type);
4776
4777            for (unsigned d = 0; d < count; d++)
4778               bld.MOV(offset(dest, bld, c + d), component(consts, d));
4779
4780            c += count;
4781         }
4782      }
4783      break;
4784   }
4785
4786   case nir_intrinsic_load_global:
4787   case nir_intrinsic_load_global_constant: {
4788      assert(devinfo->ver >= 8);
4789
4790      assert(nir_dest_bit_size(instr->dest) <= 32);
4791      assert(nir_intrinsic_align(instr) > 0);
4792      if (nir_dest_bit_size(instr->dest) == 32 &&
4793          nir_intrinsic_align(instr) >= 4) {
4794         assert(nir_dest_num_components(instr->dest) <= 4);
4795         fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
4796                                  dest,
4797                                  get_nir_src(instr->src[0]), /* Address */
4798                                  fs_reg(), /* No source data */
4799                                  brw_imm_ud(instr->num_components));
4800         inst->size_written = instr->num_components *
4801                              inst->dst.component_size(inst->exec_size);
4802      } else {
4803         const unsigned bit_size = nir_dest_bit_size(instr->dest);
4804         assert(nir_dest_num_components(instr->dest) == 1);
4805         fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4806         bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
4807                  tmp,
4808                  get_nir_src(instr->src[0]), /* Address */
4809                  fs_reg(), /* No source data */
4810                  brw_imm_ud(bit_size));
4811         bld.MOV(dest, subscript(tmp, dest.type, 0));
4812      }
4813      break;
4814   }
4815
4816   case nir_intrinsic_store_global:
4817      assert(devinfo->ver >= 8);
4818
4819      assert(nir_src_bit_size(instr->src[0]) <= 32);
4820      assert(nir_intrinsic_write_mask(instr) ==
4821             (1u << instr->num_components) - 1);
4822      assert(nir_intrinsic_align(instr) > 0);
4823      if (nir_src_bit_size(instr->src[0]) == 32 &&
4824          nir_intrinsic_align(instr) >= 4) {
4825         assert(nir_src_num_components(instr->src[0]) <= 4);
4826         bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
4827                  fs_reg(),
4828                  get_nir_src(instr->src[1]), /* Address */
4829                  get_nir_src(instr->src[0]), /* Data */
4830                  brw_imm_ud(instr->num_components));
4831      } else {
4832         assert(nir_src_num_components(instr->src[0]) == 1);
4833         const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4834         brw_reg_type data_type =
4835            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4836         fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
4837         bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type));
4838         bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
4839                  fs_reg(),
4840                  get_nir_src(instr->src[1]), /* Address */
4841                  tmp, /* Data */
4842                  brw_imm_ud(nir_src_bit_size(instr->src[0])));
4843      }
4844      break;
4845
4846   case nir_intrinsic_global_atomic_add:
4847   case nir_intrinsic_global_atomic_imin:
4848   case nir_intrinsic_global_atomic_umin:
4849   case nir_intrinsic_global_atomic_imax:
4850   case nir_intrinsic_global_atomic_umax:
4851   case nir_intrinsic_global_atomic_and:
4852   case nir_intrinsic_global_atomic_or:
4853   case nir_intrinsic_global_atomic_xor:
4854   case nir_intrinsic_global_atomic_exchange:
4855   case nir_intrinsic_global_atomic_comp_swap:
4856      nir_emit_global_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
4857      break;
4858   case nir_intrinsic_global_atomic_fadd:
4859   case nir_intrinsic_global_atomic_fmin:
4860   case nir_intrinsic_global_atomic_fmax:
4861   case nir_intrinsic_global_atomic_fcomp_swap:
4862      nir_emit_global_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
4863      break;
4864
   case nir_intrinsic_load_global_const_block_intel: {
      /* Block load of 8 or 16 dwords from the (uniformized) 64-bit address
       * in src[0].  src[1] is a predicate: a constant zero skips the load
       * entirely and yields zero, a non-constant predicate predicates the
       * load and pre-fills the result with zero for the false case.
       */
      assert(nir_dest_bit_size(instr->dest) == 32);
      assert(instr->num_components == 8 || instr->num_components == 16);

      /* The load runs with all channels enabled, one channel per component. */
      const fs_builder ubld = bld.exec_all().group(instr->num_components, 0);
      fs_reg load_val;

      bool is_pred_const = nir_src_is_const(instr->src[1]);
      if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) {
         /* In this case, we don't want the UBO load at all.  We really
          * shouldn't get here but it's possible.
          */
         load_val = brw_imm_ud(0);
      } else {
         /* The uniform process may stomp the flag so do this first */
         fs_reg addr = bld.emit_uniformize(get_nir_src(instr->src[0]));

         load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD);

         /* If the predicate is constant and we got here, then it's non-zero
          * and we don't need the predicate at all.
          */
         if (!is_pred_const) {
            /* Load the predicate */
            fs_reg pred = bld.emit_uniformize(get_nir_src(instr->src[1]));
            /* MOV to the null register with a .nz conditional modifier: only
             * the flag result is wanted.
             */
            fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred);
            mov->conditional_mod = BRW_CONDITIONAL_NZ;

            /* Stomp the destination with 0 if we're OOB */
            mov = ubld.MOV(load_val, brw_imm_ud(0));
            mov->predicate = BRW_PREDICATE_NORMAL;
            mov->predicate_inverse = true;
         }

         fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL,
                                   load_val, addr,
                                   fs_reg(), /* No source data */
                                   brw_imm_ud(instr->num_components));

         /* Only perform the load when the predicate computed above is true. */
         if (!is_pred_const)
            load->predicate = BRW_PREDICATE_NORMAL;
      }

      /* From the HW perspective, we just did a single SIMD16 instruction
       * which loaded a dword in each SIMD channel.  From NIR's perspective,
       * this instruction returns a vec16.  Any users of this data in the
       * back-end will expect a vec16 per SIMD channel so we have to emit a
       * pile of MOVs to resolve this discrepancy.  Fortunately, copy-prop
       * will generally clean them up for us.
       */
      for (unsigned i = 0; i < instr->num_components; i++) {
         bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD),
                 component(load_val, i));
      }
      break;
   }
4921
4922   case nir_intrinsic_load_ssbo: {
4923      assert(devinfo->ver >= 7);
4924
4925      const unsigned bit_size = nir_dest_bit_size(instr->dest);
4926      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4927      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4928         get_nir_ssbo_intrinsic_index(bld, instr);
4929      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4930      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4931      srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
4932
4933      /* Make dest unsigned because that's what the temporary will be */
4934      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4935
4936      /* Read the vector */
4937      assert(nir_dest_bit_size(instr->dest) <= 32);
4938      assert(nir_intrinsic_align(instr) > 0);
4939      if (nir_dest_bit_size(instr->dest) == 32 &&
4940          nir_intrinsic_align(instr) >= 4) {
4941         assert(nir_dest_num_components(instr->dest) <= 4);
4942         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4943         fs_inst *inst =
4944            bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4945                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4946         inst->size_written = instr->num_components * dispatch_width * 4;
4947      } else {
4948         assert(nir_dest_num_components(instr->dest) == 1);
4949         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4950
4951         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
4952         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
4953                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
4954         bld.MOV(dest, subscript(read_result, dest.type, 0));
4955      }
4956      break;
4957   }
4958
4959   case nir_intrinsic_store_ssbo: {
4960      assert(devinfo->ver >= 7);
4961
4962      const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4963      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4964      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4965         get_nir_ssbo_intrinsic_index(bld, instr);
4966      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]);
4967      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4968      srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
4969
4970      fs_reg data = get_nir_src(instr->src[0]);
4971      data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
4972
4973      assert(nir_src_bit_size(instr->src[0]) <= 32);
4974      assert(nir_intrinsic_write_mask(instr) ==
4975             (1u << instr->num_components) - 1);
4976      assert(nir_intrinsic_align(instr) > 0);
4977      if (nir_src_bit_size(instr->src[0]) == 32 &&
4978          nir_intrinsic_align(instr) >= 4) {
4979         assert(nir_src_num_components(instr->src[0]) <= 4);
4980         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4981         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4982         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4983                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4984      } else {
4985         assert(nir_src_num_components(instr->src[0]) == 1);
4986         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
4987
4988         srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
4989         bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
4990
4991         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
4992                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4993      }
4994      break;
4995   }
4996
4997   case nir_intrinsic_store_output: {
4998      assert(nir_src_bit_size(instr->src[0]) == 32);
4999      fs_reg src = get_nir_src(instr->src[0]);
5000
5001      unsigned store_offset = nir_src_as_uint(instr->src[1]);
5002      unsigned num_components = instr->num_components;
5003      unsigned first_component = nir_intrinsic_component(instr);
5004
5005      fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
5006                                      4 * store_offset), src.type);
5007      for (unsigned j = 0; j < num_components; j++) {
5008         bld.MOV(offset(new_dest, bld, j + first_component),
5009                 offset(src, bld, j));
5010      }
5011      break;
5012   }
5013
5014   case nir_intrinsic_ssbo_atomic_add:
5015   case nir_intrinsic_ssbo_atomic_imin:
5016   case nir_intrinsic_ssbo_atomic_umin:
5017   case nir_intrinsic_ssbo_atomic_imax:
5018   case nir_intrinsic_ssbo_atomic_umax:
5019   case nir_intrinsic_ssbo_atomic_and:
5020   case nir_intrinsic_ssbo_atomic_or:
5021   case nir_intrinsic_ssbo_atomic_xor:
5022   case nir_intrinsic_ssbo_atomic_exchange:
5023   case nir_intrinsic_ssbo_atomic_comp_swap:
5024      nir_emit_ssbo_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr);
5025      break;
5026   case nir_intrinsic_ssbo_atomic_fadd:
5027   case nir_intrinsic_ssbo_atomic_fmin:
5028   case nir_intrinsic_ssbo_atomic_fmax:
5029   case nir_intrinsic_ssbo_atomic_fcomp_swap:
5030      nir_emit_ssbo_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr);
5031      break;
5032
5033   case nir_intrinsic_get_ssbo_size: {
5034      assert(nir_src_num_components(instr->src[0]) == 1);
5035      unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
5036                            nir_src_as_uint(instr->src[0]) : 0;
5037
5038      /* A resinfo's sampler message is used to get the buffer size.  The
5039       * SIMD8's writeback message consists of four registers and SIMD16's
5040       * writeback message consists of 8 destination registers (two per each
5041       * component).  Because we are only interested on the first channel of
5042       * the first returned component, where resinfo returns the buffer size
5043       * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
5044       * the dispatch width.
5045       */
5046      const fs_builder ubld = bld.exec_all().group(8, 0);
5047      fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5048      fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
5049
5050      /* Set LOD = 0 */
5051      ubld.MOV(src_payload, brw_imm_d(0));
5052
5053      const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
5054      fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
5055                                src_payload, brw_imm_ud(index));
5056      inst->header_size = 0;
5057      inst->mlen = 1;
5058      inst->size_written = 4 * REG_SIZE;
5059
5060      /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
5061       *
5062       * "Out-of-bounds checking is always performed at a DWord granularity. If
5063       * any part of the DWord is out-of-bounds then the whole DWord is
5064       * considered out-of-bounds."
5065       *
5066       * This implies that types with size smaller than 4-bytes need to be
5067       * padded if they don't complete the last dword of the buffer. But as we
5068       * need to maintain the original size we need to reverse the padding
5069       * calculation to return the correct size to know the number of elements
5070       * of an unsized array. As we stored in the last two bits of the surface
5071       * size the needed padding for the buffer, we calculate here the
5072       * original buffer_size reversing the surface_size calculation:
5073       *
5074       * surface_size = isl_align(buffer_size, 4) +
5075       *                (isl_align(buffer_size) - buffer_size)
5076       *
5077       * buffer_size = surface_size & ~3 - surface_size & 3
5078       */
5079
5080      fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5081      fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5082      fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5083
5084      ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
5085      ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
5086      ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
5087
5088      bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
5089      break;
5090   }
5091
5092   case nir_intrinsic_load_scratch: {
5093      assert(devinfo->ver >= 7);
5094
5095      assert(nir_dest_num_components(instr->dest) == 1);
5096      const unsigned bit_size = nir_dest_bit_size(instr->dest);
5097      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5098
5099      if (devinfo->verx10 >= 125) {
5100         const fs_builder ubld = bld.exec_all().group(1, 0);
5101         fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
5102         ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
5103                          brw_imm_ud(~0x3ffu));
5104         srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
5105      } else if (devinfo->ver >= 8) {
5106         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
5107            brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
5108      } else {
5109         srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
5110      }
5111
5112      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5113      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
5114      srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
5115      const fs_reg nir_addr = get_nir_src(instr->src[0]);
5116
5117      /* Make dest unsigned because that's what the temporary will be */
5118      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
5119
5120      /* Read the vector */
5121      assert(nir_dest_num_components(instr->dest) == 1);
5122      assert(nir_dest_bit_size(instr->dest) <= 32);
5123      assert(nir_intrinsic_align(instr) > 0);
5124      if (devinfo->verx10 >= 125) {
5125         assert(nir_dest_bit_size(instr->dest) == 32 &&
5126                nir_intrinsic_align(instr) >= 4);
5127
5128         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5129            swizzle_nir_scratch_addr(bld, nir_addr, false);
5130         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);
5131
5132         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
5133                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5134      } else if (nir_dest_bit_size(instr->dest) >= 4 &&
5135                 nir_intrinsic_align(instr) >= 4) {
5136         /* The offset for a DWORD scattered message is in dwords. */
5137         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5138            swizzle_nir_scratch_addr(bld, nir_addr, true);
5139
5140         bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL,
5141                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5142      } else {
5143         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5144            swizzle_nir_scratch_addr(bld, nir_addr, false);
5145
5146         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
5147         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
5148                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
5149         bld.MOV(dest, read_result);
5150      }
5151      break;
5152   }
5153
   case nir_intrinsic_store_scratch: {
      /* Scalar store to scratch space; src[0] is the data and src[1] the
       * scratch address.  The message type is chosen the same way in all
       * three branches below: untyped surface write on verx10 >= 125, DWORD
       * scattered write for dword-aligned 32-bit data, byte scattered write
       * otherwise.
       */
      assert(devinfo->ver >= 7);

      assert(nir_src_num_components(instr->src[0]) == 1);
      const unsigned bit_size = nir_src_bit_size(instr->src[0]);
      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];

      if (devinfo->verx10 >= 125) {
         /* The surface handle is g0.5 with its low 10 bits masked off. */
         const fs_builder ubld = bld.exec_all().group(1, 0);
         fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
         ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                          brw_imm_ud(~0x3ffu));
         srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
      } else if (devinfo->ver >= 8) {
         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
            brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
      } else {
         srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
      }

      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
      /**
       * While this instruction has side-effects, it should not be predicated
       * on sample mask, because otherwise fs helper invocations would
       * load undefined values from scratch memory. And scratch memory
       * load-stores are produced from operations without side-effects, thus
       * they should not have different behaviour in the helper invocations.
       */
      srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
      const fs_reg nir_addr = get_nir_src(instr->src[1]);

      /* Treat the value as an unsigned integer of its own bit size. */
      fs_reg data = get_nir_src(instr->src[0]);
      data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);

      assert(nir_src_num_components(instr->src[0]) == 1);
      assert(nir_src_bit_size(instr->src[0]) <= 32);
      assert(nir_intrinsic_write_mask(instr) == 1);
      assert(nir_intrinsic_align(instr) > 0);
      if (devinfo->verx10 >= 125) {
         assert(nir_src_bit_size(instr->src[0]) == 32 &&
                nir_intrinsic_align(instr) >= 4);
         srcs[SURFACE_LOGICAL_SRC_DATA] = data;

         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
            swizzle_nir_scratch_addr(bld, nir_addr, false);
         /* For the untyped write the immediate argument is the number of
          * components, not the bit size.
          */
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);

         /* NOTE(review): the other write paths pass fs_reg() as the
          * destination; `dest` is presumably a null register for a store
          * intrinsic, which would make this equivalent -- confirm.
          */
         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
      } else if (nir_src_bit_size(instr->src[0]) == 32 &&
                 nir_intrinsic_align(instr) >= 4) {
         srcs[SURFACE_LOGICAL_SRC_DATA] = data;

         /* The offset for a DWORD scattered message is in dwords. */
         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
            swizzle_nir_scratch_addr(bld, nir_addr, true);

         bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
      } else {
         /* The byte-scattered message takes its data from a UD register. */
         srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);

         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
            swizzle_nir_scratch_addr(bld, nir_addr, false);

         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
      }
      break;
   }
5226
5227   case nir_intrinsic_load_subgroup_size:
5228      /* This should only happen for fragment shaders because every other case
5229       * is lowered in NIR so we can optimize on it.
5230       */
5231      assert(stage == MESA_SHADER_FRAGMENT);
5232      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
5233      break;
5234
5235   case nir_intrinsic_load_subgroup_invocation:
5236      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
5237              nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
5238      break;
5239
5240   case nir_intrinsic_load_subgroup_eq_mask:
5241   case nir_intrinsic_load_subgroup_ge_mask:
5242   case nir_intrinsic_load_subgroup_gt_mask:
5243   case nir_intrinsic_load_subgroup_le_mask:
5244   case nir_intrinsic_load_subgroup_lt_mask:
5245      unreachable("not reached");
5246
   case nir_intrinsic_vote_any: {
      /* Result is -1 in every channel if src[0] is non-zero in any channel
       * of the comparison below, 0 otherwise.
       */
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag
       * with the identity value for the logical operation (0 for "any").
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                         brw_imm_ud(0));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
      }
      /* Set the flag bit of each channel whose source is non-zero. */
      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half.  Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      /* Overwrite the 0 with -1 when the ANY predicate for this dispatch
       * width holds.
       */
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
                                           BRW_PREDICATE_ALIGN1_ANY32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      /* Copy the single-channel result to every channel of dest. */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }
   case nir_intrinsic_vote_all: {
      /* Result is -1 in every channel if src[0] is non-zero in all channels
       * of the comparison below, 0 otherwise.
       */
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag
       * with the identity value for the logical operation (all ones for
       * "all").
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                         brw_imm_ud(0xffffffff));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
      }
      /* Set the flag bit of each channel whose source is non-zero. */
      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half.  Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      /* Overwrite the 0 with -1 when the ALL predicate for this dispatch
       * width holds.
       */
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
                                           BRW_PREDICATE_ALIGN1_ALL32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      /* Copy the single-channel result to every channel of dest. */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }
   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq: {
      /* Result is -1 in every channel if src[0] compares equal to its
       * uniformized value in all channels of the comparison below, 0
       * otherwise.
       */
      fs_reg value = get_nir_src(instr->src[0]);
      if (instr->intrinsic == nir_intrinsic_vote_feq) {
         /* Compare as float for the float variant; 8-bit values are
          * compared as bytes instead.
          */
         const unsigned bit_size = nir_src_bit_size(instr->src[0]);
         value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
      }

      fs_reg uniformized = bld.emit_uniformize(value);
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag
       * with the identity value for the logical operation (all ones for
       * "all").
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                         brw_imm_ud(0xffffffff));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
      }
      /* Set the flag bit of each channel whose value equals the uniformized
       * one.
       */
      bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half.  Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      /* Overwrite the 0 with -1 when the ALL predicate for this dispatch
       * width holds.
       */
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
                                           BRW_PREDICATE_ALIGN1_ALL32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      /* Copy the single-channel result to every channel of dest. */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }
5352
   case nir_intrinsic_ballot: {
      /* Build a bitmask of the channels whose src[0] is non-zero by clearing
       * the flag register, comparing into it, and copying the flag register
       * to the destination.
       */
      const fs_reg value = retype(get_nir_src(instr->src[0]),
                                  BRW_REGISTER_TYPE_UD);
      struct brw_reg flag = brw_flag_reg(0, 0);
      /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
       * as f0.0.  This is a problem for fragment programs as we currently use
       * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
       * programs yet so this isn't a problem.  When we do, something will
       * have to change.
       */
      if (dispatch_width == 32)
         flag.type = BRW_REGISTER_TYPE_UD;

      /* Zero the flag first so that channels the CMP does not write read
       * back as 0.
       */
      bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));
      bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);

      /* Destinations wider than 32 bits are written as UQ, others as UD. */
      if (instr->dest.ssa.bit_size > 32) {
         dest.type = BRW_REGISTER_TYPE_UQ;
      } else {
         dest.type = BRW_REGISTER_TYPE_UD;
      }
      bld.MOV(dest, flag);
      break;
   }
5377
5378   case nir_intrinsic_read_invocation: {
5379      const fs_reg value = get_nir_src(instr->src[0]);
5380      const fs_reg invocation = get_nir_src(instr->src[1]);
5381      fs_reg tmp = bld.vgrf(value.type);
5382
5383      bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
5384                          bld.emit_uniformize(invocation));
5385
5386      bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
5387      break;
5388   }
5389
5390   case nir_intrinsic_read_first_invocation: {
5391      const fs_reg value = get_nir_src(instr->src[0]);
5392      bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
5393      break;
5394   }
5395
5396   case nir_intrinsic_shuffle: {
5397      const fs_reg value = get_nir_src(instr->src[0]);
5398      const fs_reg index = get_nir_src(instr->src[1]);
5399
5400      bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
5401      break;
5402   }
5403
5404   case nir_intrinsic_first_invocation: {
5405      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
5406      bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
5407      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
5408              fs_reg(component(tmp, 0)));
5409      break;
5410   }
5411
5412   case nir_intrinsic_quad_broadcast: {
5413      const fs_reg value = get_nir_src(instr->src[0]);
5414      const unsigned index = nir_src_as_uint(instr->src[1]);
5415
5416      bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
5417               value, brw_imm_ud(index), brw_imm_ud(4));
5418      break;
5419   }
5420
5421   case nir_intrinsic_quad_swap_horizontal: {
5422      const fs_reg value = get_nir_src(instr->src[0]);
5423      const fs_reg tmp = bld.vgrf(value.type);
5424      if (devinfo->ver <= 7) {
5425         /* The hardware doesn't seem to support these crazy regions with
5426          * compressed instructions on gfx7 and earlier so we fall back to
5427          * using quad swizzles.  Fortunately, we don't support 64-bit
5428          * anything in Vulkan on gfx7.
5429          */
5430         assert(nir_src_bit_size(instr->src[0]) == 32);
5431         const fs_builder ubld = bld.exec_all();
5432         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5433                   brw_imm_ud(BRW_SWIZZLE4(1,0,3,2)));
5434         bld.MOV(retype(dest, value.type), tmp);
5435      } else {
5436         const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
5437
5438         const fs_reg src_left = horiz_stride(value, 2);
5439         const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
5440         const fs_reg tmp_left = horiz_stride(tmp, 2);
5441         const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
5442
5443         ubld.MOV(tmp_left, src_right);
5444         ubld.MOV(tmp_right, src_left);
5445
5446      }
5447      bld.MOV(retype(dest, value.type), tmp);
5448      break;
5449   }
5450
5451   case nir_intrinsic_quad_swap_vertical: {
5452      const fs_reg value = get_nir_src(instr->src[0]);
5453      if (nir_src_bit_size(instr->src[0]) == 32) {
5454         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5455         const fs_reg tmp = bld.vgrf(value.type);
5456         const fs_builder ubld = bld.exec_all();
5457         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5458                   brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
5459         bld.MOV(retype(dest, value.type), tmp);
5460      } else {
5461         /* For larger data types, we have to either emit dispatch_width many
5462          * MOVs or else fall back to doing indirects.
5463          */
5464         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
5465         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5466                      brw_imm_w(0x2));
5467         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5468      }
5469      break;
5470   }
5471
5472   case nir_intrinsic_quad_swap_diagonal: {
5473      const fs_reg value = get_nir_src(instr->src[0]);
5474      if (nir_src_bit_size(instr->src[0]) == 32) {
5475         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
5476         const fs_reg tmp = bld.vgrf(value.type);
5477         const fs_builder ubld = bld.exec_all();
5478         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
5479                   brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
5480         bld.MOV(retype(dest, value.type), tmp);
5481      } else {
5482         /* For larger data types, we have to either emit dispatch_width many
5483          * MOVs or else fall back to doing indirects.
5484          */
5485         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
5486         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5487                      brw_imm_w(0x3));
5488         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
5489      }
5490      break;
5491   }
5492
5493   case nir_intrinsic_reduce: {
5494      fs_reg src = get_nir_src(instr->src[0]);
5495      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5496      unsigned cluster_size = nir_intrinsic_cluster_size(instr);
5497      if (cluster_size == 0 || cluster_size > dispatch_width)
5498         cluster_size = dispatch_width;
5499
5500      /* Figure out the source type */
5501      src.type = brw_type_for_nir_type(devinfo,
5502         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5503                        nir_src_bit_size(instr->src[0])));
5504
5505      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
5506      opcode brw_op = brw_op_for_nir_reduction_op(redop);
5507      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
5508
5509      /* Set up a register for all of our scratching around and initialize it
5510       * to reduction operation's identity value.
5511       */
5512      fs_reg scan = bld.vgrf(src.type);
5513      bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
5514
5515      bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
5516
5517      dest.type = src.type;
5518      if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
5519         /* In this case, CLUSTER_BROADCAST instruction isn't needed because
5520          * the distance between clusters is at least 2 GRFs.  In this case,
5521          * we don't need the weird striding of the CLUSTER_BROADCAST
5522          * instruction and can just do regular MOVs.
5523          */
5524         assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
5525         const unsigned groups =
5526            (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
5527         const unsigned group_size = dispatch_width / groups;
5528         for (unsigned i = 0; i < groups; i++) {
5529            const unsigned cluster = (i * group_size) / cluster_size;
5530            const unsigned comp = cluster * cluster_size + (cluster_size - 1);
5531            bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
5532                                         component(scan, comp));
5533         }
5534      } else {
5535         bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
5536                  brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
5537      }
5538      break;
5539   }
5540
5541   case nir_intrinsic_inclusive_scan:
5542   case nir_intrinsic_exclusive_scan: {
5543      fs_reg src = get_nir_src(instr->src[0]);
5544      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
5545
5546      /* Figure out the source type */
5547      src.type = brw_type_for_nir_type(devinfo,
5548         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
5549                        nir_src_bit_size(instr->src[0])));
5550
5551      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
5552      opcode brw_op = brw_op_for_nir_reduction_op(redop);
5553      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
5554
5555      /* Set up a register for all of our scratching around and initialize it
5556       * to reduction operation's identity value.
5557       */
5558      fs_reg scan = bld.vgrf(src.type);
5559      const fs_builder allbld = bld.exec_all();
5560      allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
5561
5562      if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
5563         /* Exclusive scan is a bit harder because we have to do an annoying
5564          * shift of the contents before we can begin.  To make things worse,
5565          * we can't do this with a normal stride; we have to use indirects.
5566          */
5567         fs_reg shifted = bld.vgrf(src.type);
5568         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
5569         allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
5570                         brw_imm_w(-1));
5571         allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
5572         allbld.group(1, 0).MOV(component(shifted, 0), identity);
5573         scan = shifted;
5574      }
5575
5576      bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
5577
5578      bld.MOV(retype(dest, src.type), scan);
5579      break;
5580   }
5581
5582   case nir_intrinsic_load_global_block_intel: {
5583      assert(nir_dest_bit_size(instr->dest) == 32);
5584
5585      fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[0]));
5586
5587      const fs_builder ubld1 = bld.exec_all().group(1, 0);
5588      const fs_builder ubld8 = bld.exec_all().group(8, 0);
5589      const fs_builder ubld16 = bld.exec_all().group(16, 0);
5590
5591      const unsigned total = instr->num_components * dispatch_width;
5592      unsigned loaded = 0;
5593
5594      while (loaded < total) {
5595         const unsigned block =
5596            choose_oword_block_size_dwords(total - loaded);
5597         const unsigned block_bytes = block * 4;
5598
5599         const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
5600         ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5601                   retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD),
5602                   address,
5603                   fs_reg(), /* No source data */
5604                   brw_imm_ud(block))->size_written = block_bytes;
5605
5606         increment_a64_address(ubld1, address, block_bytes);
5607         loaded += block;
5608      }
5609
5610      assert(loaded == total);
5611      break;
5612   }
5613
5614   case nir_intrinsic_store_global_block_intel: {
5615      assert(nir_src_bit_size(instr->src[0]) == 32);
5616
5617      fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[1]));
5618      fs_reg src = get_nir_src(instr->src[0]);
5619
5620      const fs_builder ubld1 = bld.exec_all().group(1, 0);
5621      const fs_builder ubld8 = bld.exec_all().group(8, 0);
5622      const fs_builder ubld16 = bld.exec_all().group(16, 0);
5623
5624      const unsigned total = instr->num_components * dispatch_width;
5625      unsigned written = 0;
5626
5627      while (written < total) {
5628         const unsigned block =
5629            choose_oword_block_size_dwords(total - written);
5630
5631         const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
5632         ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL,
5633                   fs_reg(),
5634                   address,
5635                   retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD),
5636                   brw_imm_ud(block));
5637
5638         const unsigned block_bytes = block * 4;
5639         increment_a64_address(ubld1, address, block_bytes);
5640         written += block;
5641      }
5642
5643      assert(written == total);
5644      break;
5645   }
5646
5647   case nir_intrinsic_load_shared_block_intel:
5648   case nir_intrinsic_load_ssbo_block_intel: {
5649      assert(nir_dest_bit_size(instr->dest) == 32);
5650
5651      const bool is_ssbo =
5652         instr->intrinsic == nir_intrinsic_load_ssbo_block_intel;
5653      fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 1 : 0]));
5654
5655      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5656      srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
5657         get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM));
5658      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
5659
5660      const fs_builder ubld1 = bld.exec_all().group(1, 0);
5661      const fs_builder ubld8 = bld.exec_all().group(8, 0);
5662      const fs_builder ubld16 = bld.exec_all().group(16, 0);
5663
5664      const unsigned total = instr->num_components * dispatch_width;
5665      unsigned loaded = 0;
5666
5667      while (loaded < total) {
5668         const unsigned block =
5669            choose_oword_block_size_dwords(total - loaded);
5670         const unsigned block_bytes = block * 4;
5671
5672         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
5673
5674         const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
5675         ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL,
5676                   retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD),
5677                   srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes;
5678
5679         ubld1.ADD(address, address, brw_imm_ud(block_bytes));
5680         loaded += block;
5681      }
5682
5683      assert(loaded == total);
5684      break;
5685   }
5686
5687   case nir_intrinsic_store_shared_block_intel:
5688   case nir_intrinsic_store_ssbo_block_intel: {
5689      assert(nir_src_bit_size(instr->src[0]) == 32);
5690
5691      const bool is_ssbo =
5692         instr->intrinsic == nir_intrinsic_store_ssbo_block_intel;
5693
5694      fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 2 : 1]));
5695      fs_reg src = get_nir_src(instr->src[0]);
5696
5697      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5698      srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ?
5699         get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM));
5700      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address;
5701
5702      const fs_builder ubld1 = bld.exec_all().group(1, 0);
5703      const fs_builder ubld8 = bld.exec_all().group(8, 0);
5704      const fs_builder ubld16 = bld.exec_all().group(16, 0);
5705
5706      const unsigned total = instr->num_components * dispatch_width;
5707      unsigned written = 0;
5708
5709      while (written < total) {
5710         const unsigned block =
5711            choose_oword_block_size_dwords(total - written);
5712
5713         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block);
5714         srcs[SURFACE_LOGICAL_SRC_DATA] =
5715            retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD);
5716
5717         const fs_builder &ubld = block == 8 ? ubld8 : ubld16;
5718         ubld.emit(SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL,
5719                   fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
5720
5721         const unsigned block_bytes = block * 4;
5722         ubld1.ADD(address, address, brw_imm_ud(block_bytes));
5723         written += block;
5724      }
5725
5726      assert(written == total);
5727      break;
5728   }
5729
5730   case nir_intrinsic_load_btd_dss_id_intel:
5731      bld.emit(SHADER_OPCODE_GET_DSS_ID,
5732               retype(dest, BRW_REGISTER_TYPE_UD));
5733      break;
5734
5735   case nir_intrinsic_load_btd_stack_id_intel:
5736      if (stage == MESA_SHADER_COMPUTE) {
5737         assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);
5738      } else {
5739         assert(brw_shader_stage_is_bindless(stage));
5740      }
5741      /* Stack IDs are always in R1 regardless of whether we're coming from a
5742       * bindless shader or a regular compute shader.
5743       */
5744      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
5745              retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));
5746      break;
5747
5748   case nir_intrinsic_btd_spawn_intel:
5749      if (stage == MESA_SHADER_COMPUTE) {
5750         assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);
5751      } else {
5752         assert(brw_shader_stage_is_bindless(stage));
5753      }
5754      bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(),
5755               bld.emit_uniformize(get_nir_src(instr->src[0])),
5756               get_nir_src(instr->src[1]));
5757      break;
5758
5759   case nir_intrinsic_btd_retire_intel:
5760      if (stage == MESA_SHADER_COMPUTE) {
5761         assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids);
5762      } else {
5763         assert(brw_shader_stage_is_bindless(stage));
5764      }
5765      bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL);
5766      break;
5767
5768   default:
5769      unreachable("unknown intrinsic");
5770   }
5771}
5772
5773void
5774fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
5775                                 int op, nir_intrinsic_instr *instr)
5776{
5777   /* The BTI untyped atomic messages only support 32-bit atomics.  If you
5778    * just look at the big table of messages in the Vol 7 of the SKL PRM, they
5779    * appear to exist.  However, if you look at Vol 2a, there are no message
5780    * descriptors provided for Qword atomic ops except for A64 messages.
5781    */
5782   assert(nir_dest_bit_size(instr->dest) == 32 ||
5783          (nir_dest_bit_size(instr->dest) == 64 && devinfo->has_lsc));
5784
5785   fs_reg dest;
5786   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5787      dest = get_nir_dest(instr->dest);
5788
5789   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5790   srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
5791   srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
5792   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5793   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5794   srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
5795
5796   fs_reg data;
5797   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5798      data = get_nir_src(instr->src[2]);
5799
5800   if (op == BRW_AOP_CMPWR) {
5801      fs_reg tmp = bld.vgrf(data.type, 2);
5802      fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
5803      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5804      data = tmp;
5805   }
5806   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5807
5808   /* Emit the actual atomic operation */
5809
5810   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
5811            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5812}
5813
5814void
5815fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
5816                                       int op, nir_intrinsic_instr *instr)
5817{
5818   fs_reg dest;
5819   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5820      dest = get_nir_dest(instr->dest);
5821
5822   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5823   srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
5824   srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
5825   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5826   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5827   srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
5828
5829   fs_reg data = get_nir_src(instr->src[2]);
5830   if (op == BRW_AOP_FCMPWR) {
5831      fs_reg tmp = bld.vgrf(data.type, 2);
5832      fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
5833      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5834      data = tmp;
5835   }
5836   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5837
5838   /* Emit the actual atomic operation */
5839
5840   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
5841            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5842}
5843
5844void
5845fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
5846                                   int op, nir_intrinsic_instr *instr)
5847{
5848   fs_reg dest;
5849   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5850      dest = get_nir_dest(instr->dest);
5851
5852   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5853   srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
5854   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5855   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5856   srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
5857
5858   fs_reg data;
5859   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5860      data = get_nir_src(instr->src[1]);
5861   if (op == BRW_AOP_CMPWR) {
5862      fs_reg tmp = bld.vgrf(data.type, 2);
5863      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5864      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5865      data = tmp;
5866   }
5867   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5868
5869   /* Get the offset */
5870   if (nir_src_is_const(instr->src[0])) {
5871      srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5872         brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
5873   } else {
5874      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
5875      bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5876	      retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
5877	      brw_imm_ud(instr->const_index[0]));
5878   }
5879
5880   /* Emit the actual atomic operation operation */
5881
5882   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
5883            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5884}
5885
5886void
5887fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
5888                                         int op, nir_intrinsic_instr *instr)
5889{
5890   fs_reg dest;
5891   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5892      dest = get_nir_dest(instr->dest);
5893
5894   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5895   srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
5896   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5897   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5898   srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);
5899
5900   fs_reg data = get_nir_src(instr->src[1]);
5901   if (op == BRW_AOP_FCMPWR) {
5902      fs_reg tmp = bld.vgrf(data.type, 2);
5903      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5904      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5905      data = tmp;
5906   }
5907   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5908
5909   /* Get the offset */
5910   if (nir_src_is_const(instr->src[0])) {
5911      srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5912         brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
5913   } else {
5914      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
5915      bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5916	      retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
5917	      brw_imm_ud(instr->const_index[0]));
5918   }
5919
5920   /* Emit the actual atomic operation operation */
5921
5922   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
5923            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5924}
5925
5926static fs_reg
5927expand_to_32bit(const fs_builder &bld, const fs_reg &src)
5928{
5929   if (type_sz(src.type) == 2) {
5930      fs_reg src32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
5931      bld.MOV(src32, retype(src, BRW_REGISTER_TYPE_UW));
5932      return src32;
5933   } else {
5934      return src;
5935   }
5936}
5937
5938void
5939fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
5940                                   int op, nir_intrinsic_instr *instr)
5941{
5942   fs_reg dest;
5943   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5944      dest = get_nir_dest(instr->dest);
5945
5946   fs_reg addr = get_nir_src(instr->src[0]);
5947
5948   fs_reg data;
5949   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5950      data = expand_to_32bit(bld, get_nir_src(instr->src[1]));
5951
5952   if (op == BRW_AOP_CMPWR) {
5953      fs_reg tmp = bld.vgrf(data.type, 2);
5954      fs_reg sources[2] = {
5955         data,
5956         expand_to_32bit(bld, get_nir_src(instr->src[2]))
5957      };
5958      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5959      data = tmp;
5960   }
5961
5962   switch (nir_dest_bit_size(instr->dest)) {
5963   case 16: {
5964      fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
5965      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL,
5966               dest32, addr, data, brw_imm_ud(op));
5967      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32);
5968      break;
5969   }
5970   case 32:
5971      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
5972               dest, addr, data, brw_imm_ud(op));
5973      break;
5974   case 64:
5975      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL,
5976               dest, addr, data, brw_imm_ud(op));
5977      break;
5978   default:
5979      unreachable("Unsupported bit size");
5980   }
5981}
5982
5983void
5984fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
5985                                         int op, nir_intrinsic_instr *instr)
5986{
5987   assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
5988   fs_reg dest = get_nir_dest(instr->dest);
5989
5990   fs_reg addr = get_nir_src(instr->src[0]);
5991
5992   assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC);
5993   fs_reg data = expand_to_32bit(bld, get_nir_src(instr->src[1]));
5994
5995   if (op == BRW_AOP_FCMPWR) {
5996      fs_reg tmp = bld.vgrf(data.type, 2);
5997      fs_reg sources[2] = {
5998         data,
5999         expand_to_32bit(bld, get_nir_src(instr->src[2]))
6000      };
6001      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
6002      data = tmp;
6003   }
6004
6005   switch (nir_dest_bit_size(instr->dest)) {
6006   case 16: {
6007      fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
6008      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL,
6009               dest32, addr, data, brw_imm_ud(op));
6010      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32);
6011      break;
6012   }
6013   case 32:
6014      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL,
6015               dest, addr, data, brw_imm_ud(op));
6016      break;
6017   case 64:
6018      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL,
6019               dest, addr, data, brw_imm_ud(op));
6020      break;
6021   default:
6022      unreachable("Unsupported bit size");
6023   }
6024}
6025
6026void
6027fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
6028{
6029   unsigned texture = instr->texture_index;
6030   unsigned sampler = instr->sampler_index;
6031
6032   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
6033
6034   srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
6035   srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
6036
6037   int lod_components = 0;
6038
6039   /* The hardware requires a LOD for buffer textures */
6040   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
6041      srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
6042
6043   uint32_t header_bits = 0;
6044   for (unsigned i = 0; i < instr->num_srcs; i++) {
6045      fs_reg src = get_nir_src(instr->src[i].src);
6046      switch (instr->src[i].src_type) {
6047      case nir_tex_src_bias:
6048         srcs[TEX_LOGICAL_SRC_LOD] =
6049            retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
6050         break;
6051      case nir_tex_src_comparator:
6052         srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
6053         break;
6054      case nir_tex_src_coord:
6055         switch (instr->op) {
6056         case nir_texop_txf:
6057         case nir_texop_txf_ms:
6058         case nir_texop_txf_ms_mcs_intel:
6059         case nir_texop_samples_identical:
6060            srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
6061            break;
6062         default:
6063            srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
6064            break;
6065         }
6066
6067         /* Wa_14013363432:
6068          *
6069          * Compiler should send U,V,R parameters even if V,R are 0.
6070          */
6071         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && devinfo->verx10 == 125)
6072            assert(instr->coord_components >= 3u);
6073         break;
6074      case nir_tex_src_ddx:
6075         srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
6076         lod_components = nir_tex_instr_src_size(instr, i);
6077         break;
6078      case nir_tex_src_ddy:
6079         srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
6080         break;
6081      case nir_tex_src_lod:
6082         switch (instr->op) {
6083         case nir_texop_txs:
6084            srcs[TEX_LOGICAL_SRC_LOD] =
6085               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
6086            break;
6087         case nir_texop_txf:
6088            srcs[TEX_LOGICAL_SRC_LOD] =
6089               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
6090            break;
6091         default:
6092            srcs[TEX_LOGICAL_SRC_LOD] =
6093               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
6094            break;
6095         }
6096         break;
6097      case nir_tex_src_min_lod:
6098         srcs[TEX_LOGICAL_SRC_MIN_LOD] =
6099            retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
6100         break;
6101      case nir_tex_src_ms_index:
6102         srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
6103         break;
6104
6105      case nir_tex_src_offset: {
6106         uint32_t offset_bits = 0;
6107         if (brw_texture_offset(instr, i, &offset_bits)) {
6108            header_bits |= offset_bits;
6109         } else {
6110            srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
6111               retype(src, BRW_REGISTER_TYPE_D);
6112         }
6113         break;
6114      }
6115
6116      case nir_tex_src_projector:
6117         unreachable("should be lowered");
6118
6119      case nir_tex_src_texture_offset: {
6120         /* Emit code to evaluate the actual indexing expression */
6121         fs_reg tmp = vgrf(glsl_type::uint_type);
6122         bld.ADD(tmp, src, brw_imm_ud(texture));
6123         srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
6124         break;
6125      }
6126
6127      case nir_tex_src_sampler_offset: {
6128         /* Emit code to evaluate the actual indexing expression */
6129         fs_reg tmp = vgrf(glsl_type::uint_type);
6130         bld.ADD(tmp, src, brw_imm_ud(sampler));
6131         srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
6132         break;
6133      }
6134
6135      case nir_tex_src_texture_handle:
6136         assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
6137         srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();
6138         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
6139         break;
6140
6141      case nir_tex_src_sampler_handle:
6142         assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
6143         srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();
6144         srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
6145         break;
6146
6147      case nir_tex_src_ms_mcs_intel:
6148         assert(instr->op == nir_texop_txf_ms);
6149         srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
6150         break;
6151
6152      case nir_tex_src_plane: {
6153         const uint32_t plane = nir_src_as_uint(instr->src[i].src);
6154         const uint32_t texture_index =
6155            instr->texture_index +
6156            stage_prog_data->binding_table.plane_start[plane] -
6157            stage_prog_data->binding_table.texture_start;
6158
6159         srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
6160         break;
6161      }
6162
6163      default:
6164         unreachable("unknown texture source");
6165      }
6166   }
6167
6168   if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
6169       (instr->op == nir_texop_txf_ms ||
6170        instr->op == nir_texop_samples_identical)) {
6171      if (devinfo->ver >= 7 &&
6172          key_tex->compressed_multisample_layout_mask & (1 << texture)) {
6173         srcs[TEX_LOGICAL_SRC_MCS] =
6174            emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
6175                           instr->coord_components,
6176                           srcs[TEX_LOGICAL_SRC_SURFACE],
6177                           srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
6178      } else {
6179         srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
6180      }
6181   }
6182
6183   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
6184   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
6185
6186   enum opcode opcode;
6187   switch (instr->op) {
6188   case nir_texop_tex:
6189      opcode = SHADER_OPCODE_TEX_LOGICAL;
6190      break;
6191   case nir_texop_txb:
6192      opcode = FS_OPCODE_TXB_LOGICAL;
6193      break;
6194   case nir_texop_txl:
6195      opcode = SHADER_OPCODE_TXL_LOGICAL;
6196      break;
6197   case nir_texop_txd:
6198      opcode = SHADER_OPCODE_TXD_LOGICAL;
6199      break;
6200   case nir_texop_txf:
6201      opcode = SHADER_OPCODE_TXF_LOGICAL;
6202      break;
6203   case nir_texop_txf_ms:
6204      if ((key_tex->msaa_16 & (1 << sampler)))
6205         opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
6206      else
6207         opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
6208      break;
6209   case nir_texop_txf_ms_mcs_intel:
6210      opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
6211      break;
6212   case nir_texop_query_levels:
6213   case nir_texop_txs:
6214      opcode = SHADER_OPCODE_TXS_LOGICAL;
6215      break;
6216   case nir_texop_lod:
6217      opcode = SHADER_OPCODE_LOD_LOGICAL;
6218      break;
6219   case nir_texop_tg4:
6220      if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
6221         opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
6222      else
6223         opcode = SHADER_OPCODE_TG4_LOGICAL;
6224      break;
6225   case nir_texop_texture_samples:
6226      opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
6227      break;
6228   case nir_texop_samples_identical: {
6229      fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
6230
6231      /* If mcs is an immediate value, it means there is no MCS.  In that case
6232       * just return false.
6233       */
6234      if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
6235         bld.MOV(dst, brw_imm_ud(0u));
6236      } else if ((key_tex->msaa_16 & (1 << sampler))) {
6237         fs_reg tmp = vgrf(glsl_type::uint_type);
6238         bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
6239                offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
6240         bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
6241      } else {
6242         bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
6243                 BRW_CONDITIONAL_EQ);
6244      }
6245      return;
6246   }
6247   default:
6248      unreachable("unknown texture opcode");
6249   }
6250
6251   if (instr->op == nir_texop_tg4) {
6252      if (instr->component == 1 &&
6253          key_tex->gather_channel_quirk_mask & (1 << texture)) {
6254         /* gather4 sampler is broken for green channel on RG32F --
6255          * we must ask for blue instead.
6256          */
6257         header_bits |= 2 << 16;
6258      } else {
6259         header_bits |= instr->component << 16;
6260      }
6261   }
6262
6263   fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
6264   fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
6265   inst->offset = header_bits;
6266
6267   const unsigned dest_size = nir_tex_instr_dest_size(instr);
6268   if (devinfo->ver >= 9 &&
6269       instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
6270      unsigned write_mask = instr->dest.is_ssa ?
6271                            nir_ssa_def_components_read(&instr->dest.ssa):
6272                            (1 << dest_size) - 1;
6273      assert(write_mask != 0); /* dead code should have been eliminated */
6274      inst->size_written = util_last_bit(write_mask) *
6275                           inst->dst.component_size(inst->exec_size);
6276   } else {
6277      inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
6278   }
6279
6280   if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
6281      inst->shadow_compare = true;
6282
6283   if (instr->op == nir_texop_tg4 && devinfo->ver == 6)
6284      emit_gfx6_gather_wa(key_tex->gfx6_gather_wa[texture], dst);
6285
6286   fs_reg nir_dest[5];
6287   for (unsigned i = 0; i < dest_size; i++)
6288      nir_dest[i] = offset(dst, bld, i);
6289
6290   if (instr->op == nir_texop_query_levels) {
6291      /* # levels is in .w */
6292      if (devinfo->ver <= 9) {
6293         /**
6294          * Wa_1940217:
6295          *
6296          * When a surface of type SURFTYPE_NULL is accessed by resinfo, the
6297          * MIPCount returned is undefined instead of 0.
6298          */
6299         fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
6300         mov->conditional_mod = BRW_CONDITIONAL_NZ;
6301         nir_dest[0] = bld.vgrf(BRW_REGISTER_TYPE_D);
6302         fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3), brw_imm_d(0));
6303         sel->predicate = BRW_PREDICATE_NORMAL;
6304      } else {
6305         nir_dest[0] = offset(dst, bld, 3);
6306      }
6307   } else if (instr->op == nir_texop_txs &&
6308              dest_size >= 3 && devinfo->ver < 7) {
6309      /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
6310      fs_reg depth = offset(dst, bld, 2);
6311      nir_dest[2] = vgrf(glsl_type::int_type);
6312      bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
6313   }
6314
6315   bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
6316}
6317
6318void
6319fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
6320{
6321   switch (instr->type) {
6322   case nir_jump_break:
6323      bld.emit(BRW_OPCODE_BREAK);
6324      break;
6325   case nir_jump_continue:
6326      bld.emit(BRW_OPCODE_CONTINUE);
6327      break;
6328   case nir_jump_halt:
6329      bld.emit(BRW_OPCODE_HALT);
6330      break;
6331   case nir_jump_return:
6332   default:
6333      unreachable("unknown jump");
6334   }
6335}
6336
6337/*
6338 * This helper takes a source register and un/shuffles it into the destination
6339 * register.
6340 *
6341 * If source type size is smaller than destination type size the operation
6342 * needed is a component shuffle. The opposite case would be an unshuffle. If
6343 * source/destination type size is equal a shuffle is done that would be
6344 * equivalent to a simple MOV.
6345 *
6346 * For example, if source is a 16-bit type and destination is 32-bit. A 3
6347 * components .xyz 16-bit vector on SIMD8 would be.
6348 *
6349 *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
6350 *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
6351 *
6352 * This helper will return the following 2 32-bit components with the 16-bit
6353 * values shuffled:
6354 *
6355 *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
6356 *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
6357 *
6358 * For unshuffle, the example would be the opposite, a 64-bit type source
6359 * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
6360 * would be:
6361 *
6362 *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
6363 *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
6364 *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
6365 *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
6366 *
6367 * The returned result would be the following 4 32-bit components unshuffled:
6368 *
6369 *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
6370 *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
6371 *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
6372 *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
6373 *
6374 * - Source and destination register must not be overlapped.
6375 * - components units are measured in terms of the smaller type between
6376 *   source and destination because we are un/shuffling the smaller
6377 *   components from/into the bigger ones.
6378 * - first_component parameter allows skipping source components.
6379 */
6380void
6381shuffle_src_to_dst(const fs_builder &bld,
6382                   const fs_reg &dst,
6383                   const fs_reg &src,
6384                   uint32_t first_component,
6385                   uint32_t components)
6386{
6387   if (type_sz(src.type) == type_sz(dst.type)) {
6388      assert(!regions_overlap(dst,
6389         type_sz(dst.type) * bld.dispatch_width() * components,
6390         offset(src, bld, first_component),
6391         type_sz(src.type) * bld.dispatch_width() * components));
6392      for (unsigned i = 0; i < components; i++) {
6393         bld.MOV(retype(offset(dst, bld, i), src.type),
6394                 offset(src, bld, i + first_component));
6395      }
6396   } else if (type_sz(src.type) < type_sz(dst.type)) {
6397      /* Source is shuffled into destination */
6398      unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
6399      assert(!regions_overlap(dst,
6400         type_sz(dst.type) * bld.dispatch_width() *
6401         DIV_ROUND_UP(components, size_ratio),
6402         offset(src, bld, first_component),
6403         type_sz(src.type) * bld.dispatch_width() * components));
6404
6405      brw_reg_type shuffle_type =
6406         brw_reg_type_from_bit_size(8 * type_sz(src.type),
6407                                    BRW_REGISTER_TYPE_D);
6408      for (unsigned i = 0; i < components; i++) {
6409         fs_reg shuffle_component_i =
6410            subscript(offset(dst, bld, i / size_ratio),
6411                      shuffle_type, i % size_ratio);
6412         bld.MOV(shuffle_component_i,
6413                 retype(offset(src, bld, i + first_component), shuffle_type));
6414      }
6415   } else {
6416      /* Source is unshuffled into destination */
6417      unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
6418      assert(!regions_overlap(dst,
6419         type_sz(dst.type) * bld.dispatch_width() * components,
6420         offset(src, bld, first_component / size_ratio),
6421         type_sz(src.type) * bld.dispatch_width() *
6422         DIV_ROUND_UP(components + (first_component % size_ratio),
6423                      size_ratio)));
6424
6425      brw_reg_type shuffle_type =
6426         brw_reg_type_from_bit_size(8 * type_sz(dst.type),
6427                                    BRW_REGISTER_TYPE_D);
6428      for (unsigned i = 0; i < components; i++) {
6429         fs_reg shuffle_component_i =
6430            subscript(offset(src, bld, (first_component + i) / size_ratio),
6431                      shuffle_type, (first_component + i) % size_ratio);
6432         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
6433                 shuffle_component_i);
6434      }
6435   }
6436}
6437
6438void
6439shuffle_from_32bit_read(const fs_builder &bld,
6440                        const fs_reg &dst,
6441                        const fs_reg &src,
6442                        uint32_t first_component,
6443                        uint32_t components)
6444{
6445   assert(type_sz(src.type) == 4);
6446
6447   /* This function takes components in units of the destination type while
6448    * shuffle_src_to_dst takes components in units of the smallest type
6449    */
6450   if (type_sz(dst.type) > 4) {
6451      assert(type_sz(dst.type) == 8);
6452      first_component *= 2;
6453      components *= 2;
6454   }
6455
6456   shuffle_src_to_dst(bld, dst, src, first_component, components);
6457}
6458
6459fs_reg
6460setup_imm_df(const fs_builder &bld, double v)
6461{
6462   const struct intel_device_info *devinfo = bld.shader->devinfo;
6463   assert(devinfo->ver >= 7);
6464
6465   if (devinfo->ver >= 8)
6466      return brw_imm_df(v);
6467
6468   /* gfx7.5 does not support DF immediates straighforward but the DIM
6469    * instruction allows to set the 64-bit immediate value.
6470    */
6471   if (devinfo->is_haswell) {
6472      const fs_builder ubld = bld.exec_all().group(1, 0);
6473      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
6474      ubld.DIM(dst, brw_imm_df(v));
6475      return component(dst, 0);
6476   }
6477
6478   /* gfx7 does not support DF immediates, so we generate a 64-bit constant by
6479    * writing the low 32-bit of the constant to suboffset 0 of a VGRF and
6480    * the high 32-bit to suboffset 4 and then applying a stride of 0.
6481    *
6482    * Alternatively, we could also produce a normal VGRF (without stride 0)
6483    * by writing to all the channels in the VGRF, however, that would hit the
6484    * gfx7 bug where we have to split writes that span more than 1 register
6485    * into instructions with a width of 4 (otherwise the write to the second
6486    * register written runs into an execmask hardware bug) which isn't very
6487    * nice.
6488    */
6489   union {
6490      double d;
6491      struct {
6492         uint32_t i1;
6493         uint32_t i2;
6494      };
6495   } di;
6496
6497   di.d = v;
6498
6499   const fs_builder ubld = bld.exec_all().group(1, 0);
6500   const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
6501   ubld.MOV(tmp, brw_imm_ud(di.i1));
6502   ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
6503
6504   return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
6505}
6506
6507fs_reg
6508setup_imm_b(const fs_builder &bld, int8_t v)
6509{
6510   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
6511   bld.MOV(tmp, brw_imm_w(v));
6512   return tmp;
6513}
6514
6515fs_reg
6516setup_imm_ub(const fs_builder &bld, uint8_t v)
6517{
6518   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
6519   bld.MOV(tmp, brw_imm_uw(v));
6520   return tmp;
6521}
6522