1b8e80941Smrg/*
2b8e80941Smrg * Copyright © 2010 Intel Corporation
3b8e80941Smrg *
4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a
5b8e80941Smrg * copy of this software and associated documentation files (the "Software"),
6b8e80941Smrg * to deal in the Software without restriction, including without limitation
7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the
9b8e80941Smrg * Software is furnished to do so, subject to the following conditions:
10b8e80941Smrg *
11b8e80941Smrg * The above copyright notice and this permission notice (including the next
12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the
13b8e80941Smrg * Software.
14b8e80941Smrg *
15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21b8e80941Smrg * IN THE SOFTWARE.
22b8e80941Smrg */
23b8e80941Smrg
24b8e80941Smrg#include "compiler/glsl/ir.h"
25b8e80941Smrg#include "brw_fs.h"
26b8e80941Smrg#include "brw_nir.h"
27b8e80941Smrg#include "nir_search_helpers.h"
28b8e80941Smrg#include "util/u_math.h"
29b8e80941Smrg#include "util/bitscan.h"
30b8e80941Smrg
31b8e80941Smrgusing namespace brw;
32b8e80941Smrg
33b8e80941Smrgvoid
34b8e80941Smrgfs_visitor::emit_nir_code()
35b8e80941Smrg{
36b8e80941Smrg   /* emit the arrays used for inputs and outputs - load/store intrinsics will
37b8e80941Smrg    * be converted to reads/writes of these arrays
38b8e80941Smrg    */
39b8e80941Smrg   nir_setup_outputs();
40b8e80941Smrg   nir_setup_uniforms();
41b8e80941Smrg   nir_emit_system_values();
42b8e80941Smrg
43b8e80941Smrg   nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
44b8e80941Smrg}
45b8e80941Smrg
void
fs_visitor::nir_setup_outputs()
{
   /* Allocate VGRF storage for this shader's output varyings and record the
    * per-slot registers in outputs[].
    *
    * TESS_CTRL and FRAGMENT are skipped here — their outputs are presumably
    * handled by stage-specific paths not visible in this chunk; this
    * function only services the remaining stages.
    */
   if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
      return;

   /* Size (in vec4s) required at each varying slot; 0 means "slot unused". */
   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };

   /* Calculate the size of output registers in a separate pass, before
    * allocating them.  With ARB_enhanced_layouts, multiple output variables
    * may occupy the same slot, but have different type sizes.
    */
   nir_foreach_variable(var, &nir->outputs) {
      const int loc = var->data.driver_location;
      /* Compact variables are arrays of scalars packed four-to-a-vec4;
       * everything else uses the normal vec4 type size.
       */
      const unsigned var_vec4s =
         var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
                           : type_size_vec4(var->type, true);
      /* Several variables may share one slot; keep the largest size seen. */
      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
   }

   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
      if (vec4s[loc] == 0) {
         loc++;
         continue;
      }

      unsigned reg_size = vec4s[loc];

      /* Check if there are any ranges that start within this range and extend
       * past it. If so, include them in this allocation.
       */
      for (unsigned i = 1; i < reg_size; i++)
         reg_size = MAX2(vec4s[i + loc] + i, reg_size);

      /* One VGRF allocation covers the whole merged range; each covered slot
       * points at its own 4-component offset within it.
       */
      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
      for (unsigned i = 0; i < reg_size; i++)
         outputs[loc + i] = offset(reg, bld, 4 * i);

      loc += reg_size;
   }
}
87b8e80941Smrg
88b8e80941Smrgvoid
89b8e80941Smrgfs_visitor::nir_setup_uniforms()
90b8e80941Smrg{
91b8e80941Smrg   /* Only the first compile gets to set up uniforms. */
92b8e80941Smrg   if (push_constant_loc) {
93b8e80941Smrg      assert(pull_constant_loc);
94b8e80941Smrg      return;
95b8e80941Smrg   }
96b8e80941Smrg
97b8e80941Smrg   uniforms = nir->num_uniforms / 4;
98b8e80941Smrg
99b8e80941Smrg   if (stage == MESA_SHADER_COMPUTE) {
100b8e80941Smrg      /* Add a uniform for the thread local id.  It must be the last uniform
101b8e80941Smrg       * on the list.
102b8e80941Smrg       */
103b8e80941Smrg      assert(uniforms == prog_data->nr_params);
104b8e80941Smrg      uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
105b8e80941Smrg      *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
106b8e80941Smrg      subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
107b8e80941Smrg   }
108b8e80941Smrg}
109b8e80941Smrg
/**
 * Scan one NIR block for system-value intrinsics and lazily materialize the
 * corresponding registers in v->nir_system_values[].
 *
 * Each value is emitted only the first time it is encountered (detected by
 * reg->file == BAD_FILE); subsequent uses read the cached register.  Always
 * returns true.
 */
static bool
emit_system_values_block(nir_block *block, fs_visitor *v)
{
   fs_reg *reg;

   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_vertex_id:
      case nir_intrinsic_load_base_vertex:
         unreachable("should be lowered by nir_lower_system_values().");

      case nir_intrinsic_load_vertex_id_zero_base:
      case nir_intrinsic_load_is_indexed_draw:
      case nir_intrinsic_load_first_vertex:
      case nir_intrinsic_load_instance_id:
      case nir_intrinsic_load_base_instance:
      case nir_intrinsic_load_draw_id:
         unreachable("should be lowered by brw_nir_lower_vs_inputs().");

      case nir_intrinsic_load_invocation_id:
         /* For TCS nothing is cached here — presumably handled by the
          * TCS-specific intrinsic path; TODO confirm.
          */
         if (v->stage == MESA_SHADER_TESS_CTRL)
            break;
         assert(v->stage == MESA_SHADER_GEOMETRY);
         reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
         if (reg->file == BAD_FILE) {
            const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
            /* The GS invocation id is extracted from the top 5 bits of g1.0
             * by shifting right by 27.
             */
            fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
            fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
            abld.SHR(iid, g1, brw_imm_ud(27u));
            *reg = iid;
         }
         break;

      case nir_intrinsic_load_sample_pos:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_samplepos_setup();
         break;

      case nir_intrinsic_load_sample_id:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_sampleid_setup();
         break;

      case nir_intrinsic_load_sample_mask_in:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         assert(v->devinfo->gen >= 7);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_samplemaskin_setup();
         break;

      case nir_intrinsic_load_work_group_id:
         assert(v->stage == MESA_SHADER_COMPUTE);
         reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_cs_work_group_id_setup();
         break;

      case nir_intrinsic_load_helper_invocation:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
         if (reg->file == BAD_FILE) {
            const fs_builder abld =
               v->bld.annotate("gl_HelperInvocation", NULL);

            /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
             * pixel mask is in g1.7 of the thread payload.
             *
             * We move the per-channel pixel enable bit to the low bit of each
             * channel by shifting the byte containing the pixel mask by the
             * vector immediate 0x76543210UV.
             *
             * The region of <1,8,0> reads only 1 byte (the pixel masks for
             * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
             * masks for 2 and 3) in SIMD16.
             */
            fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);

            /* One 16-wide group per SIMD16 half (two iterations in SIMD32),
             * each reading its own payload register's mask byte.
             */
            for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
               const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
               hbld.SHR(offset(shifted, hbld, i),
                        stride(retype(brw_vec1_grf(1 + i, 7),
                                      BRW_REGISTER_TYPE_UB),
                               1, 8, 0),
                        brw_imm_v(0x76543210));
            }

            /* A set bit in the pixel mask means the channel is enabled, but
             * that is the opposite of gl_HelperInvocation so we need to invert
             * the mask.
             *
             * The negate source-modifier bit of logical instructions on Gen8+
             * performs 1's complement negation, so we can use that instead of
             * a NOT instruction.
             */
            fs_reg inverted = negate(shifted);
            if (v->devinfo->gen < 8) {
               inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
               abld.NOT(inverted, shifted);
            }

            /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
             * with 1 and negating.
             */
            fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
            abld.AND(anded, inverted, brw_imm_uw(1));

            fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
            abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
            *reg = dst;
         }
         break;

      default:
         break;
      }
   }

   return true;
}
238b8e80941Smrg
void
fs_visitor::nir_emit_system_values()
{
   /* Start every system value out empty (BAD_FILE);
    * emit_system_values_block() fills in only the ones the shader uses.
    */
   nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
      nir_system_values[i] = fs_reg();
   }

   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
    * never end up using it.
    */
   {
      const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
      fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
      reg = abld.vgrf(BRW_REGISTER_TYPE_UW);

      /* Channels 0..7 come from the vector immediate 0x76543210 (one nibble
       * per channel).  Channels 8..15 and 16..31 are derived by adding 8 and
       * 16 to the earlier channels; the 16/32 byte offsets address those
       * channel ranges in the UW (2 bytes per element) destination.
       */
      const fs_builder allbld8 = abld.group(8, 0).exec_all();
      allbld8.MOV(reg, brw_imm_v(0x76543210));
      if (dispatch_width > 8)
         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
      if (dispatch_width > 16) {
         const fs_builder allbld16 = abld.group(16, 0).exec_all();
         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
      }
   }

   /* Walk the entry point and materialize every other system value used. */
   nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
   nir_foreach_block(block, impl)
      emit_system_values_block(block, this);
}
269b8e80941Smrg
/*
 * Returns a type based on a reference_type (word, float, half-float) and a
 * given bit_size.
 *
 * Reference BRW_REGISTER_TYPEs are HF/F/DF (float), B/W/D/Q (signed
 * integer) and UB/UW/UD/UQ (unsigned integer).
 *
 * @FIXME: this note originally stated that 64-bit return types were always
 * DF on integer types, to maintain compatibility with uses of DF prior to
 * the introduction of int64 support; the implementation below returns Q/UQ
 * for 64-bit integers, so that restriction appears stale.
 */
280b8e80941Smrgstatic brw_reg_type
281b8e80941Smrgbrw_reg_type_from_bit_size(const unsigned bit_size,
282b8e80941Smrg                           const brw_reg_type reference_type)
283b8e80941Smrg{
284b8e80941Smrg   switch(reference_type) {
285b8e80941Smrg   case BRW_REGISTER_TYPE_HF:
286b8e80941Smrg   case BRW_REGISTER_TYPE_F:
287b8e80941Smrg   case BRW_REGISTER_TYPE_DF:
288b8e80941Smrg      switch(bit_size) {
289b8e80941Smrg      case 16:
290b8e80941Smrg         return BRW_REGISTER_TYPE_HF;
291b8e80941Smrg      case 32:
292b8e80941Smrg         return BRW_REGISTER_TYPE_F;
293b8e80941Smrg      case 64:
294b8e80941Smrg         return BRW_REGISTER_TYPE_DF;
295b8e80941Smrg      default:
296b8e80941Smrg         unreachable("Invalid bit size");
297b8e80941Smrg      }
298b8e80941Smrg   case BRW_REGISTER_TYPE_B:
299b8e80941Smrg   case BRW_REGISTER_TYPE_W:
300b8e80941Smrg   case BRW_REGISTER_TYPE_D:
301b8e80941Smrg   case BRW_REGISTER_TYPE_Q:
302b8e80941Smrg      switch(bit_size) {
303b8e80941Smrg      case 8:
304b8e80941Smrg         return BRW_REGISTER_TYPE_B;
305b8e80941Smrg      case 16:
306b8e80941Smrg         return BRW_REGISTER_TYPE_W;
307b8e80941Smrg      case 32:
308b8e80941Smrg         return BRW_REGISTER_TYPE_D;
309b8e80941Smrg      case 64:
310b8e80941Smrg         return BRW_REGISTER_TYPE_Q;
311b8e80941Smrg      default:
312b8e80941Smrg         unreachable("Invalid bit size");
313b8e80941Smrg      }
314b8e80941Smrg   case BRW_REGISTER_TYPE_UB:
315b8e80941Smrg   case BRW_REGISTER_TYPE_UW:
316b8e80941Smrg   case BRW_REGISTER_TYPE_UD:
317b8e80941Smrg   case BRW_REGISTER_TYPE_UQ:
318b8e80941Smrg      switch(bit_size) {
319b8e80941Smrg      case 8:
320b8e80941Smrg         return BRW_REGISTER_TYPE_UB;
321b8e80941Smrg      case 16:
322b8e80941Smrg         return BRW_REGISTER_TYPE_UW;
323b8e80941Smrg      case 32:
324b8e80941Smrg         return BRW_REGISTER_TYPE_UD;
325b8e80941Smrg      case 64:
326b8e80941Smrg         return BRW_REGISTER_TYPE_UQ;
327b8e80941Smrg      default:
328b8e80941Smrg         unreachable("Invalid bit size");
329b8e80941Smrg      }
330b8e80941Smrg   default:
331b8e80941Smrg      unreachable("Unknown type");
332b8e80941Smrg   }
333b8e80941Smrg}
334b8e80941Smrg
void
fs_visitor::nir_emit_impl(nir_function_impl *impl)
{
   /* Pre-size the per-NIR-register table; entries default to BAD_FILE and
    * get a real VGRF below for each register the impl declares.
    */
   nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
   for (unsigned i = 0; i < impl->reg_alloc; i++) {
      nir_locals[i] = fs_reg();
   }

   foreach_list_typed(nir_register, reg, node, &impl->registers) {
      /* A non-array register still needs one element of storage. */
      unsigned array_elems =
         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
      unsigned size = array_elems * reg->num_components;
      /* 8-bit registers are special-cased here because
       * brw_reg_type_from_bit_size() with a float reference type has no
       * 8-bit mapping.
       */
      const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B :
         brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
      nir_locals[reg->index] = bld.vgrf(reg_type, size);
   }

   /* SSA values get registers on demand; just make sure the table is large
    * enough for this impl.
    */
   nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
                             impl->ssa_alloc);

   nir_emit_cf_list(&impl->body);
}
357b8e80941Smrg
358b8e80941Smrgvoid
359b8e80941Smrgfs_visitor::nir_emit_cf_list(exec_list *list)
360b8e80941Smrg{
361b8e80941Smrg   exec_list_validate(list);
362b8e80941Smrg   foreach_list_typed(nir_cf_node, node, node, list) {
363b8e80941Smrg      switch (node->type) {
364b8e80941Smrg      case nir_cf_node_if:
365b8e80941Smrg         nir_emit_if(nir_cf_node_as_if(node));
366b8e80941Smrg         break;
367b8e80941Smrg
368b8e80941Smrg      case nir_cf_node_loop:
369b8e80941Smrg         nir_emit_loop(nir_cf_node_as_loop(node));
370b8e80941Smrg         break;
371b8e80941Smrg
372b8e80941Smrg      case nir_cf_node_block:
373b8e80941Smrg         nir_emit_block(nir_cf_node_as_block(node));
374b8e80941Smrg         break;
375b8e80941Smrg
376b8e80941Smrg      default:
377b8e80941Smrg         unreachable("Invalid CFG node block");
378b8e80941Smrg      }
379b8e80941Smrg   }
380b8e80941Smrg}
381b8e80941Smrg
/* Lower a NIR if-node to a hardware IF/ELSE/ENDIF sequence, predicated on
 * the condition value.
 */
void
fs_visitor::nir_emit_if(nir_if *if_stmt)
{
   bool invert;
   fs_reg cond_reg;

   /* If the condition has the form !other_condition, use other_condition as
    * the source, but invert the predicate on the if instruction.
    */
   nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
   if (cond != NULL && cond->op == nir_op_inot) {
      assert(!cond->src[0].negate);
      assert(!cond->src[0].abs);

      invert = true;
      cond_reg = get_nir_src(cond->src[0].src);
   } else {
      invert = false;
      cond_reg = get_nir_src(if_stmt->condition);
   }

   /* first, put the condition into f0 */
   fs_inst *inst = bld.MOV(bld.null_reg_d(),
                           retype(cond_reg, BRW_REGISTER_TYPE_D));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;

   /* predicate_inverse makes the IF take the then-branch when the
    * (uninverted) condition is zero, implementing the inot case above.
    */
   bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;

   nir_emit_cf_list(&if_stmt->then_list);

   /* Skip emitting ELSE entirely when the else-list is one empty block. */
   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
      bld.emit(BRW_OPCODE_ELSE);
      nir_emit_cf_list(&if_stmt->else_list);
   }

   bld.emit(BRW_OPCODE_ENDIF);

   if (devinfo->gen < 7)
      limit_dispatch_width(16, "Non-uniform control flow unsupported "
                           "in SIMD32 mode.");
}
423b8e80941Smrg
424b8e80941Smrgvoid
425b8e80941Smrgfs_visitor::nir_emit_loop(nir_loop *loop)
426b8e80941Smrg{
427b8e80941Smrg   bld.emit(BRW_OPCODE_DO);
428b8e80941Smrg
429b8e80941Smrg   nir_emit_cf_list(&loop->body);
430b8e80941Smrg
431b8e80941Smrg   bld.emit(BRW_OPCODE_WHILE);
432b8e80941Smrg
433b8e80941Smrg   if (devinfo->gen < 7)
434b8e80941Smrg      limit_dispatch_width(16, "Non-uniform control flow unsupported "
435b8e80941Smrg                           "in SIMD32 mode.");
436b8e80941Smrg}
437b8e80941Smrg
438b8e80941Smrgvoid
439b8e80941Smrgfs_visitor::nir_emit_block(nir_block *block)
440b8e80941Smrg{
441b8e80941Smrg   nir_foreach_instr(instr, block) {
442b8e80941Smrg      nir_emit_instr(instr);
443b8e80941Smrg   }
444b8e80941Smrg}
445b8e80941Smrg
446b8e80941Smrgvoid
447b8e80941Smrgfs_visitor::nir_emit_instr(nir_instr *instr)
448b8e80941Smrg{
449b8e80941Smrg   const fs_builder abld = bld.annotate(NULL, instr);
450b8e80941Smrg
451b8e80941Smrg   switch (instr->type) {
452b8e80941Smrg   case nir_instr_type_alu:
453b8e80941Smrg      nir_emit_alu(abld, nir_instr_as_alu(instr));
454b8e80941Smrg      break;
455b8e80941Smrg
456b8e80941Smrg   case nir_instr_type_deref:
457b8e80941Smrg      unreachable("All derefs should've been lowered");
458b8e80941Smrg      break;
459b8e80941Smrg
460b8e80941Smrg   case nir_instr_type_intrinsic:
461b8e80941Smrg      switch (stage) {
462b8e80941Smrg      case MESA_SHADER_VERTEX:
463b8e80941Smrg         nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
464b8e80941Smrg         break;
465b8e80941Smrg      case MESA_SHADER_TESS_CTRL:
466b8e80941Smrg         nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
467b8e80941Smrg         break;
468b8e80941Smrg      case MESA_SHADER_TESS_EVAL:
469b8e80941Smrg         nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
470b8e80941Smrg         break;
471b8e80941Smrg      case MESA_SHADER_GEOMETRY:
472b8e80941Smrg         nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
473b8e80941Smrg         break;
474b8e80941Smrg      case MESA_SHADER_FRAGMENT:
475b8e80941Smrg         nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
476b8e80941Smrg         break;
477b8e80941Smrg      case MESA_SHADER_COMPUTE:
478b8e80941Smrg         nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
479b8e80941Smrg         break;
480b8e80941Smrg      default:
481b8e80941Smrg         unreachable("unsupported shader stage");
482b8e80941Smrg      }
483b8e80941Smrg      break;
484b8e80941Smrg
485b8e80941Smrg   case nir_instr_type_tex:
486b8e80941Smrg      nir_emit_texture(abld, nir_instr_as_tex(instr));
487b8e80941Smrg      break;
488b8e80941Smrg
489b8e80941Smrg   case nir_instr_type_load_const:
490b8e80941Smrg      nir_emit_load_const(abld, nir_instr_as_load_const(instr));
491b8e80941Smrg      break;
492b8e80941Smrg
493b8e80941Smrg   case nir_instr_type_ssa_undef:
494b8e80941Smrg      /* We create a new VGRF for undefs on every use (by handling
495b8e80941Smrg       * them in get_nir_src()), rather than for each definition.
496b8e80941Smrg       * This helps register coalescing eliminate MOVs from undef.
497b8e80941Smrg       */
498b8e80941Smrg      break;
499b8e80941Smrg
500b8e80941Smrg   case nir_instr_type_jump:
501b8e80941Smrg      nir_emit_jump(abld, nir_instr_as_jump(instr));
502b8e80941Smrg      break;
503b8e80941Smrg
504b8e80941Smrg   default:
505b8e80941Smrg      unreachable("unknown instruction type");
506b8e80941Smrg   }
507b8e80941Smrg}
508b8e80941Smrg
/**
 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
 * match instr.
 *
 * When this ALU instruction's first source is produced by an
 * extract_u8/u16/i8/i16, the extract is folded into this instruction by
 * reading the selected sub-word of the extract's own source (via
 * subscript()) in a single MOV to `result`.
 *
 * Returns true if the optimization applied; false means the caller must
 * emit the operation normally.
 */
bool
fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
                                      const fs_reg &result)
{
   /* The source must be SSA with a known producing instruction. */
   if (!instr->src[0].src.is_ssa ||
       !instr->src[0].src.ssa->parent_instr)
      return false;

   if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *src0 =
      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);

   if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
       src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
      return false;

   /* If either opcode has source modifiers, bail.
    *
    * TODO: We can potentially handle source modifiers if both of the opcodes
    * we're combining are signed integers.
    */
   if (instr->src[0].abs || instr->src[0].negate ||
       src0->src[0].abs || src0->src[0].negate)
      return false;

   /* Which byte/word to extract (the extract's second, constant source). */
   unsigned element = nir_src_as_uint(src0->src[1].src);

   /* Element type to extract: 1- or 2-byte, signed or unsigned. */
   const brw_reg_type type = brw_int_type(
      src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
      src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);

   fs_reg op0 = get_nir_src(src0->src[0].src);
   op0.type = brw_type_for_nir_type(devinfo,
      (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
                     nir_src_bit_size(src0->src[0].src)));
   /* Honor the extract's swizzle on its own source. */
   op0 = offset(op0, bld, src0->src[0].swizzle[0]);

   set_saturate(instr->dest.saturate,
                bld.MOV(result, subscript(op0, type, element)));
   return true;
}
557b8e80941Smrg
/**
 * Recognize bcsel(gl_FrontFacing, a, b) where a and b are the constants
 * +1.0/-1.0 with opposite signs, and emit it as a two-instruction OR/AND
 * bit manipulation of the payload bit that encodes facing-ness, instead of
 * a compare plus predicated select.
 *
 * Returns true (and writes `result`) on success; false if the pattern does
 * not apply and the caller must lower the bcsel normally.
 */
bool
fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
                                         const fs_reg &result)
{
   /* The condition must literally be the load_front_face intrinsic. */
   nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
   if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
      return false;

   if (!nir_src_is_const(instr->src[1].src) ||
       !nir_src_is_const(instr->src[2].src))
      return false;

   const float value1 = nir_src_as_float(instr->src[1].src);
   const float value2 = nir_src_as_float(instr->src[2].src);
   if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
      return false;

   /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
   assert(value1 == -value2);

   fs_reg tmp = vgrf(glsl_type::int_type);

   if (devinfo->gen >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
       *
       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1 == -1.0f) {
         g0.negate = true;
      }

      /* 0x3f80 in the high word of tmp is the upper half of 1.0f; the facing
       * bit lands in the float's sign-bit position.
       */
      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
             g0, brw_imm_uw(0x3f80));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
       *
       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1 == -1.0f) {
         g1_6.negate = true;
      }

      bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
   }
   /* Mask to sign bit + 1.0f's exponent/mantissa, yielding ±1.0f. */
   bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));

   return true;
}
626b8e80941Smrg
/**
 * Implement GLSL findMSB() in terms of the LZD (leading zero detect)
 * instruction, for signed (is_signed) or unsigned 32-bit sources.
 *
 * Writes into `result` the bit position counted from the LSB, or -1 when no
 * bit is set (and, for signed, for an input of -1), per the GLSL spec.
 */
static void
emit_find_msb_using_lzd(const fs_builder &bld,
                        const fs_reg &result,
                        const fs_reg &src,
                        bool is_signed)
{
   fs_inst *inst;
   fs_reg temp = src;

   if (is_signed) {
      /* LZD of an absolute value source almost always does the right
       * thing.  There are two problem values:
       *
       * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
       *   0.  However, findMSB(int(0x80000000)) == 30.
       *
       * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
       *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
       *
       *    For a value of zero or negative one, -1 will be returned.
       *
       * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
       *   findMSB(-(1<<x)) should return x-1.
       *
       * For all negative number cases, including 0x80000000 and
       * 0xffffffff, the correct value is obtained from LZD if instead of
       * negating the (already negative) value the logical-not is used.  A
       * conditional logical-not can be achieved in two instructions.
       */
      temp = bld.vgrf(BRW_REGISTER_TYPE_D);

      /* ASR by 31 yields all-ones for negative inputs and zero otherwise;
       * XOR with the source then gives ~src for negative and src for
       * non-negative inputs — the conditional logical-not described above.
       */
      bld.ASR(temp, src, brw_imm_d(31));
      bld.XOR(temp, temp, src);
   }

   bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
           retype(temp, BRW_REGISTER_TYPE_UD));

   /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
    * from the LSB side. Subtract the result from 31 to convert the MSB
    * count into an LSB count.  If no bits are set, LZD will return 32.
    * 31-32 = -1, which is exactly what findMSB() is supposed to return.
    */
   inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
   inst->src[0].negate = true;
}
673b8e80941Smrg
674b8e80941Smrgstatic brw_rnd_mode
675b8e80941Smrgbrw_rnd_mode_from_nir_op (const nir_op op) {
676b8e80941Smrg   switch (op) {
677b8e80941Smrg   case nir_op_f2f16_rtz:
678b8e80941Smrg      return BRW_RND_MODE_RTZ;
679b8e80941Smrg   case nir_op_f2f16_rtne:
680b8e80941Smrg      return BRW_RND_MODE_RTNE;
681b8e80941Smrg   default:
682b8e80941Smrg      unreachable("Operation doesn't support rounding mode");
683b8e80941Smrg   }
684b8e80941Smrg}
685b8e80941Smrg
686b8e80941Smrgfs_reg
687b8e80941Smrgfs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
688b8e80941Smrg                                                nir_alu_instr *instr,
689b8e80941Smrg                                                fs_reg *op,
690b8e80941Smrg                                                bool need_dest)
691b8e80941Smrg{
692b8e80941Smrg   fs_reg result =
693b8e80941Smrg      need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud();
694b8e80941Smrg
695b8e80941Smrg   result.type = brw_type_for_nir_type(devinfo,
696b8e80941Smrg      (nir_alu_type)(nir_op_infos[instr->op].output_type |
697b8e80941Smrg                     nir_dest_bit_size(instr->dest.dest)));
698b8e80941Smrg
699b8e80941Smrg   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
700b8e80941Smrg      op[i] = get_nir_src(instr->src[i].src);
701b8e80941Smrg      op[i].type = brw_type_for_nir_type(devinfo,
702b8e80941Smrg         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
703b8e80941Smrg                        nir_src_bit_size(instr->src[i].src)));
704b8e80941Smrg      op[i].abs = instr->src[i].abs;
705b8e80941Smrg      op[i].negate = instr->src[i].negate;
706b8e80941Smrg   }
707b8e80941Smrg
708b8e80941Smrg   /* Move and vecN instrutions may still be vectored.  Return the raw,
709b8e80941Smrg    * vectored source and destination so that fs_visitor::nir_emit_alu can
710b8e80941Smrg    * handle it.  Other callers should not have to handle these kinds of
711b8e80941Smrg    * instructions.
712b8e80941Smrg    */
713b8e80941Smrg   switch (instr->op) {
714b8e80941Smrg   case nir_op_imov:
715b8e80941Smrg   case nir_op_fmov:
716b8e80941Smrg   case nir_op_vec2:
717b8e80941Smrg   case nir_op_vec3:
718b8e80941Smrg   case nir_op_vec4:
719b8e80941Smrg      return result;
720b8e80941Smrg   default:
721b8e80941Smrg      break;
722b8e80941Smrg   }
723b8e80941Smrg
724b8e80941Smrg   /* At this point, we have dealt with any instruction that operates on
725b8e80941Smrg    * more than a single channel.  Therefore, we can just adjust the source
726b8e80941Smrg    * and destination registers for that channel and emit the instruction.
727b8e80941Smrg    */
728b8e80941Smrg   unsigned channel = 0;
729b8e80941Smrg   if (nir_op_infos[instr->op].output_size == 0) {
730b8e80941Smrg      /* Since NIR is doing the scalarizing for us, we should only ever see
731b8e80941Smrg       * vectorized operations with a single channel.
732b8e80941Smrg       */
733b8e80941Smrg      assert(util_bitcount(instr->dest.write_mask) == 1);
734b8e80941Smrg      channel = ffs(instr->dest.write_mask) - 1;
735b8e80941Smrg
736b8e80941Smrg      result = offset(result, bld, channel);
737b8e80941Smrg   }
738b8e80941Smrg
739b8e80941Smrg   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
740b8e80941Smrg      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
741b8e80941Smrg      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
742b8e80941Smrg   }
743b8e80941Smrg
744b8e80941Smrg   return result;
745b8e80941Smrg}
746b8e80941Smrg
747b8e80941Smrgvoid
748b8e80941Smrgfs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
749b8e80941Smrg                                 fs_reg *op)
750b8e80941Smrg{
751b8e80941Smrg   for (unsigned i = 0; i < 2; i++) {
752b8e80941Smrg      nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);
753b8e80941Smrg
754b8e80941Smrg      if (inot_instr != NULL && inot_instr->op == nir_op_inot &&
755b8e80941Smrg          !inot_instr->src[0].abs && !inot_instr->src[0].negate) {
756b8e80941Smrg         /* The source of the inot is now the source of instr. */
757b8e80941Smrg         prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);
758b8e80941Smrg
759b8e80941Smrg         assert(!op[i].negate);
760b8e80941Smrg         op[i].negate = true;
761b8e80941Smrg      } else {
762b8e80941Smrg         op[i] = resolve_source_modifiers(op[i]);
763b8e80941Smrg      }
764b8e80941Smrg   }
765b8e80941Smrg}
766b8e80941Smrg
767b8e80941Smrgbool
768b8e80941Smrgfs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
769b8e80941Smrg                                  fs_reg result,
770b8e80941Smrg                                  nir_alu_instr *instr)
771b8e80941Smrg{
772b8e80941Smrg   if (devinfo->gen < 6 || devinfo->gen >= 12)
773b8e80941Smrg      return false;
774b8e80941Smrg
775b8e80941Smrg   nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);
776b8e80941Smrg
777b8e80941Smrg   if (inot_instr == NULL || inot_instr->op != nir_op_inot)
778b8e80941Smrg      return false;
779b8e80941Smrg
780b8e80941Smrg   /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
781b8e80941Smrg    * of valid size-changing combinations is a bit more complex.
782b8e80941Smrg    *
783b8e80941Smrg    * The source restriction is just because I was lazy about generating the
784b8e80941Smrg    * constant below.
785b8e80941Smrg    */
786b8e80941Smrg   if (nir_dest_bit_size(instr->dest.dest) != 32 ||
787b8e80941Smrg       nir_src_bit_size(inot_instr->src[0].src) != 32)
788b8e80941Smrg      return false;
789b8e80941Smrg
790b8e80941Smrg   /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
791b8e80941Smrg    * this is float(1 + a).
792b8e80941Smrg    */
793b8e80941Smrg   fs_reg op;
794b8e80941Smrg
795b8e80941Smrg   prepare_alu_destination_and_sources(bld, inot_instr, &op, false);
796b8e80941Smrg
797b8e80941Smrg   /* Ignore the saturate modifier, if there is one.  The result of the
798b8e80941Smrg    * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
799b8e80941Smrg    */
800b8e80941Smrg   bld.ADD(result, op, brw_imm_d(1));
801b8e80941Smrg
802b8e80941Smrg   return true;
803b8e80941Smrg}
804b8e80941Smrg
805b8e80941Smrg/**
806b8e80941Smrg * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
807b8e80941Smrg *
808b8e80941Smrg * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
809b8e80941Smrg * the source of \c instr that is a \c nir_op_fsign.
810b8e80941Smrg */
811b8e80941Smrgvoid
812b8e80941Smrgfs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr,
813b8e80941Smrg                       fs_reg result, fs_reg *op, unsigned fsign_src)
814b8e80941Smrg{
815b8e80941Smrg   fs_inst *inst;
816b8e80941Smrg
817b8e80941Smrg   assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
818b8e80941Smrg   assert(fsign_src < nir_op_infos[instr->op].num_inputs);
819b8e80941Smrg
820b8e80941Smrg   if (instr->op != nir_op_fsign) {
821b8e80941Smrg      const nir_alu_instr *const fsign_instr =
822b8e80941Smrg         nir_src_as_alu_instr(instr->src[fsign_src].src);
823b8e80941Smrg
824b8e80941Smrg      assert(!fsign_instr->dest.saturate);
825b8e80941Smrg
826b8e80941Smrg      /* op[fsign_src] has the nominal result of the fsign, and op[1 -
827b8e80941Smrg       * fsign_src] has the other multiply source.  This must be rearranged so
828b8e80941Smrg       * that op[0] is the source of the fsign op[1] is the other multiply
829b8e80941Smrg       * source.
830b8e80941Smrg       */
831b8e80941Smrg      if (fsign_src != 0)
832b8e80941Smrg         op[1] = op[0];
833b8e80941Smrg
834b8e80941Smrg      op[0] = get_nir_src(fsign_instr->src[0].src);
835b8e80941Smrg
836b8e80941Smrg      const nir_alu_type t =
837b8e80941Smrg         (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
838b8e80941Smrg                        nir_src_bit_size(fsign_instr->src[0].src));
839b8e80941Smrg
840b8e80941Smrg      op[0].type = brw_type_for_nir_type(devinfo, t);
841b8e80941Smrg      op[0].abs = fsign_instr->src[0].abs;
842b8e80941Smrg      op[0].negate = fsign_instr->src[0].negate;
843b8e80941Smrg
844b8e80941Smrg      unsigned channel = 0;
845b8e80941Smrg      if (nir_op_infos[instr->op].output_size == 0) {
846b8e80941Smrg         /* Since NIR is doing the scalarizing for us, we should only ever see
847b8e80941Smrg          * vectorized operations with a single channel.
848b8e80941Smrg          */
849b8e80941Smrg         assert(util_bitcount(instr->dest.write_mask) == 1);
850b8e80941Smrg         channel = ffs(instr->dest.write_mask) - 1;
851b8e80941Smrg      }
852b8e80941Smrg
853b8e80941Smrg      op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
854b8e80941Smrg   } else {
855b8e80941Smrg      assert(!instr->dest.saturate);
856b8e80941Smrg   }
857b8e80941Smrg
858b8e80941Smrg   if (op[0].abs) {
859b8e80941Smrg      /* Straightforward since the source can be assumed to be either strictly
860b8e80941Smrg       * >= 0 or strictly <= 0 depending on the setting of the negate flag.
861b8e80941Smrg       */
862b8e80941Smrg      set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
863b8e80941Smrg
864b8e80941Smrg      if (instr->op == nir_op_fsign) {
865b8e80941Smrg         inst = (op[0].negate)
866b8e80941Smrg            ? bld.MOV(result, brw_imm_f(-1.0f))
867b8e80941Smrg            : bld.MOV(result, brw_imm_f(1.0f));
868b8e80941Smrg      } else {
869b8e80941Smrg         op[1].negate = (op[0].negate != op[1].negate);
870b8e80941Smrg         inst = bld.MOV(result, op[1]);
871b8e80941Smrg      }
872b8e80941Smrg
873b8e80941Smrg      set_predicate(BRW_PREDICATE_NORMAL, inst);
874b8e80941Smrg   } else if (type_sz(op[0].type) == 2) {
875b8e80941Smrg      /* AND(val, 0x8000) gives the sign bit.
876b8e80941Smrg       *
877b8e80941Smrg       * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
878b8e80941Smrg       */
879b8e80941Smrg      fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
880b8e80941Smrg      bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);
881b8e80941Smrg
882b8e80941Smrg      op[0].type = BRW_REGISTER_TYPE_UW;
883b8e80941Smrg      result.type = BRW_REGISTER_TYPE_UW;
884b8e80941Smrg      bld.AND(result, op[0], brw_imm_uw(0x8000u));
885b8e80941Smrg
886b8e80941Smrg      if (instr->op == nir_op_fsign)
887b8e80941Smrg         inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
888b8e80941Smrg      else {
889b8e80941Smrg         /* Use XOR here to get the result sign correct. */
890b8e80941Smrg         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
891b8e80941Smrg      }
892b8e80941Smrg
893b8e80941Smrg      inst->predicate = BRW_PREDICATE_NORMAL;
894b8e80941Smrg   } else if (type_sz(op[0].type) == 4) {
895b8e80941Smrg      /* AND(val, 0x80000000) gives the sign bit.
896b8e80941Smrg       *
897b8e80941Smrg       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
898b8e80941Smrg       * zero.
899b8e80941Smrg       */
900b8e80941Smrg      bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
901b8e80941Smrg
902b8e80941Smrg      op[0].type = BRW_REGISTER_TYPE_UD;
903b8e80941Smrg      result.type = BRW_REGISTER_TYPE_UD;
904b8e80941Smrg      bld.AND(result, op[0], brw_imm_ud(0x80000000u));
905b8e80941Smrg
906b8e80941Smrg      if (instr->op == nir_op_fsign)
907b8e80941Smrg         inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
908b8e80941Smrg      else {
909b8e80941Smrg         /* Use XOR here to get the result sign correct. */
910b8e80941Smrg         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
911b8e80941Smrg      }
912b8e80941Smrg
913b8e80941Smrg      inst->predicate = BRW_PREDICATE_NORMAL;
914b8e80941Smrg   } else {
915b8e80941Smrg      /* For doubles we do the same but we need to consider:
916b8e80941Smrg       *
917b8e80941Smrg       * - 2-src instructions can't operate with 64-bit immediates
918b8e80941Smrg       * - The sign is encoded in the high 32-bit of each DF
919b8e80941Smrg       * - We need to produce a DF result.
920b8e80941Smrg       */
921b8e80941Smrg
922b8e80941Smrg      fs_reg zero = vgrf(glsl_type::double_type);
923b8e80941Smrg      bld.MOV(zero, setup_imm_df(bld, 0.0));
924b8e80941Smrg      bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);
925b8e80941Smrg
926b8e80941Smrg      bld.MOV(result, zero);
927b8e80941Smrg
928b8e80941Smrg      fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
929b8e80941Smrg      bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
930b8e80941Smrg              brw_imm_ud(0x80000000u));
931b8e80941Smrg
932b8e80941Smrg      if (instr->op == nir_op_fsign) {
933b8e80941Smrg         set_predicate(BRW_PREDICATE_NORMAL,
934b8e80941Smrg                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
935b8e80941Smrg      } else {
936b8e80941Smrg         /* This could be done better in some cases.  If the scale is an
937b8e80941Smrg          * immediate with the low 32-bits all 0, emitting a separate XOR and
938b8e80941Smrg          * OR would allow an algebraic optimization to remove the OR.  There
939b8e80941Smrg          * are currently zero instances of fsign(double(x))*IMM in shader-db
940b8e80941Smrg          * or any test suite, so it is hard to care at this time.
941b8e80941Smrg          */
942b8e80941Smrg         fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
943b8e80941Smrg         inst = bld.XOR(result_int64, result_int64,
944b8e80941Smrg                        retype(op[1], BRW_REGISTER_TYPE_UQ));
945b8e80941Smrg      }
946b8e80941Smrg   }
947b8e80941Smrg}
948b8e80941Smrg
949b8e80941Smrg/**
950b8e80941Smrg * Deteremine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
951b8e80941Smrg *
952b8e80941Smrg * Checks the operands of a \c nir_op_fmul to determine whether or not
953b8e80941Smrg * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
954b8e80941Smrg *
955b8e80941Smrg * \param instr  The multiplication instruction
956b8e80941Smrg *
957b8e80941Smrg * \param fsign_src The source of \c instr that may or may not be a
958b8e80941Smrg *                  \c nir_op_fsign
959b8e80941Smrg */
960b8e80941Smrgstatic bool
961b8e80941Smrgcan_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
962b8e80941Smrg{
963b8e80941Smrg   assert(instr->op == nir_op_fmul);
964b8e80941Smrg
965b8e80941Smrg   nir_alu_instr *const fsign_instr =
966b8e80941Smrg      nir_src_as_alu_instr(instr->src[fsign_src].src);
967b8e80941Smrg
968b8e80941Smrg   /* Rules:
969b8e80941Smrg    *
970b8e80941Smrg    * 1. instr->src[fsign_src] must be a nir_op_fsign.
971b8e80941Smrg    * 2. The nir_op_fsign can only be used by this multiplication.
972b8e80941Smrg    * 3. The source that is the nir_op_fsign does not have source modifiers.
973b8e80941Smrg    *    \c emit_fsign only examines the source modifiers of the source of the
974b8e80941Smrg    *    \c nir_op_fsign.
975b8e80941Smrg    *
976b8e80941Smrg    * The nir_op_fsign must also not have the saturate modifier, but steps
977b8e80941Smrg    * have already been taken (in nir_opt_algebraic) to ensure that.
978b8e80941Smrg    */
979b8e80941Smrg   return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
980b8e80941Smrg          is_used_once(fsign_instr) &&
981b8e80941Smrg          !instr->src[fsign_src].abs && !instr->src[fsign_src].negate;
982b8e80941Smrg}
983b8e80941Smrg
984b8e80941Smrgvoid
985b8e80941Smrgfs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
986b8e80941Smrg{
987b8e80941Smrg   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
988b8e80941Smrg   fs_inst *inst;
989b8e80941Smrg
990b8e80941Smrg   fs_reg op[4];
991b8e80941Smrg   fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, true);
992b8e80941Smrg
993b8e80941Smrg   switch (instr->op) {
994b8e80941Smrg   case nir_op_imov:
995b8e80941Smrg   case nir_op_fmov:
996b8e80941Smrg   case nir_op_vec2:
997b8e80941Smrg   case nir_op_vec3:
998b8e80941Smrg   case nir_op_vec4: {
999b8e80941Smrg      fs_reg temp = result;
1000b8e80941Smrg      bool need_extra_copy = false;
1001b8e80941Smrg      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
1002b8e80941Smrg         if (!instr->src[i].src.is_ssa &&
1003b8e80941Smrg             instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
1004b8e80941Smrg            need_extra_copy = true;
1005b8e80941Smrg            temp = bld.vgrf(result.type, 4);
1006b8e80941Smrg            break;
1007b8e80941Smrg         }
1008b8e80941Smrg      }
1009b8e80941Smrg
1010b8e80941Smrg      for (unsigned i = 0; i < 4; i++) {
1011b8e80941Smrg         if (!(instr->dest.write_mask & (1 << i)))
1012b8e80941Smrg            continue;
1013b8e80941Smrg
1014b8e80941Smrg         if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
1015b8e80941Smrg            inst = bld.MOV(offset(temp, bld, i),
1016b8e80941Smrg                           offset(op[0], bld, instr->src[0].swizzle[i]));
1017b8e80941Smrg         } else {
1018b8e80941Smrg            inst = bld.MOV(offset(temp, bld, i),
1019b8e80941Smrg                           offset(op[i], bld, instr->src[i].swizzle[0]));
1020b8e80941Smrg         }
1021b8e80941Smrg         inst->saturate = instr->dest.saturate;
1022b8e80941Smrg      }
1023b8e80941Smrg
1024b8e80941Smrg      /* In this case the source and destination registers were the same,
1025b8e80941Smrg       * so we need to insert an extra set of moves in order to deal with
1026b8e80941Smrg       * any swizzling.
1027b8e80941Smrg       */
1028b8e80941Smrg      if (need_extra_copy) {
1029b8e80941Smrg         for (unsigned i = 0; i < 4; i++) {
1030b8e80941Smrg            if (!(instr->dest.write_mask & (1 << i)))
1031b8e80941Smrg               continue;
1032b8e80941Smrg
1033b8e80941Smrg            bld.MOV(offset(result, bld, i), offset(temp, bld, i));
1034b8e80941Smrg         }
1035b8e80941Smrg      }
1036b8e80941Smrg      return;
1037b8e80941Smrg   }
1038b8e80941Smrg
1039b8e80941Smrg   case nir_op_i2f32:
1040b8e80941Smrg   case nir_op_u2f32:
1041b8e80941Smrg      if (optimize_extract_to_float(instr, result))
1042b8e80941Smrg         return;
1043b8e80941Smrg      inst = bld.MOV(result, op[0]);
1044b8e80941Smrg      inst->saturate = instr->dest.saturate;
1045b8e80941Smrg      break;
1046b8e80941Smrg
1047b8e80941Smrg   case nir_op_f2f16_rtne:
1048b8e80941Smrg   case nir_op_f2f16_rtz:
1049b8e80941Smrg      bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
1050b8e80941Smrg               brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
1051b8e80941Smrg      /* fallthrough */
1052b8e80941Smrg   case nir_op_f2f16:
1053b8e80941Smrg      /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
1054b8e80941Smrg       * on the HW gen, it is a special hw opcode or just a MOV, and
1055b8e80941Smrg       * brw_F32TO16 (at brw_eu_emit) would do the work to chose.
1056b8e80941Smrg       *
1057b8e80941Smrg       * But if we want to use that opcode, we need to provide support on
1058b8e80941Smrg       * different optimizations and lowerings. As right now HF support is
1059b8e80941Smrg       * only for gen8+, it will be better to use directly the MOV, and use
1060b8e80941Smrg       * BRW_OPCODE_F32TO16 when/if we work for HF support on gen7.
1061b8e80941Smrg       */
1062b8e80941Smrg      assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1063b8e80941Smrg      inst = bld.MOV(result, op[0]);
1064b8e80941Smrg      inst->saturate = instr->dest.saturate;
1065b8e80941Smrg      break;
1066b8e80941Smrg
1067b8e80941Smrg   case nir_op_b2i8:
1068b8e80941Smrg   case nir_op_b2i16:
1069b8e80941Smrg   case nir_op_b2i32:
1070b8e80941Smrg   case nir_op_b2i64:
1071b8e80941Smrg   case nir_op_b2f16:
1072b8e80941Smrg   case nir_op_b2f32:
1073b8e80941Smrg   case nir_op_b2f64:
1074b8e80941Smrg      if (try_emit_b2fi_of_inot(bld, result, instr))
1075b8e80941Smrg         break;
1076b8e80941Smrg      op[0].type = BRW_REGISTER_TYPE_D;
1077b8e80941Smrg      op[0].negate = !op[0].negate;
1078b8e80941Smrg      /* fallthrough */
1079b8e80941Smrg   case nir_op_i2f64:
1080b8e80941Smrg   case nir_op_i2i64:
1081b8e80941Smrg   case nir_op_u2f64:
1082b8e80941Smrg   case nir_op_u2u64:
1083b8e80941Smrg   case nir_op_f2f64:
1084b8e80941Smrg   case nir_op_f2i64:
1085b8e80941Smrg   case nir_op_f2u64:
1086b8e80941Smrg   case nir_op_i2i32:
1087b8e80941Smrg   case nir_op_u2u32:
1088b8e80941Smrg   case nir_op_f2f32:
1089b8e80941Smrg   case nir_op_f2i32:
1090b8e80941Smrg   case nir_op_f2u32:
1091b8e80941Smrg   case nir_op_i2f16:
1092b8e80941Smrg   case nir_op_i2i16:
1093b8e80941Smrg   case nir_op_u2f16:
1094b8e80941Smrg   case nir_op_u2u16:
1095b8e80941Smrg   case nir_op_f2i16:
1096b8e80941Smrg   case nir_op_f2u16:
1097b8e80941Smrg   case nir_op_i2i8:
1098b8e80941Smrg   case nir_op_u2u8:
1099b8e80941Smrg   case nir_op_f2i8:
1100b8e80941Smrg   case nir_op_f2u8:
1101b8e80941Smrg      if (result.type == BRW_REGISTER_TYPE_B ||
1102b8e80941Smrg          result.type == BRW_REGISTER_TYPE_UB ||
1103b8e80941Smrg          result.type == BRW_REGISTER_TYPE_HF)
1104b8e80941Smrg         assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
1105b8e80941Smrg
1106b8e80941Smrg      if (op[0].type == BRW_REGISTER_TYPE_B ||
1107b8e80941Smrg          op[0].type == BRW_REGISTER_TYPE_UB ||
1108b8e80941Smrg          op[0].type == BRW_REGISTER_TYPE_HF)
1109b8e80941Smrg         assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */
1110b8e80941Smrg
1111b8e80941Smrg      inst = bld.MOV(result, op[0]);
1112b8e80941Smrg      inst->saturate = instr->dest.saturate;
1113b8e80941Smrg      break;
1114b8e80941Smrg
1115b8e80941Smrg   case nir_op_fsign:
1116b8e80941Smrg      emit_fsign(bld, instr, result, op, 0);
1117b8e80941Smrg      break;
1118b8e80941Smrg
1119b8e80941Smrg   case nir_op_frcp:
1120b8e80941Smrg      inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
1121b8e80941Smrg      inst->saturate = instr->dest.saturate;
1122b8e80941Smrg      break;
1123b8e80941Smrg
1124b8e80941Smrg   case nir_op_fexp2:
1125b8e80941Smrg      inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
1126b8e80941Smrg      inst->saturate = instr->dest.saturate;
1127b8e80941Smrg      break;
1128b8e80941Smrg
1129b8e80941Smrg   case nir_op_flog2:
1130b8e80941Smrg      inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
1131b8e80941Smrg      inst->saturate = instr->dest.saturate;
1132b8e80941Smrg      break;
1133b8e80941Smrg
1134b8e80941Smrg   case nir_op_fsin:
1135b8e80941Smrg      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
1136b8e80941Smrg      inst->saturate = instr->dest.saturate;
1137b8e80941Smrg      break;
1138b8e80941Smrg
1139b8e80941Smrg   case nir_op_fcos:
1140b8e80941Smrg      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
1141b8e80941Smrg      inst->saturate = instr->dest.saturate;
1142b8e80941Smrg      break;
1143b8e80941Smrg
1144b8e80941Smrg   case nir_op_fddx:
1145b8e80941Smrg      if (fs_key->high_quality_derivatives) {
1146b8e80941Smrg         inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1147b8e80941Smrg      } else {
1148b8e80941Smrg         inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1149b8e80941Smrg      }
1150b8e80941Smrg      inst->saturate = instr->dest.saturate;
1151b8e80941Smrg      break;
1152b8e80941Smrg   case nir_op_fddx_fine:
1153b8e80941Smrg      inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
1154b8e80941Smrg      inst->saturate = instr->dest.saturate;
1155b8e80941Smrg      break;
1156b8e80941Smrg   case nir_op_fddx_coarse:
1157b8e80941Smrg      inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
1158b8e80941Smrg      inst->saturate = instr->dest.saturate;
1159b8e80941Smrg      break;
1160b8e80941Smrg   case nir_op_fddy:
1161b8e80941Smrg      if (fs_key->high_quality_derivatives) {
1162b8e80941Smrg         inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1163b8e80941Smrg      } else {
1164b8e80941Smrg         inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1165b8e80941Smrg      }
1166b8e80941Smrg      inst->saturate = instr->dest.saturate;
1167b8e80941Smrg      break;
1168b8e80941Smrg   case nir_op_fddy_fine:
1169b8e80941Smrg      inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
1170b8e80941Smrg      inst->saturate = instr->dest.saturate;
1171b8e80941Smrg      break;
1172b8e80941Smrg   case nir_op_fddy_coarse:
1173b8e80941Smrg      inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
1174b8e80941Smrg      inst->saturate = instr->dest.saturate;
1175b8e80941Smrg      break;
1176b8e80941Smrg
1177b8e80941Smrg   case nir_op_iadd:
1178b8e80941Smrg   case nir_op_fadd:
1179b8e80941Smrg      inst = bld.ADD(result, op[0], op[1]);
1180b8e80941Smrg      inst->saturate = instr->dest.saturate;
1181b8e80941Smrg      break;
1182b8e80941Smrg
1183b8e80941Smrg   case nir_op_uadd_sat:
1184b8e80941Smrg      inst = bld.ADD(result, op[0], op[1]);
1185b8e80941Smrg      inst->saturate = true;
1186b8e80941Smrg      break;
1187b8e80941Smrg
1188b8e80941Smrg   case nir_op_fmul:
1189b8e80941Smrg      for (unsigned i = 0; i < 2; i++) {
1190b8e80941Smrg         if (can_fuse_fmul_fsign(instr, i)) {
1191b8e80941Smrg            emit_fsign(bld, instr, result, op, i);
1192b8e80941Smrg            return;
1193b8e80941Smrg         }
1194b8e80941Smrg      }
1195b8e80941Smrg
1196b8e80941Smrg      inst = bld.MUL(result, op[0], op[1]);
1197b8e80941Smrg      inst->saturate = instr->dest.saturate;
1198b8e80941Smrg      break;
1199b8e80941Smrg
1200b8e80941Smrg   case nir_op_imul_2x32_64:
1201b8e80941Smrg   case nir_op_umul_2x32_64:
1202b8e80941Smrg      bld.MUL(result, op[0], op[1]);
1203b8e80941Smrg      break;
1204b8e80941Smrg
1205b8e80941Smrg   case nir_op_imul:
1206b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1207b8e80941Smrg      bld.MUL(result, op[0], op[1]);
1208b8e80941Smrg      break;
1209b8e80941Smrg
1210b8e80941Smrg   case nir_op_imul_high:
1211b8e80941Smrg   case nir_op_umul_high:
1212b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1213b8e80941Smrg      bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
1214b8e80941Smrg      break;
1215b8e80941Smrg
1216b8e80941Smrg   case nir_op_idiv:
1217b8e80941Smrg   case nir_op_udiv:
1218b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1219b8e80941Smrg      bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
1220b8e80941Smrg      break;
1221b8e80941Smrg
1222b8e80941Smrg   case nir_op_uadd_carry:
1223b8e80941Smrg      unreachable("Should have been lowered by carry_to_arith().");
1224b8e80941Smrg
1225b8e80941Smrg   case nir_op_usub_borrow:
1226b8e80941Smrg      unreachable("Should have been lowered by borrow_to_arith().");
1227b8e80941Smrg
1228b8e80941Smrg   case nir_op_umod:
1229b8e80941Smrg   case nir_op_irem:
1230b8e80941Smrg      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
1231b8e80941Smrg       * appears that our hardware just does the right thing for signed
1232b8e80941Smrg       * remainder.
1233b8e80941Smrg       */
1234b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1235b8e80941Smrg      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1236b8e80941Smrg      break;
1237b8e80941Smrg
1238b8e80941Smrg   case nir_op_imod: {
1239b8e80941Smrg      /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
1240b8e80941Smrg      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
1241b8e80941Smrg
1242b8e80941Smrg      /* Math instructions don't support conditional mod */
1243b8e80941Smrg      inst = bld.MOV(bld.null_reg_d(), result);
1244b8e80941Smrg      inst->conditional_mod = BRW_CONDITIONAL_NZ;
1245b8e80941Smrg
1246b8e80941Smrg      /* Now, we need to determine if signs of the sources are different.
1247b8e80941Smrg       * When we XOR the sources, the top bit is 0 if they are the same and 1
1248b8e80941Smrg       * if they are different.  We can then use a conditional modifier to
1249b8e80941Smrg       * turn that into a predicate.  This leads us to an XOR.l instruction.
1250b8e80941Smrg       *
1251b8e80941Smrg       * Technically, according to the PRM, you're not allowed to use .l on a
1252b8e80941Smrg       * XOR instruction.  However, emperical experiments and Curro's reading
1253b8e80941Smrg       * of the simulator source both indicate that it's safe.
1254b8e80941Smrg       */
1255b8e80941Smrg      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
1256b8e80941Smrg      inst = bld.XOR(tmp, op[0], op[1]);
1257b8e80941Smrg      inst->predicate = BRW_PREDICATE_NORMAL;
1258b8e80941Smrg      inst->conditional_mod = BRW_CONDITIONAL_L;
1259b8e80941Smrg
1260b8e80941Smrg      /* If the result of the initial remainder operation is non-zero and the
1261b8e80941Smrg       * two sources have different signs, add in a copy of op[1] to get the
1262b8e80941Smrg       * final integer modulus value.
1263b8e80941Smrg       */
1264b8e80941Smrg      inst = bld.ADD(result, result, op[1]);
1265b8e80941Smrg      inst->predicate = BRW_PREDICATE_NORMAL;
1266b8e80941Smrg      break;
1267b8e80941Smrg   }
1268b8e80941Smrg
1269b8e80941Smrg   case nir_op_flt32:
1270b8e80941Smrg   case nir_op_fge32:
1271b8e80941Smrg   case nir_op_feq32:
1272b8e80941Smrg   case nir_op_fne32: {
1273b8e80941Smrg      fs_reg dest = result;
1274b8e80941Smrg
1275b8e80941Smrg      const uint32_t bit_size =  nir_src_bit_size(instr->src[0].src);
1276b8e80941Smrg      if (bit_size != 32)
1277b8e80941Smrg         dest = bld.vgrf(op[0].type, 1);
1278b8e80941Smrg
1279b8e80941Smrg      brw_conditional_mod cond;
1280b8e80941Smrg      switch (instr->op) {
1281b8e80941Smrg      case nir_op_flt32:
1282b8e80941Smrg         cond = BRW_CONDITIONAL_L;
1283b8e80941Smrg         break;
1284b8e80941Smrg      case nir_op_fge32:
1285b8e80941Smrg         cond = BRW_CONDITIONAL_GE;
1286b8e80941Smrg         break;
1287b8e80941Smrg      case nir_op_feq32:
1288b8e80941Smrg         cond = BRW_CONDITIONAL_Z;
1289b8e80941Smrg         break;
1290b8e80941Smrg      case nir_op_fne32:
1291b8e80941Smrg         cond = BRW_CONDITIONAL_NZ;
1292b8e80941Smrg         break;
1293b8e80941Smrg      default:
1294b8e80941Smrg         unreachable("bad opcode");
1295b8e80941Smrg      }
1296b8e80941Smrg
1297b8e80941Smrg      bld.CMP(dest, op[0], op[1], cond);
1298b8e80941Smrg
1299b8e80941Smrg      if (bit_size > 32) {
1300b8e80941Smrg         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1301b8e80941Smrg      } else if(bit_size < 32) {
1302b8e80941Smrg         /* When we convert the result to 32-bit we need to be careful and do
1303b8e80941Smrg          * it as a signed conversion to get sign extension (for 32-bit true)
1304b8e80941Smrg          */
1305b8e80941Smrg         const brw_reg_type src_type =
1306b8e80941Smrg            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1307b8e80941Smrg
1308b8e80941Smrg         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1309b8e80941Smrg      }
1310b8e80941Smrg      break;
1311b8e80941Smrg   }
1312b8e80941Smrg
1313b8e80941Smrg   case nir_op_ilt32:
1314b8e80941Smrg   case nir_op_ult32:
1315b8e80941Smrg   case nir_op_ige32:
1316b8e80941Smrg   case nir_op_uge32:
1317b8e80941Smrg   case nir_op_ieq32:
1318b8e80941Smrg   case nir_op_ine32: {
1319b8e80941Smrg      fs_reg dest = result;
1320b8e80941Smrg
1321b8e80941Smrg      /* On Gen11 we have an additional issue being that src1 cannot be a byte
1322b8e80941Smrg       * type. So we convert both operands for the comparison.
1323b8e80941Smrg       */
1324b8e80941Smrg      fs_reg temp_op[2];
1325b8e80941Smrg      temp_op[0] = bld.fix_byte_src(op[0]);
1326b8e80941Smrg      temp_op[1] = bld.fix_byte_src(op[1]);
1327b8e80941Smrg
1328b8e80941Smrg      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1329b8e80941Smrg      if (bit_size != 32)
1330b8e80941Smrg         dest = bld.vgrf(temp_op[0].type, 1);
1331b8e80941Smrg
1332b8e80941Smrg      brw_conditional_mod cond;
1333b8e80941Smrg      switch (instr->op) {
1334b8e80941Smrg      case nir_op_ilt32:
1335b8e80941Smrg      case nir_op_ult32:
1336b8e80941Smrg         cond = BRW_CONDITIONAL_L;
1337b8e80941Smrg         break;
1338b8e80941Smrg      case nir_op_ige32:
1339b8e80941Smrg      case nir_op_uge32:
1340b8e80941Smrg         cond = BRW_CONDITIONAL_GE;
1341b8e80941Smrg         break;
1342b8e80941Smrg      case nir_op_ieq32:
1343b8e80941Smrg         cond = BRW_CONDITIONAL_Z;
1344b8e80941Smrg         break;
1345b8e80941Smrg      case nir_op_ine32:
1346b8e80941Smrg         cond = BRW_CONDITIONAL_NZ;
1347b8e80941Smrg         break;
1348b8e80941Smrg      default:
1349b8e80941Smrg         unreachable("bad opcode");
1350b8e80941Smrg      }
1351b8e80941Smrg      bld.CMP(dest, temp_op[0], temp_op[1], cond);
1352b8e80941Smrg
1353b8e80941Smrg      if (bit_size > 32) {
1354b8e80941Smrg         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
1355b8e80941Smrg      } else if (bit_size < 32) {
1356b8e80941Smrg         /* When we convert the result to 32-bit we need to be careful and do
1357b8e80941Smrg          * it as a signed conversion to get sign extension (for 32-bit true)
1358b8e80941Smrg          */
1359b8e80941Smrg         const brw_reg_type src_type =
1360b8e80941Smrg            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);
1361b8e80941Smrg
1362b8e80941Smrg         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
1363b8e80941Smrg      }
1364b8e80941Smrg      break;
1365b8e80941Smrg   }
1366b8e80941Smrg
1367b8e80941Smrg   case nir_op_inot:
1368b8e80941Smrg      if (devinfo->gen >= 8) {
1369b8e80941Smrg         nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);
1370b8e80941Smrg
1371b8e80941Smrg         if (inot_src_instr != NULL &&
1372b8e80941Smrg             (inot_src_instr->op == nir_op_ior ||
1373b8e80941Smrg              inot_src_instr->op == nir_op_ixor ||
1374b8e80941Smrg              inot_src_instr->op == nir_op_iand) &&
1375b8e80941Smrg             !inot_src_instr->src[0].abs &&
1376b8e80941Smrg             !inot_src_instr->src[0].negate &&
1377b8e80941Smrg             !inot_src_instr->src[1].abs &&
1378b8e80941Smrg             !inot_src_instr->src[1].negate) {
1379b8e80941Smrg            /* The sources of the source logical instruction are now the
1380b8e80941Smrg             * sources of the instruction that will be generated.
1381b8e80941Smrg             */
1382b8e80941Smrg            prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);
1383b8e80941Smrg            resolve_inot_sources(bld, inot_src_instr, op);
1384b8e80941Smrg
1385b8e80941Smrg            /* Smash all of the sources and destination to be signed.  This
1386b8e80941Smrg             * doesn't matter for the operation of the instruction, but cmod
1387b8e80941Smrg             * propagation fails on unsigned sources with negation (due to
1388b8e80941Smrg             * fs_inst::can_do_cmod returning false).
1389b8e80941Smrg             */
1390b8e80941Smrg            result.type =
1391b8e80941Smrg               brw_type_for_nir_type(devinfo,
1392b8e80941Smrg                                     (nir_alu_type)(nir_type_int |
1393b8e80941Smrg                                                    nir_dest_bit_size(instr->dest.dest)));
1394b8e80941Smrg            op[0].type =
1395b8e80941Smrg               brw_type_for_nir_type(devinfo,
1396b8e80941Smrg                                     (nir_alu_type)(nir_type_int |
1397b8e80941Smrg                                                    nir_src_bit_size(inot_src_instr->src[0].src)));
1398b8e80941Smrg            op[1].type =
1399b8e80941Smrg               brw_type_for_nir_type(devinfo,
1400b8e80941Smrg                                     (nir_alu_type)(nir_type_int |
1401b8e80941Smrg                                                    nir_src_bit_size(inot_src_instr->src[1].src)));
1402b8e80941Smrg
1403b8e80941Smrg            /* For XOR, only invert one of the sources.  Arbitrarily choose
1404b8e80941Smrg             * the first source.
1405b8e80941Smrg             */
1406b8e80941Smrg            op[0].negate = !op[0].negate;
1407b8e80941Smrg            if (inot_src_instr->op != nir_op_ixor)
1408b8e80941Smrg               op[1].negate = !op[1].negate;
1409b8e80941Smrg
1410b8e80941Smrg            switch (inot_src_instr->op) {
1411b8e80941Smrg            case nir_op_ior:
1412b8e80941Smrg               bld.AND(result, op[0], op[1]);
1413b8e80941Smrg               return;
1414b8e80941Smrg
1415b8e80941Smrg            case nir_op_iand:
1416b8e80941Smrg               bld.OR(result, op[0], op[1]);
1417b8e80941Smrg               return;
1418b8e80941Smrg
1419b8e80941Smrg            case nir_op_ixor:
1420b8e80941Smrg               bld.XOR(result, op[0], op[1]);
1421b8e80941Smrg               return;
1422b8e80941Smrg
1423b8e80941Smrg            default:
1424b8e80941Smrg               unreachable("impossible opcode");
1425b8e80941Smrg            }
1426b8e80941Smrg         }
1427b8e80941Smrg         op[0] = resolve_source_modifiers(op[0]);
1428b8e80941Smrg      }
1429b8e80941Smrg      bld.NOT(result, op[0]);
1430b8e80941Smrg      break;
1431b8e80941Smrg   case nir_op_ixor:
1432b8e80941Smrg      if (devinfo->gen >= 8) {
1433b8e80941Smrg         resolve_inot_sources(bld, instr, op);
1434b8e80941Smrg      }
1435b8e80941Smrg      bld.XOR(result, op[0], op[1]);
1436b8e80941Smrg      break;
1437b8e80941Smrg   case nir_op_ior:
1438b8e80941Smrg      if (devinfo->gen >= 8) {
1439b8e80941Smrg         resolve_inot_sources(bld, instr, op);
1440b8e80941Smrg      }
1441b8e80941Smrg      bld.OR(result, op[0], op[1]);
1442b8e80941Smrg      break;
1443b8e80941Smrg   case nir_op_iand:
1444b8e80941Smrg      if (devinfo->gen >= 8) {
1445b8e80941Smrg         resolve_inot_sources(bld, instr, op);
1446b8e80941Smrg      }
1447b8e80941Smrg      bld.AND(result, op[0], op[1]);
1448b8e80941Smrg      break;
1449b8e80941Smrg
1450b8e80941Smrg   case nir_op_fdot2:
1451b8e80941Smrg   case nir_op_fdot3:
1452b8e80941Smrg   case nir_op_fdot4:
1453b8e80941Smrg   case nir_op_b32all_fequal2:
1454b8e80941Smrg   case nir_op_b32all_iequal2:
1455b8e80941Smrg   case nir_op_b32all_fequal3:
1456b8e80941Smrg   case nir_op_b32all_iequal3:
1457b8e80941Smrg   case nir_op_b32all_fequal4:
1458b8e80941Smrg   case nir_op_b32all_iequal4:
1459b8e80941Smrg   case nir_op_b32any_fnequal2:
1460b8e80941Smrg   case nir_op_b32any_inequal2:
1461b8e80941Smrg   case nir_op_b32any_fnequal3:
1462b8e80941Smrg   case nir_op_b32any_inequal3:
1463b8e80941Smrg   case nir_op_b32any_fnequal4:
1464b8e80941Smrg   case nir_op_b32any_inequal4:
1465b8e80941Smrg      unreachable("Lowered by nir_lower_alu_reductions");
1466b8e80941Smrg
1467b8e80941Smrg   case nir_op_fnoise1_1:
1468b8e80941Smrg   case nir_op_fnoise1_2:
1469b8e80941Smrg   case nir_op_fnoise1_3:
1470b8e80941Smrg   case nir_op_fnoise1_4:
1471b8e80941Smrg   case nir_op_fnoise2_1:
1472b8e80941Smrg   case nir_op_fnoise2_2:
1473b8e80941Smrg   case nir_op_fnoise2_3:
1474b8e80941Smrg   case nir_op_fnoise2_4:
1475b8e80941Smrg   case nir_op_fnoise3_1:
1476b8e80941Smrg   case nir_op_fnoise3_2:
1477b8e80941Smrg   case nir_op_fnoise3_3:
1478b8e80941Smrg   case nir_op_fnoise3_4:
1479b8e80941Smrg   case nir_op_fnoise4_1:
1480b8e80941Smrg   case nir_op_fnoise4_2:
1481b8e80941Smrg   case nir_op_fnoise4_3:
1482b8e80941Smrg   case nir_op_fnoise4_4:
1483b8e80941Smrg      unreachable("not reached: should be handled by lower_noise");
1484b8e80941Smrg
1485b8e80941Smrg   case nir_op_ldexp:
1486b8e80941Smrg      unreachable("not reached: should be handled by ldexp_to_arith()");
1487b8e80941Smrg
1488b8e80941Smrg   case nir_op_fsqrt:
1489b8e80941Smrg      inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
1490b8e80941Smrg      inst->saturate = instr->dest.saturate;
1491b8e80941Smrg      break;
1492b8e80941Smrg
1493b8e80941Smrg   case nir_op_frsq:
1494b8e80941Smrg      inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
1495b8e80941Smrg      inst->saturate = instr->dest.saturate;
1496b8e80941Smrg      break;
1497b8e80941Smrg
1498b8e80941Smrg   case nir_op_i2b32:
1499b8e80941Smrg   case nir_op_f2b32: {
1500b8e80941Smrg      uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
1501b8e80941Smrg      if (bit_size == 64) {
1502b8e80941Smrg         /* two-argument instructions can't take 64-bit immediates */
1503b8e80941Smrg         fs_reg zero;
1504b8e80941Smrg         fs_reg tmp;
1505b8e80941Smrg
1506b8e80941Smrg         if (instr->op == nir_op_f2b32) {
1507b8e80941Smrg            zero = vgrf(glsl_type::double_type);
1508b8e80941Smrg            tmp = vgrf(glsl_type::double_type);
1509b8e80941Smrg            bld.MOV(zero, setup_imm_df(bld, 0.0));
1510b8e80941Smrg         } else {
1511b8e80941Smrg            zero = vgrf(glsl_type::int64_t_type);
1512b8e80941Smrg            tmp = vgrf(glsl_type::int64_t_type);
1513b8e80941Smrg            bld.MOV(zero, brw_imm_q(0));
1514b8e80941Smrg         }
1515b8e80941Smrg
1516b8e80941Smrg         /* A SIMD16 execution needs to be split in two instructions, so use
1517b8e80941Smrg          * a vgrf instead of the flag register as dst so instruction splitting
1518b8e80941Smrg          * works
1519b8e80941Smrg          */
1520b8e80941Smrg         bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
1521b8e80941Smrg         bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
1522b8e80941Smrg      } else {
1523b8e80941Smrg         fs_reg zero;
1524b8e80941Smrg         if (bit_size == 32) {
1525b8e80941Smrg            zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0);
1526b8e80941Smrg         } else {
1527b8e80941Smrg            assert(bit_size == 16);
1528b8e80941Smrg            zero = instr->op == nir_op_f2b32 ?
1529b8e80941Smrg               retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
1530b8e80941Smrg         }
1531b8e80941Smrg         bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
1532b8e80941Smrg      }
1533b8e80941Smrg      break;
1534b8e80941Smrg   }
1535b8e80941Smrg
1536b8e80941Smrg   case nir_op_ftrunc:
1537b8e80941Smrg      inst = bld.RNDZ(result, op[0]);
1538b8e80941Smrg      inst->saturate = instr->dest.saturate;
1539b8e80941Smrg      break;
1540b8e80941Smrg
1541b8e80941Smrg   case nir_op_fceil: {
1542b8e80941Smrg      op[0].negate = !op[0].negate;
1543b8e80941Smrg      fs_reg temp = vgrf(glsl_type::float_type);
1544b8e80941Smrg      bld.RNDD(temp, op[0]);
1545b8e80941Smrg      temp.negate = true;
1546b8e80941Smrg      inst = bld.MOV(result, temp);
1547b8e80941Smrg      inst->saturate = instr->dest.saturate;
1548b8e80941Smrg      break;
1549b8e80941Smrg   }
1550b8e80941Smrg   case nir_op_ffloor:
1551b8e80941Smrg      inst = bld.RNDD(result, op[0]);
1552b8e80941Smrg      inst->saturate = instr->dest.saturate;
1553b8e80941Smrg      break;
1554b8e80941Smrg   case nir_op_ffract:
1555b8e80941Smrg      inst = bld.FRC(result, op[0]);
1556b8e80941Smrg      inst->saturate = instr->dest.saturate;
1557b8e80941Smrg      break;
1558b8e80941Smrg   case nir_op_fround_even:
1559b8e80941Smrg      inst = bld.RNDE(result, op[0]);
1560b8e80941Smrg      inst->saturate = instr->dest.saturate;
1561b8e80941Smrg      break;
1562b8e80941Smrg
1563b8e80941Smrg   case nir_op_fquantize2f16: {
1564b8e80941Smrg      fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
1565b8e80941Smrg      fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
1566b8e80941Smrg      fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
1567b8e80941Smrg
1568b8e80941Smrg      /* The destination stride must be at least as big as the source stride. */
1569b8e80941Smrg      tmp16.type = BRW_REGISTER_TYPE_W;
1570b8e80941Smrg      tmp16.stride = 2;
1571b8e80941Smrg
1572b8e80941Smrg      /* Check for denormal */
1573b8e80941Smrg      fs_reg abs_src0 = op[0];
1574b8e80941Smrg      abs_src0.abs = true;
1575b8e80941Smrg      bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
1576b8e80941Smrg              BRW_CONDITIONAL_L);
1577b8e80941Smrg      /* Get the appropriately signed zero */
1578b8e80941Smrg      bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
1579b8e80941Smrg              retype(op[0], BRW_REGISTER_TYPE_UD),
1580b8e80941Smrg              brw_imm_ud(0x80000000));
1581b8e80941Smrg      /* Do the actual F32 -> F16 -> F32 conversion */
1582b8e80941Smrg      bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
1583b8e80941Smrg      bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
1584b8e80941Smrg      /* Select that or zero based on normal status */
1585b8e80941Smrg      inst = bld.SEL(result, zero, tmp32);
1586b8e80941Smrg      inst->predicate = BRW_PREDICATE_NORMAL;
1587b8e80941Smrg      inst->saturate = instr->dest.saturate;
1588b8e80941Smrg      break;
1589b8e80941Smrg   }
1590b8e80941Smrg
1591b8e80941Smrg   case nir_op_imin:
1592b8e80941Smrg   case nir_op_umin:
1593b8e80941Smrg   case nir_op_fmin:
1594b8e80941Smrg      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
1595b8e80941Smrg      inst->saturate = instr->dest.saturate;
1596b8e80941Smrg      break;
1597b8e80941Smrg
1598b8e80941Smrg   case nir_op_imax:
1599b8e80941Smrg   case nir_op_umax:
1600b8e80941Smrg   case nir_op_fmax:
1601b8e80941Smrg      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
1602b8e80941Smrg      inst->saturate = instr->dest.saturate;
1603b8e80941Smrg      break;
1604b8e80941Smrg
1605b8e80941Smrg   case nir_op_pack_snorm_2x16:
1606b8e80941Smrg   case nir_op_pack_snorm_4x8:
1607b8e80941Smrg   case nir_op_pack_unorm_2x16:
1608b8e80941Smrg   case nir_op_pack_unorm_4x8:
1609b8e80941Smrg   case nir_op_unpack_snorm_2x16:
1610b8e80941Smrg   case nir_op_unpack_snorm_4x8:
1611b8e80941Smrg   case nir_op_unpack_unorm_2x16:
1612b8e80941Smrg   case nir_op_unpack_unorm_4x8:
1613b8e80941Smrg   case nir_op_unpack_half_2x16:
1614b8e80941Smrg   case nir_op_pack_half_2x16:
1615b8e80941Smrg      unreachable("not reached: should be handled by lower_packing_builtins");
1616b8e80941Smrg
1617b8e80941Smrg   case nir_op_unpack_half_2x16_split_x:
1618b8e80941Smrg      inst = bld.emit(BRW_OPCODE_F16TO32, result,
1619b8e80941Smrg                      subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1620b8e80941Smrg      inst->saturate = instr->dest.saturate;
1621b8e80941Smrg      break;
1622b8e80941Smrg   case nir_op_unpack_half_2x16_split_y:
1623b8e80941Smrg      inst = bld.emit(BRW_OPCODE_F16TO32, result,
1624b8e80941Smrg                      subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1625b8e80941Smrg      inst->saturate = instr->dest.saturate;
1626b8e80941Smrg      break;
1627b8e80941Smrg
1628b8e80941Smrg   case nir_op_pack_64_2x32_split:
1629b8e80941Smrg   case nir_op_pack_32_2x16_split:
1630b8e80941Smrg      bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
1631b8e80941Smrg      break;
1632b8e80941Smrg
1633b8e80941Smrg   case nir_op_unpack_64_2x32_split_x:
1634b8e80941Smrg   case nir_op_unpack_64_2x32_split_y: {
1635b8e80941Smrg      if (instr->op == nir_op_unpack_64_2x32_split_x)
1636b8e80941Smrg         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
1637b8e80941Smrg      else
1638b8e80941Smrg         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
1639b8e80941Smrg      break;
1640b8e80941Smrg   }
1641b8e80941Smrg
1642b8e80941Smrg   case nir_op_unpack_32_2x16_split_x:
1643b8e80941Smrg   case nir_op_unpack_32_2x16_split_y: {
1644b8e80941Smrg      if (instr->op == nir_op_unpack_32_2x16_split_x)
1645b8e80941Smrg         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
1646b8e80941Smrg      else
1647b8e80941Smrg         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
1648b8e80941Smrg      break;
1649b8e80941Smrg   }
1650b8e80941Smrg
1651b8e80941Smrg   case nir_op_fpow:
1652b8e80941Smrg      inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
1653b8e80941Smrg      inst->saturate = instr->dest.saturate;
1654b8e80941Smrg      break;
1655b8e80941Smrg
1656b8e80941Smrg   case nir_op_bitfield_reverse:
1657b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1658b8e80941Smrg      bld.BFREV(result, op[0]);
1659b8e80941Smrg      break;
1660b8e80941Smrg
1661b8e80941Smrg   case nir_op_bit_count:
1662b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1663b8e80941Smrg      bld.CBIT(result, op[0]);
1664b8e80941Smrg      break;
1665b8e80941Smrg
1666b8e80941Smrg   case nir_op_ufind_msb: {
1667b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1668b8e80941Smrg      emit_find_msb_using_lzd(bld, result, op[0], false);
1669b8e80941Smrg      break;
1670b8e80941Smrg   }
1671b8e80941Smrg
1672b8e80941Smrg   case nir_op_ifind_msb: {
1673b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1674b8e80941Smrg
1675b8e80941Smrg      if (devinfo->gen < 7) {
1676b8e80941Smrg         emit_find_msb_using_lzd(bld, result, op[0], true);
1677b8e80941Smrg      } else {
1678b8e80941Smrg         bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
1679b8e80941Smrg
1680b8e80941Smrg         /* FBH counts from the MSB side, while GLSL's findMSB() wants the
1681b8e80941Smrg          * count from the LSB side. If FBH didn't return an error
1682b8e80941Smrg          * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
1683b8e80941Smrg          * count into an LSB count.
1684b8e80941Smrg          */
1685b8e80941Smrg         bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
1686b8e80941Smrg
1687b8e80941Smrg         inst = bld.ADD(result, result, brw_imm_d(31));
1688b8e80941Smrg         inst->predicate = BRW_PREDICATE_NORMAL;
1689b8e80941Smrg         inst->src[0].negate = true;
1690b8e80941Smrg      }
1691b8e80941Smrg      break;
1692b8e80941Smrg   }
1693b8e80941Smrg
1694b8e80941Smrg   case nir_op_find_lsb:
1695b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1696b8e80941Smrg
1697b8e80941Smrg      if (devinfo->gen < 7) {
1698b8e80941Smrg         fs_reg temp = vgrf(glsl_type::int_type);
1699b8e80941Smrg
1700b8e80941Smrg         /* (x & -x) generates a value that consists of only the LSB of x.
1701b8e80941Smrg          * For all powers of 2, findMSB(y) == findLSB(y).
1702b8e80941Smrg          */
1703b8e80941Smrg         fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
1704b8e80941Smrg         fs_reg negated_src = src;
1705b8e80941Smrg
1706b8e80941Smrg         /* One must be negated, and the other must be non-negated.  It
1707b8e80941Smrg          * doesn't matter which is which.
1708b8e80941Smrg          */
1709b8e80941Smrg         negated_src.negate = true;
1710b8e80941Smrg         src.negate = false;
1711b8e80941Smrg
1712b8e80941Smrg         bld.AND(temp, src, negated_src);
1713b8e80941Smrg         emit_find_msb_using_lzd(bld, result, temp, false);
1714b8e80941Smrg      } else {
1715b8e80941Smrg         bld.FBL(result, op[0]);
1716b8e80941Smrg      }
1717b8e80941Smrg      break;
1718b8e80941Smrg
1719b8e80941Smrg   case nir_op_ubitfield_extract:
1720b8e80941Smrg   case nir_op_ibitfield_extract:
1721b8e80941Smrg      unreachable("should have been lowered");
1722b8e80941Smrg   case nir_op_ubfe:
1723b8e80941Smrg   case nir_op_ibfe:
1724b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1725b8e80941Smrg      bld.BFE(result, op[2], op[1], op[0]);
1726b8e80941Smrg      break;
1727b8e80941Smrg   case nir_op_bfm:
1728b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1729b8e80941Smrg      bld.BFI1(result, op[0], op[1]);
1730b8e80941Smrg      break;
1731b8e80941Smrg   case nir_op_bfi:
1732b8e80941Smrg      assert(nir_dest_bit_size(instr->dest.dest) < 64);
1733b8e80941Smrg      bld.BFI2(result, op[0], op[1], op[2]);
1734b8e80941Smrg      break;
1735b8e80941Smrg
1736b8e80941Smrg   case nir_op_bitfield_insert:
1737b8e80941Smrg      unreachable("not reached: should have been lowered");
1738b8e80941Smrg
1739b8e80941Smrg   case nir_op_ishl:
1740b8e80941Smrg      bld.SHL(result, op[0], op[1]);
1741b8e80941Smrg      break;
1742b8e80941Smrg   case nir_op_ishr:
1743b8e80941Smrg      bld.ASR(result, op[0], op[1]);
1744b8e80941Smrg      break;
1745b8e80941Smrg   case nir_op_ushr:
1746b8e80941Smrg      bld.SHR(result, op[0], op[1]);
1747b8e80941Smrg      break;
1748b8e80941Smrg
1749b8e80941Smrg   case nir_op_pack_half_2x16_split:
1750b8e80941Smrg      bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
1751b8e80941Smrg      break;
1752b8e80941Smrg
1753b8e80941Smrg   case nir_op_ffma:
1754b8e80941Smrg      inst = bld.MAD(result, op[2], op[1], op[0]);
1755b8e80941Smrg      inst->saturate = instr->dest.saturate;
1756b8e80941Smrg      break;
1757b8e80941Smrg
1758b8e80941Smrg   case nir_op_flrp:
1759b8e80941Smrg      inst = bld.LRP(result, op[0], op[1], op[2]);
1760b8e80941Smrg      inst->saturate = instr->dest.saturate;
1761b8e80941Smrg      break;
1762b8e80941Smrg
1763b8e80941Smrg   case nir_op_b32csel:
1764b8e80941Smrg      if (optimize_frontfacing_ternary(instr, result))
1765b8e80941Smrg         return;
1766b8e80941Smrg
1767b8e80941Smrg      bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
1768b8e80941Smrg      inst = bld.SEL(result, op[1], op[2]);
1769b8e80941Smrg      inst->predicate = BRW_PREDICATE_NORMAL;
1770b8e80941Smrg      break;
1771b8e80941Smrg
1772b8e80941Smrg   case nir_op_extract_u8:
1773b8e80941Smrg   case nir_op_extract_i8: {
1774b8e80941Smrg      unsigned byte = nir_src_as_uint(instr->src[1].src);
1775b8e80941Smrg
1776b8e80941Smrg      /* The PRMs say:
1777b8e80941Smrg       *
1778b8e80941Smrg       *    BDW+
1779b8e80941Smrg       *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
1780b8e80941Smrg       *    Use two instructions and a word or DWord intermediate integer type.
1781b8e80941Smrg       */
1782b8e80941Smrg      if (nir_dest_bit_size(instr->dest.dest) == 64) {
1783b8e80941Smrg         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1784b8e80941Smrg
1785b8e80941Smrg         if (instr->op == nir_op_extract_i8) {
1786b8e80941Smrg            /* If we need to sign extend, extract to a word first */
1787b8e80941Smrg            fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
1788b8e80941Smrg            bld.MOV(w_temp, subscript(op[0], type, byte));
1789b8e80941Smrg            bld.MOV(result, w_temp);
1790b8e80941Smrg         } else if (byte & 1) {
1791b8e80941Smrg            /* Extract the high byte from the word containing the desired byte
1792b8e80941Smrg             * offset.
1793b8e80941Smrg             */
1794b8e80941Smrg            bld.SHR(result,
1795b8e80941Smrg                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1796b8e80941Smrg                    brw_imm_uw(8));
1797b8e80941Smrg         } else {
1798b8e80941Smrg            /* Otherwise use an AND with 0xff and a word type */
1799b8e80941Smrg            bld.AND(result,
1800b8e80941Smrg                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
1801b8e80941Smrg                    brw_imm_uw(0xff));
1802b8e80941Smrg         }
1803b8e80941Smrg      } else {
1804b8e80941Smrg         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
1805b8e80941Smrg         bld.MOV(result, subscript(op[0], type, byte));
1806b8e80941Smrg      }
1807b8e80941Smrg      break;
1808b8e80941Smrg   }
1809b8e80941Smrg
1810b8e80941Smrg   case nir_op_extract_u16:
1811b8e80941Smrg   case nir_op_extract_i16: {
1812b8e80941Smrg      const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
1813b8e80941Smrg      unsigned word = nir_src_as_uint(instr->src[1].src);
1814b8e80941Smrg      bld.MOV(result, subscript(op[0], type, word));
1815b8e80941Smrg      break;
1816b8e80941Smrg   }
1817b8e80941Smrg
1818b8e80941Smrg   default:
1819b8e80941Smrg      unreachable("unhandled instruction");
1820b8e80941Smrg   }
1821b8e80941Smrg
1822b8e80941Smrg   /* If we need to do a boolean resolve, replace the result with -(x & 1)
1823b8e80941Smrg    * to sign extend the low bit to 0/~0
1824b8e80941Smrg    */
1825b8e80941Smrg   if (devinfo->gen <= 5 &&
1826b8e80941Smrg       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
1827b8e80941Smrg      fs_reg masked = vgrf(glsl_type::int_type);
1828b8e80941Smrg      bld.AND(masked, result, brw_imm_d(1));
1829b8e80941Smrg      masked.negate = true;
1830b8e80941Smrg      bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
1831b8e80941Smrg   }
1832b8e80941Smrg}
1833b8e80941Smrg
1834b8e80941Smrgvoid
1835b8e80941Smrgfs_visitor::nir_emit_load_const(const fs_builder &bld,
1836b8e80941Smrg                                nir_load_const_instr *instr)
1837b8e80941Smrg{
1838b8e80941Smrg   const brw_reg_type reg_type =
1839b8e80941Smrg      brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
1840b8e80941Smrg   fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
1841b8e80941Smrg
1842b8e80941Smrg   switch (instr->def.bit_size) {
1843b8e80941Smrg   case 8:
1844b8e80941Smrg      for (unsigned i = 0; i < instr->def.num_components; i++)
1845b8e80941Smrg         bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8));
1846b8e80941Smrg      break;
1847b8e80941Smrg
1848b8e80941Smrg   case 16:
1849b8e80941Smrg      for (unsigned i = 0; i < instr->def.num_components; i++)
1850b8e80941Smrg         bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16));
1851b8e80941Smrg      break;
1852b8e80941Smrg
1853b8e80941Smrg   case 32:
1854b8e80941Smrg      for (unsigned i = 0; i < instr->def.num_components; i++)
1855b8e80941Smrg         bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32));
1856b8e80941Smrg      break;
1857b8e80941Smrg
1858b8e80941Smrg   case 64:
1859b8e80941Smrg      assert(devinfo->gen >= 7);
1860b8e80941Smrg      if (devinfo->gen == 7) {
1861b8e80941Smrg         /* We don't get 64-bit integer types until gen8 */
1862b8e80941Smrg         for (unsigned i = 0; i < instr->def.num_components; i++) {
1863b8e80941Smrg            bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
1864b8e80941Smrg                    setup_imm_df(bld, instr->value[i].f64));
1865b8e80941Smrg         }
1866b8e80941Smrg      } else {
1867b8e80941Smrg         for (unsigned i = 0; i < instr->def.num_components; i++)
1868b8e80941Smrg            bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64));
1869b8e80941Smrg      }
1870b8e80941Smrg      break;
1871b8e80941Smrg
1872b8e80941Smrg   default:
1873b8e80941Smrg      unreachable("Invalid bit size");
1874b8e80941Smrg   }
1875b8e80941Smrg
1876b8e80941Smrg   nir_ssa_values[instr->def.index] = reg;
1877b8e80941Smrg}
1878b8e80941Smrg
1879b8e80941Smrgfs_reg
1880b8e80941Smrgfs_visitor::get_nir_src(const nir_src &src)
1881b8e80941Smrg{
1882b8e80941Smrg   fs_reg reg;
1883b8e80941Smrg   if (src.is_ssa) {
1884b8e80941Smrg      if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
1885b8e80941Smrg         const brw_reg_type reg_type =
1886b8e80941Smrg            brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
1887b8e80941Smrg         reg = bld.vgrf(reg_type, src.ssa->num_components);
1888b8e80941Smrg      } else {
1889b8e80941Smrg         reg = nir_ssa_values[src.ssa->index];
1890b8e80941Smrg      }
1891b8e80941Smrg   } else {
1892b8e80941Smrg      /* We don't handle indirects on locals */
1893b8e80941Smrg      assert(src.reg.indirect == NULL);
1894b8e80941Smrg      reg = offset(nir_locals[src.reg.reg->index], bld,
1895b8e80941Smrg                   src.reg.base_offset * src.reg.reg->num_components);
1896b8e80941Smrg   }
1897b8e80941Smrg
1898b8e80941Smrg   if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) {
1899b8e80941Smrg      /* The only 64-bit type available on gen7 is DF, so use that. */
1900b8e80941Smrg      reg.type = BRW_REGISTER_TYPE_DF;
1901b8e80941Smrg   } else {
1902b8e80941Smrg      /* To avoid floating-point denorm flushing problems, set the type by
1903b8e80941Smrg       * default to an integer type - instructions that need floating point
1904b8e80941Smrg       * semantics will set this to F if they need to
1905b8e80941Smrg       */
1906b8e80941Smrg      reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src),
1907b8e80941Smrg                                            BRW_REGISTER_TYPE_D);
1908b8e80941Smrg   }
1909b8e80941Smrg
1910b8e80941Smrg   return reg;
1911b8e80941Smrg}
1912b8e80941Smrg
1913b8e80941Smrg/**
1914b8e80941Smrg * Return an IMM for constants; otherwise call get_nir_src() as normal.
1915b8e80941Smrg *
1916b8e80941Smrg * This function should not be called on any value which may be 64 bits.
1917b8e80941Smrg * We could theoretically support 64-bit on gen8+ but we choose not to
1918b8e80941Smrg * because it wouldn't work in general (no gen7 support) and there are
1919b8e80941Smrg * enough restrictions in 64-bit immediates that you can't take the return
1920b8e80941Smrg * value and treat it the same as the result of get_nir_src().
1921b8e80941Smrg */
1922b8e80941Smrgfs_reg
1923b8e80941Smrgfs_visitor::get_nir_src_imm(const nir_src &src)
1924b8e80941Smrg{
1925b8e80941Smrg   assert(nir_src_bit_size(src) == 32);
1926b8e80941Smrg   return nir_src_is_const(src) ?
1927b8e80941Smrg          fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src);
1928b8e80941Smrg}
1929b8e80941Smrg
1930b8e80941Smrgfs_reg
1931b8e80941Smrgfs_visitor::get_nir_dest(const nir_dest &dest)
1932b8e80941Smrg{
1933b8e80941Smrg   if (dest.is_ssa) {
1934b8e80941Smrg      const brw_reg_type reg_type =
1935b8e80941Smrg         brw_reg_type_from_bit_size(dest.ssa.bit_size,
1936b8e80941Smrg                                    dest.ssa.bit_size == 8 ?
1937b8e80941Smrg                                    BRW_REGISTER_TYPE_D :
1938b8e80941Smrg                                    BRW_REGISTER_TYPE_F);
1939b8e80941Smrg      nir_ssa_values[dest.ssa.index] =
1940b8e80941Smrg         bld.vgrf(reg_type, dest.ssa.num_components);
1941b8e80941Smrg      return nir_ssa_values[dest.ssa.index];
1942b8e80941Smrg   } else {
1943b8e80941Smrg      /* We don't handle indirects on locals */
1944b8e80941Smrg      assert(dest.reg.indirect == NULL);
1945b8e80941Smrg      return offset(nir_locals[dest.reg.reg->index], bld,
1946b8e80941Smrg                    dest.reg.base_offset * dest.reg.reg->num_components);
1947b8e80941Smrg   }
1948b8e80941Smrg}
1949b8e80941Smrg
1950b8e80941Smrgvoid
1951b8e80941Smrgfs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
1952b8e80941Smrg                         unsigned wr_mask)
1953b8e80941Smrg{
1954b8e80941Smrg   for (unsigned i = 0; i < 4; i++) {
1955b8e80941Smrg      if (!((wr_mask >> i) & 1))
1956b8e80941Smrg         continue;
1957b8e80941Smrg
1958b8e80941Smrg      fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
1959b8e80941Smrg      new_inst->dst = offset(new_inst->dst, bld, i);
1960b8e80941Smrg      for (unsigned j = 0; j < new_inst->sources; j++)
1961b8e80941Smrg         if (new_inst->src[j].file == VGRF)
1962b8e80941Smrg            new_inst->src[j] = offset(new_inst->src[j], bld, i);
1963b8e80941Smrg
1964b8e80941Smrg      bld.emit(new_inst);
1965b8e80941Smrg   }
1966b8e80941Smrg}
1967b8e80941Smrg
1968b8e80941Smrgstatic fs_inst *
1969b8e80941Smrgemit_pixel_interpolater_send(const fs_builder &bld,
1970b8e80941Smrg                             enum opcode opcode,
1971b8e80941Smrg                             const fs_reg &dst,
1972b8e80941Smrg                             const fs_reg &src,
1973b8e80941Smrg                             const fs_reg &desc,
1974b8e80941Smrg                             glsl_interp_mode interpolation)
1975b8e80941Smrg{
1976b8e80941Smrg   struct brw_wm_prog_data *wm_prog_data =
1977b8e80941Smrg      brw_wm_prog_data(bld.shader->stage_prog_data);
1978b8e80941Smrg
1979b8e80941Smrg   fs_inst *inst = bld.emit(opcode, dst, src, desc);
1980b8e80941Smrg   /* 2 floats per slot returned */
1981b8e80941Smrg   inst->size_written = 2 * dst.component_size(inst->exec_size);
1982b8e80941Smrg   inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
1983b8e80941Smrg
1984b8e80941Smrg   wm_prog_data->pulls_bary = true;
1985b8e80941Smrg
1986b8e80941Smrg   return inst;
1987b8e80941Smrg}
1988b8e80941Smrg
1989b8e80941Smrg/**
1990b8e80941Smrg * Computes 1 << x, given a D/UD register containing some value x.
1991b8e80941Smrg */
1992b8e80941Smrgstatic fs_reg
1993b8e80941Smrgintexp2(const fs_builder &bld, const fs_reg &x)
1994b8e80941Smrg{
1995b8e80941Smrg   assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
1996b8e80941Smrg
1997b8e80941Smrg   fs_reg result = bld.vgrf(x.type, 1);
1998b8e80941Smrg   fs_reg one = bld.vgrf(x.type, 1);
1999b8e80941Smrg
2000b8e80941Smrg   bld.MOV(one, retype(brw_imm_d(1), one.type));
2001b8e80941Smrg   bld.SHL(result, one, x);
2002b8e80941Smrg   return result;
2003b8e80941Smrg}
2004b8e80941Smrg
2005b8e80941Smrgvoid
2006b8e80941Smrgfs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
2007b8e80941Smrg{
2008b8e80941Smrg   assert(stage == MESA_SHADER_GEOMETRY);
2009b8e80941Smrg
2010b8e80941Smrg   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
2011b8e80941Smrg
2012b8e80941Smrg   if (gs_compile->control_data_header_size_bits == 0)
2013b8e80941Smrg      return;
2014b8e80941Smrg
2015b8e80941Smrg   /* We can only do EndPrimitive() functionality when the control data
2016b8e80941Smrg    * consists of cut bits.  Fortunately, the only time it isn't is when the
2017b8e80941Smrg    * output type is points, in which case EndPrimitive() is a no-op.
2018b8e80941Smrg    */
2019b8e80941Smrg   if (gs_prog_data->control_data_format !=
2020b8e80941Smrg       GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
2021b8e80941Smrg      return;
2022b8e80941Smrg   }
2023b8e80941Smrg
2024b8e80941Smrg   /* Cut bits use one bit per vertex. */
2025b8e80941Smrg   assert(gs_compile->control_data_bits_per_vertex == 1);
2026b8e80941Smrg
2027b8e80941Smrg   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
2028b8e80941Smrg   vertex_count.type = BRW_REGISTER_TYPE_UD;
2029b8e80941Smrg
2030b8e80941Smrg   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
2031b8e80941Smrg    * vertex n, 0 otherwise.  So all we need to do here is mark bit
2032b8e80941Smrg    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
2033b8e80941Smrg    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
2034b8e80941Smrg    * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
2035b8e80941Smrg    *
2036b8e80941Smrg    * Note that if EndPrimitive() is called before emitting any vertices, this
2037b8e80941Smrg    * will cause us to set bit 31 of the control_data_bits register to 1.
2038b8e80941Smrg    * That's fine because:
2039b8e80941Smrg    *
2040b8e80941Smrg    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
2041b8e80941Smrg    *   output, so the hardware will ignore cut bit 31.
2042b8e80941Smrg    *
2043b8e80941Smrg    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
2044b8e80941Smrg    *   last vertex, so setting cut bit 31 has no effect (since the primitive
2045b8e80941Smrg    *   is automatically ended when the GS terminates).
2046b8e80941Smrg    *
2047b8e80941Smrg    * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
2048b8e80941Smrg    *   control_data_bits register to 0 when the first vertex is emitted.
2049b8e80941Smrg    */
2050b8e80941Smrg
2051b8e80941Smrg   const fs_builder abld = bld.annotate("end primitive");
2052b8e80941Smrg
2053b8e80941Smrg   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
2054b8e80941Smrg   fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2055b8e80941Smrg   abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
2056b8e80941Smrg   fs_reg mask = intexp2(abld, prev_count);
2057b8e80941Smrg   /* Note: we're relying on the fact that the GEN SHL instruction only pays
2058b8e80941Smrg    * attention to the lower 5 bits of its second source argument, so on this
2059b8e80941Smrg    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
2060b8e80941Smrg    * ((vertex_count - 1) % 32).
2061b8e80941Smrg    */
2062b8e80941Smrg   abld.OR(this->control_data_bits, this->control_data_bits, mask);
2063b8e80941Smrg}
2064b8e80941Smrg
/**
 * Write the accumulated control data bits (cut bits or StreamID bits) out
 * to the control data header of the GS thread's URB entry.
 *
 * \param vertex_count UD register holding the number of vertices emitted
 *                     so far; used to compute which DWord of the header
 *                     this 32-bit batch belongs to.
 */
void
fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
{
   assert(stage == MESA_SHADER_GEOMETRY);
   assert(gs_compile->control_data_bits_per_vertex != 0);

   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);

   const fs_builder abld = bld.annotate("emit control data bits");
   const fs_builder fwa_bld = bld.exec_all();

   /* We use a single UD register to accumulate control data bits (32 bits
    * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
    * at a time.
    *
    * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
    * We have to select a 128-bit group via the Global and Per-Slot Offsets,
    * then use the Channel Mask phase to enable/disable which DWord within
    * that group to write.  (Remember, different SIMD8 channels may have
    * emitted different numbers of vertices, so we may need per-slot offsets.)
    *
    * Channel masking presents an annoying problem: we may have to replicate
    * the data up to 4 times:
    *
    * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
    *
    * To avoid penalizing shaders that emit a small number of vertices, we
    * can avoid these sometimes: if the size of the control data header is
    * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
    * in the same 128-bit group, so we can skip per-slot offsets.
    *
    * Similarly, if the control data header is <= 32 bits, there is only one
    * DWord, so we can skip channel masks.
    */
   enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;

   fs_reg channel_mask, per_slot_offset;

   /* More than one DWord of header: need channel masks to select the DWord
    * within the OWord.
    */
   if (gs_compile->control_data_header_size_bits > 32) {
      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
      channel_mask = vgrf(glsl_type::uint_type);
   }

   /* More than one OWord of header: additionally need per-slot offsets. */
   if (gs_compile->control_data_header_size_bits > 128) {
      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
      per_slot_offset = vgrf(glsl_type::uint_type);
   }

   /* Figure out which DWord we're trying to write to using the formula:
    *
    *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
    *
    * Since bits_per_vertex is a power of two, and is known at compile
    * time, this can be optimized to:
    *
    *    dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
    */
   if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
      fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
      fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
      /* vertex_count - 1, via unsigned wraparound. */
      abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
      unsigned log2_bits_per_vertex =
         util_last_bit(gs_compile->control_data_bits_per_vertex);
      abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));

      if (per_slot_offset.file != BAD_FILE) {
         /* Set the per-slot offset to dword_index / 4, so that we'll write to
          * the appropriate OWord within the control data header.
          */
         abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
      }

      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
       * write to the appropriate DWORD within the OWORD.
       */
      fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
      fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
      channel_mask = intexp2(fwa_bld, channel);
      /* Then the channel masks need to be in bits 23:16. */
      fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
   }

   /* Store the control data bits in the message payload and send it. */
   unsigned mlen = 2;
   if (channel_mask.file != BAD_FILE)
      mlen += 4; /* channel masks, plus 3 extra copies of the data */
   if (per_slot_offset.file != BAD_FILE)
      mlen++;

   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
   unsigned i = 0;
   /* First payload register: URB handles from g1. */
   sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
   if (per_slot_offset.file != BAD_FILE)
      sources[i++] = per_slot_offset;
   if (channel_mask.file != BAD_FILE)
      sources[i++] = channel_mask;
   /* Fill the remainder of the message with (copies of) the data. */
   while (i < mlen) {
      sources[i++] = this->control_data_bits;
   }

   abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
   fs_inst *inst = abld.emit(opcode, reg_undef, payload);
   inst->mlen = mlen;
   /* We need to increment Global Offset by 256-bits to make room for
    * Broadwell's extra "Vertex Count" payload at the beginning of the
    * URB entry.  Since this is an OWord message, Global Offset is counted
    * in 128-bit units, so we must set it to 2.
    */
   if (gs_prog_data->static_vertex_count == -1)
      inst->offset = 2;
}
2177b8e80941Smrg
2178b8e80941Smrgvoid
2179b8e80941Smrgfs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
2180b8e80941Smrg                                            unsigned stream_id)
2181b8e80941Smrg{
2182b8e80941Smrg   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
2183b8e80941Smrg
2184b8e80941Smrg   /* Note: we are calling this *before* increasing vertex_count, so
2185b8e80941Smrg    * this->vertex_count == vertex_count - 1 in the formula above.
2186b8e80941Smrg    */
2187b8e80941Smrg
2188b8e80941Smrg   /* Stream mode uses 2 bits per vertex */
2189b8e80941Smrg   assert(gs_compile->control_data_bits_per_vertex == 2);
2190b8e80941Smrg
2191b8e80941Smrg   /* Must be a valid stream */
2192b8e80941Smrg   assert(stream_id < MAX_VERTEX_STREAMS);
2193b8e80941Smrg
2194b8e80941Smrg   /* Control data bits are initialized to 0 so we don't have to set any
2195b8e80941Smrg    * bits when sending vertices to stream 0.
2196b8e80941Smrg    */
2197b8e80941Smrg   if (stream_id == 0)
2198b8e80941Smrg      return;
2199b8e80941Smrg
2200b8e80941Smrg   const fs_builder abld = bld.annotate("set stream control data bits", NULL);
2201b8e80941Smrg
2202b8e80941Smrg   /* reg::sid = stream_id */
2203b8e80941Smrg   fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2204b8e80941Smrg   abld.MOV(sid, brw_imm_ud(stream_id));
2205b8e80941Smrg
2206b8e80941Smrg   /* reg:shift_count = 2 * (vertex_count - 1) */
2207b8e80941Smrg   fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2208b8e80941Smrg   abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
2209b8e80941Smrg
2210b8e80941Smrg   /* Note: we're relying on the fact that the GEN SHL instruction only pays
2211b8e80941Smrg    * attention to the lower 5 bits of its second source argument, so on this
2212b8e80941Smrg    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
2213b8e80941Smrg    * stream_id << ((2 * (vertex_count - 1)) % 32).
2214b8e80941Smrg    */
2215b8e80941Smrg   fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2216b8e80941Smrg   abld.SHL(mask, sid, shift_count);
2217b8e80941Smrg   abld.OR(this->control_data_bits, this->control_data_bits, mask);
2218b8e80941Smrg}
2219b8e80941Smrg
/**
 * Emit code for a GS EmitVertex()/EmitStreamVertex(): flush a completed
 * 32-bit batch of control data bits if needed, write the vertex's outputs
 * to the URB, and record StreamID bits when in stream mode.
 */
void
fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
                           unsigned stream_id)
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);

   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
   vertex_count.type = BRW_REGISTER_TYPE_UD;

   /* Haswell and later hardware ignores the "Render Stream Select" bits
    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
    * and instead sends all primitives down the pipeline for rasterization.
    * If the SOL stage is enabled, "Render Stream Select" is honored and
    * primitives bound to non-zero streams are discarded after stream output.
    *
    * Since the only purpose of primitives sent to non-zero streams is to
    * be recorded by transform feedback, we can simply discard all geometry
    * bound to these streams when transform feedback is disabled.
    */
   if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
      return;

   /* If we're outputting 32 control data bits or less, then we can wait
    * until the shader is over to output them all.  Otherwise we need to
    * output them as we go.  Now is the time to do it, since we're about to
    * output the vertex_count'th vertex, so it's guaranteed that the
    * control data bits associated with the (vertex_count - 1)th vertex are
    * correct.
    */
   if (gs_compile->control_data_header_size_bits > 32) {
      const fs_builder abld =
         bld.annotate("emit vertex: emit control data bits");

      /* Only emit control data bits if we've finished accumulating a batch
       * of 32 bits.  This is the case when:
       *
       *     (vertex_count * bits_per_vertex) % 32 == 0
       *
       * (in other words, when the last 5 bits of vertex_count *
       * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
       * integer n (which is always the case, since bits_per_vertex is
       * always 1 or 2), this is equivalent to requiring that the last 5-n
       * bits of vertex_count are 0:
       *
       *     vertex_count & (2^(5-n) - 1) == 0
       *
       * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
       * equivalent to:
       *
       *     vertex_count & (32 / bits_per_vertex - 1) == 0
       *
       * TODO: If vertex_count is an immediate, we could do some of this math
       *       at compile time...
       */
      fs_inst *inst =
         abld.AND(bld.null_reg_d(), vertex_count,
                  brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
      inst->conditional_mod = BRW_CONDITIONAL_Z;

      abld.IF(BRW_PREDICATE_NORMAL);
      /* If vertex_count is 0, then no control data bits have been
       * accumulated yet, so we can skip emitting them.
       */
      abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
               BRW_CONDITIONAL_NEQ);
      abld.IF(BRW_PREDICATE_NORMAL);
      emit_gs_control_data_bits(vertex_count);
      abld.emit(BRW_OPCODE_ENDIF);

      /* Reset control_data_bits to 0 so we can start accumulating a new
       * batch.
       *
       * Note: in the case where vertex_count == 0, this neutralizes the
       * effect of any call to EndPrimitive() that the shader may have
       * made before outputting its first vertex.
       */
      inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
      inst->force_writemask_all = true;
      abld.emit(BRW_OPCODE_ENDIF);
   }

   /* Write this vertex's outputs out to the URB. */
   emit_urb_writes(vertex_count);

   /* In stream mode we have to set control data bits for all vertices
    * unless we have disabled control data bits completely (which we do
    * do for GL_POINTS outputs that don't use streams).
    */
   if (gs_compile->control_data_header_size_bits > 0 &&
       gs_prog_data->control_data_format ==
          GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
      set_gs_stream_control_data_bits(vertex_count, stream_id);
   }
}
2315b8e80941Smrg
/**
 * Load per-vertex GS inputs into \p dst.
 *
 * Uses the push model (direct ATTR-file reads) when the data is known at
 * compile time to be within the pushed URB data; otherwise falls back to
 * the pull model, computing an ICP (URB) handle and issuing URB read
 * messages.  64-bit destinations are read as pairs of 32-bit components
 * and shuffled afterwards, possibly over two message iterations.
 *
 * \param dst              destination register; dst.type selects 32- vs
 *                         64-bit handling
 * \param vertex_src       NIR source selecting which input vertex to read
 * \param base_offset      compile-time offset (in vec4 slots)
 * \param offset_src       NIR source with an additional, possibly
 *                         non-constant, vec4-slot offset
 * \param num_components   number of components to load
 * \param first_component  first component within the vec4 slot
 */
void
fs_visitor::emit_gs_input_load(const fs_reg &dst,
                               const nir_src &vertex_src,
                               unsigned base_offset,
                               const nir_src &offset_src,
                               unsigned num_components,
                               unsigned first_component)
{
   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
   /* Amount of pushed input data per vertex, in scalar (DWord) slots. */
   const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;

   /* TODO: figure out push input layout for invocations == 1 */
   /* TODO: make this work with 64-bit inputs */
   if (gs_prog_data->invocations == 1 &&
       type_sz(dst.type) <= 4 &&
       nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) &&
       4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) {
      /* Push model: the data is already in the thread payload, so just copy
       * it out of the ATTR file.
       */
      int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 +
                       nir_src_as_uint(vertex_src) * push_reg_count;
      for (unsigned i = 0; i < num_components; i++) {
         bld.MOV(offset(dst, bld, i),
                 fs_reg(ATTR, imm_offset + i + first_component, dst.type));
      }
      return;
   }

   /* Resort to the pull model.  Ensure the VUE handles are provided. */
   assert(gs_prog_data->base.include_vue_handles);

   /* ICP handles start after the header (and PrimitiveID, if present). */
   unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
   fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);

   if (gs_prog_data->invocations == 1) {
      if (nir_src_is_const(vertex_src)) {
         /* The vertex index is constant; just select the proper URB handle. */
         icp_handle =
            retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0),
                   BRW_REGISTER_TYPE_UD);
      } else {
         /* The vertex index is non-constant.  We need to use indirect
          * addressing to fetch the proper URB handle.
          *
          * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
          * indicating that channel <n> should read the handle from
          * DWord <n>.  We convert that to bytes by multiplying by 4.
          *
          * Next, we convert the vertex index to bytes by multiplying
          * by 32 (shifting by 5), and add the two together.  This is
          * the final indirect byte offset.
          */
         fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
         fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);

         /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
         bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
         /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
         bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
         /* Convert vertex_index to bytes (multiply by 32) */
         bld.SHL(vertex_offset_bytes,
                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
                 brw_imm_ud(5u));
         bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);

         /* Use first_icp_handle as the base offset.  There is one register
          * of URB handles per vertex, so inform the register allocator that
          * we might read up to nir->info.gs.vertices_in registers.
          */
         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
                  retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
                  fs_reg(icp_offset_bytes),
                  brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE));
      }
   } else {
      assert(gs_prog_data->invocations > 1);

      if (nir_src_is_const(vertex_src)) {
         /* Constant vertex index: the handle lives in DWord (vertex % 8) of
          * register (first_icp_handle + vertex / 8).
          */
         unsigned vertex = nir_src_as_uint(vertex_src);
         assert(devinfo->gen >= 9 || vertex <= 5);
         bld.MOV(icp_handle,
                 retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8),
                        BRW_REGISTER_TYPE_UD));
      } else {
         /* The vertex index is non-constant.  We need to use indirect
          * addressing to fetch the proper URB handle.
          *
          */
         fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);

         /* Convert vertex_index to bytes (multiply by 4) */
         bld.SHL(icp_offset_bytes,
                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
                 brw_imm_ud(2u));

         /* Use first_icp_handle as the base offset.  There is one DWord
          * of URB handles per vertex, so inform the register allocator that
          * we might read up to ceil(nir->info.gs.vertices_in / 8) registers.
          */
         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
                  retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
                  fs_reg(icp_offset_bytes),
                  brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) *
                             REG_SIZE));
      }
   }

   fs_inst *inst;

   fs_reg tmp_dst = dst;
   fs_reg indirect_offset = get_nir_src(offset_src);
   unsigned num_iterations = 1;
   unsigned orig_num_components = num_components;

   /* 64-bit data is read as pairs of 32-bit components; more than two
    * 64-bit components won't fit in one message, so split into two reads.
    */
   if (type_sz(dst.type) == 8) {
      if (num_components > 2) {
         num_iterations = 2;
         num_components = 2;
      }
      fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
      tmp_dst = tmp;
      first_component = first_component / 2;
   }

   for (unsigned iter = 0; iter < num_iterations; iter++) {
      if (nir_src_is_const(offset_src)) {
         /* Constant indexing - use global offset. */
         if (first_component != 0) {
            /* Read extra leading components into a temporary, then copy
             * only the wanted ones into the destination.
             */
            unsigned read_components = num_components + first_component;
            fs_reg tmp = bld.vgrf(dst.type, read_components);
            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
            inst->size_written = read_components *
                                 tmp.component_size(inst->exec_size);
            for (unsigned i = 0; i < num_components; i++) {
               bld.MOV(offset(tmp_dst, bld, i),
                       offset(tmp, bld, i + first_component));
            }
         } else {
            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
                            icp_handle);
            inst->size_written = num_components *
                                 tmp_dst.component_size(inst->exec_size);
         }
         inst->offset = base_offset + nir_src_as_uint(offset_src);
         inst->mlen = 1;
      } else {
         /* Indirect indexing - use per-slot offsets as well. */
         const fs_reg srcs[] = { icp_handle, indirect_offset };
         unsigned read_components = num_components + first_component;
         fs_reg tmp = bld.vgrf(dst.type, read_components);
         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
         if (first_component != 0) {
            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
                            payload);
            inst->size_written = read_components *
                                 tmp.component_size(inst->exec_size);
            for (unsigned i = 0; i < num_components; i++) {
               bld.MOV(offset(tmp_dst, bld, i),
                       offset(tmp, bld, i + first_component));
            }
         } else {
            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
                         payload);
            inst->size_written = num_components *
                                 tmp_dst.component_size(inst->exec_size);
         }
         inst->offset = base_offset;
         inst->mlen = 2;
      }

      /* For 64-bit data, interleave the 32-bit halves read this iteration
       * back into proper 64-bit components of the real destination.
       */
      if (type_sz(dst.type) == 8) {
         shuffle_from_32bit_read(bld,
                                 offset(dst, bld, iter * 2),
                                 retype(tmp_dst, BRW_REGISTER_TYPE_D),
                                 0,
                                 num_components);
      }

      if (num_iterations > 1) {
         /* Set up the second iteration: remaining components, next slot. */
         num_components = orig_num_components - 2;
         if(nir_src_is_const(offset_src)) {
            base_offset++;
         } else {
            fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
            bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
            indirect_offset = new_indirect;
         }
      }
   }
}
2507b8e80941Smrg
2508b8e80941Smrgfs_reg
2509b8e80941Smrgfs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
2510b8e80941Smrg{
2511b8e80941Smrg   nir_src *offset_src = nir_get_io_offset_src(instr);
2512b8e80941Smrg
2513b8e80941Smrg   if (nir_src_is_const(*offset_src)) {
2514b8e80941Smrg      /* The only constant offset we should find is 0.  brw_nir.c's
2515b8e80941Smrg       * add_const_offset_to_base() will fold other constant offsets
2516b8e80941Smrg       * into instr->const_index[0].
2517b8e80941Smrg       */
2518b8e80941Smrg      assert(nir_src_as_uint(*offset_src) == 0);
2519b8e80941Smrg      return fs_reg();
2520b8e80941Smrg   }
2521b8e80941Smrg
2522b8e80941Smrg   return get_nir_src(*offset_src);
2523b8e80941Smrg}
2524b8e80941Smrg
2525b8e80941Smrgvoid
2526b8e80941Smrgfs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
2527b8e80941Smrg                                  nir_intrinsic_instr *instr)
2528b8e80941Smrg{
2529b8e80941Smrg   assert(stage == MESA_SHADER_VERTEX);
2530b8e80941Smrg
2531b8e80941Smrg   fs_reg dest;
2532b8e80941Smrg   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
2533b8e80941Smrg      dest = get_nir_dest(instr->dest);
2534b8e80941Smrg
2535b8e80941Smrg   switch (instr->intrinsic) {
2536b8e80941Smrg   case nir_intrinsic_load_vertex_id:
2537b8e80941Smrg   case nir_intrinsic_load_base_vertex:
2538b8e80941Smrg      unreachable("should be lowered by nir_lower_system_values()");
2539b8e80941Smrg
2540b8e80941Smrg   case nir_intrinsic_load_input: {
2541b8e80941Smrg      fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type);
2542b8e80941Smrg      unsigned first_component = nir_intrinsic_component(instr);
2543b8e80941Smrg      unsigned num_components = instr->num_components;
2544b8e80941Smrg
2545b8e80941Smrg      src = offset(src, bld, nir_src_as_uint(instr->src[0]));
2546b8e80941Smrg
2547b8e80941Smrg      if (type_sz(dest.type) == 8)
2548b8e80941Smrg         first_component /= 2;
2549b8e80941Smrg
2550b8e80941Smrg      /* For 16-bit support maybe a temporary will be needed to copy from
2551b8e80941Smrg       * the ATTR file.
2552b8e80941Smrg       */
2553b8e80941Smrg      shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D),
2554b8e80941Smrg                              first_component, num_components);
2555b8e80941Smrg      break;
2556b8e80941Smrg   }
2557b8e80941Smrg
2558b8e80941Smrg   case nir_intrinsic_load_vertex_id_zero_base:
2559b8e80941Smrg   case nir_intrinsic_load_instance_id:
2560b8e80941Smrg   case nir_intrinsic_load_base_instance:
2561b8e80941Smrg   case nir_intrinsic_load_draw_id:
2562b8e80941Smrg   case nir_intrinsic_load_first_vertex:
2563b8e80941Smrg   case nir_intrinsic_load_is_indexed_draw:
2564b8e80941Smrg      unreachable("lowered by brw_nir_lower_vs_inputs");
2565b8e80941Smrg
2566b8e80941Smrg   default:
2567b8e80941Smrg      nir_emit_intrinsic(bld, instr);
2568b8e80941Smrg      break;
2569b8e80941Smrg   }
2570b8e80941Smrg}
2571b8e80941Smrg
/**
 * Emit FS IR for a NIR intrinsic executed in the tessellation control
 * (hull) shader stage.
 *
 * Handles TCS system values, the inter-instance barrier message, and
 * URB reads/writes for per-vertex and per-patch inputs/outputs.  64-bit
 * data is read and written through 32-bit URB messages, two 64-bit
 * components at a time, with shuffling to rebuild proper 64-bit values.
 * Anything not TCS-specific is forwarded to nir_emit_intrinsic().
 */
void
fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
                                   nir_intrinsic_instr *instr)
{
   assert(stage == MESA_SHADER_TESS_CTRL);
   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);

   fs_reg dst;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dst = get_nir_dest(instr->dest);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_primitive_id:
      /* Primitive ID is delivered in g0.1 of the thread payload. */
      bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
      break;
   case nir_intrinsic_load_invocation_id:
      bld.MOV(retype(dst, invocation_id.type), invocation_id);
      break;
   case nir_intrinsic_load_patch_vertices_in:
      /* The input vertex count is a compile-time constant from the key. */
      bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
              brw_imm_d(tcs_key->input_vertices));
      break;

   case nir_intrinsic_barrier: {
      /* With a single TCS instance there is nothing to synchronize with. */
      if (tcs_prog_data->instances == 1)
         break;

      fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
      fs_reg m0_2 = component(m0, 2);

      /* Single-channel builder for scalar header DWord setup. */
      const fs_builder chanbld = bld.exec_all().group(1, 0);

      /* Zero the message header */
      bld.exec_all().MOV(m0, brw_imm_ud(0u));

      if (devinfo->gen < 11) {
         /* Copy "Barrier ID" from r0.2, bits 16:13 */
         chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
                     brw_imm_ud(INTEL_MASK(16, 13)));

         /* Shift it up to bits 27:24. */
         chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
      } else {
         /* Gen11+: the barrier ID is already in bits 30:24 of r0.2. */
         chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
                     brw_imm_ud(INTEL_MASK(30, 24)));
      }

      /* Set the Barrier Count and the enable bit */
      if (devinfo->gen < 11) {
         chanbld.OR(m0_2, m0_2,
                    brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
      } else {
         /* Gen11+ moved the barrier count field down one bit. */
         chanbld.OR(m0_2, m0_2,
                    brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
      }

      bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
      break;
   }

   case nir_intrinsic_load_input:
      unreachable("nir_lower_io should never give us these.");
      break;

   case nir_intrinsic_load_per_vertex_input: {
      fs_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];

      /* src[0] selects which input control point (vertex) to read. */
      const nir_src &vertex_src = instr->src[0];

      fs_inst *inst;

      fs_reg icp_handle;

      if (nir_src_is_const(vertex_src)) {
         /* Emit a MOV to resolve <0,1,0> regioning. */
         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         unsigned vertex = nir_src_as_uint(vertex_src);
         /* ICP handles are packed one DWord per vertex starting at g1. */
         bld.MOV(icp_handle,
                 retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7),
                        BRW_REGISTER_TYPE_UD));
      } else if (tcs_prog_data->instances == 1 &&
                 nir_src_as_intrinsic(vertex_src) != NULL &&
                 nir_src_as_intrinsic(vertex_src)->intrinsic == nir_intrinsic_load_invocation_id) {
         /* For the common case of only 1 instance, an array index of
          * gl_InvocationID means reading g1.  Skip all the indirect work.
          */
         icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
      } else {
         /* The vertex index is non-constant.  We need to use indirect
          * addressing to fetch the proper URB handle.
          */
         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);

         /* Each ICP handle is a single DWord (4 bytes) */
         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         bld.SHL(vertex_offset_bytes,
                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
                 brw_imm_ud(2u));

         /* Start at g1.  We might read up to 4 registers. */
         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
                  retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
                  brw_imm_ud(4 * REG_SIZE));
      }

      /* We can only read two double components with each URB read, so
       * we send two read messages in that case, each one loading up to
       * two double components.
       */
      unsigned num_iterations = 1;
      unsigned num_components = instr->num_components;
      unsigned first_component = nir_intrinsic_component(instr);
      fs_reg orig_dst = dst;
      if (type_sz(dst.type) == 8) {
         /* 64-bit components occupy two 32-bit channels each. */
         first_component = first_component / 2;
         if (instr->num_components > 2) {
            num_iterations = 2;
            num_components = 2;
         }

         /* Read into a temporary; the shuffle below rebuilds 64-bit
          * values into the real destination.
          */
         fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
         dst = tmp;
      }

      for (unsigned iter = 0; iter < num_iterations; iter++) {
         if (indirect_offset.file == BAD_FILE) {
            /* Constant indexing - use global offset. */
            if (first_component != 0) {
               /* Read extra leading components, then copy out the ones
                * we actually want.
                */
               unsigned read_components = num_components + first_component;
               fs_reg tmp = bld.vgrf(dst.type, read_components);
               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
               for (unsigned i = 0; i < num_components; i++) {
                  bld.MOV(offset(dst, bld, i),
                          offset(tmp, bld, i + first_component));
               }
            } else {
               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
            }
            inst->offset = imm_offset;
            inst->mlen = 1;
         } else {
            /* Indirect indexing - use per-slot offsets as well. */
            const fs_reg srcs[] = { icp_handle, indirect_offset };
            fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
            bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
            if (first_component != 0) {
               unsigned read_components = num_components + first_component;
               fs_reg tmp = bld.vgrf(dst.type, read_components);
               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
                               payload);
               for (unsigned i = 0; i < num_components; i++) {
                  bld.MOV(offset(dst, bld, i),
                          offset(tmp, bld, i + first_component));
               }
            } else {
               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
                               payload);
            }
            inst->offset = imm_offset;
            inst->mlen = 2;
         }
         inst->size_written = (num_components + first_component) *
                              inst->dst.component_size(inst->exec_size);

         /* If we are reading 64-bit data using 32-bit read messages we need
          * build proper 64-bit data elements by shuffling the low and high
          * 32-bit components around like we do for other things like UBOs
          * or SSBOs.
          */
         if (type_sz(dst.type) == 8) {
            shuffle_from_32bit_read(bld,
                                    offset(orig_dst, bld, iter * 2),
                                    retype(dst, BRW_REGISTER_TYPE_D),
                                    0, num_components);
         }

         /* Copy the temporary to the destination to deal with writemasking.
          *
          * Also attempt to deal with gl_PointSize being in the .w component.
          */
         if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
            assert(type_sz(dst.type) < 8);
            inst->dst = bld.vgrf(dst.type, 4);
            inst->size_written = 4 * REG_SIZE;
            bld.MOV(dst, offset(inst->dst, bld, 3));
         }

         /* If we are loading double data and we need a second read message
          * adjust the write offset
          */
         if (num_iterations > 1) {
            num_components = instr->num_components - 2;
            imm_offset++;
         }
      }
      break;
   }

   case nir_intrinsic_load_output:
   case nir_intrinsic_load_per_vertex_output: {
      fs_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];
      unsigned first_component = nir_intrinsic_component(instr);

      fs_inst *inst;
      if (indirect_offset.file == BAD_FILE) {
         /* Replicate the patch handle to all enabled channels */
         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         bld.MOV(patch_handle,
                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));

         {
            if (first_component != 0) {
               unsigned read_components =
                  instr->num_components + first_component;
               fs_reg tmp = bld.vgrf(dst.type, read_components);
               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
                               patch_handle);
               inst->size_written = read_components * REG_SIZE;
               for (unsigned i = 0; i < instr->num_components; i++) {
                  bld.MOV(offset(dst, bld, i),
                          offset(tmp, bld, i + first_component));
               }
            } else {
               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
                               patch_handle);
               inst->size_written = instr->num_components * REG_SIZE;
            }
            inst->offset = imm_offset;
            inst->mlen = 1;
         }
      } else {
         /* Indirect indexing - use per-slot offsets as well. */
         const fs_reg srcs[] = {
            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
            indirect_offset
         };
         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
         if (first_component != 0) {
            unsigned read_components =
               instr->num_components + first_component;
            fs_reg tmp = bld.vgrf(dst.type, read_components);
            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
                            payload);
            inst->size_written = read_components * REG_SIZE;
            for (unsigned i = 0; i < instr->num_components; i++) {
               bld.MOV(offset(dst, bld, i),
                       offset(tmp, bld, i + first_component));
            }
         } else {
            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
                            payload);
            inst->size_written = instr->num_components * REG_SIZE;
         }
         inst->offset = imm_offset;
         inst->mlen = 2;
      }
      break;
   }

   case nir_intrinsic_store_output:
   case nir_intrinsic_store_per_vertex_output: {
      fs_reg value = get_nir_src(instr->src[0]);
      bool is_64bit = (instr->src[0].is_ssa ?
         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
      fs_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];
      /* const_index[1] holds the NIR writemask for this store. */
      unsigned mask = instr->const_index[1];
      unsigned header_regs = 0;
      fs_reg srcs[7];
      srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);

      if (indirect_offset.file != BAD_FILE) {
         srcs[header_regs++] = indirect_offset;
      }

      /* An empty writemask means there is nothing to store. */
      if (mask == 0)
         break;

      unsigned num_components = util_last_bit(mask);
      enum opcode opcode;

      /* We can only pack two 64-bit components in a single message, so send
       * 2 messages if we have more components
       */
      unsigned num_iterations = 1;
      unsigned iter_components = num_components;
      unsigned first_component = nir_intrinsic_component(instr);
      if (is_64bit) {
         first_component = first_component / 2;
         if (instr->num_components > 2) {
            num_iterations = 2;
            iter_components = 2;
         }
      }

      mask = mask << first_component;

      for (unsigned iter = 0; iter < num_iterations; iter++) {
         if (!is_64bit && mask != WRITEMASK_XYZW) {
            /* Partial 32-bit write: use the masked URB write and put the
             * channel mask in bits 31:16 of the extra header DWord.
             */
            srcs[header_regs++] = brw_imm_ud(mask << 16);
            opcode = indirect_offset.file != BAD_FILE ?
               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
         } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
            /* Expand the 64-bit mask to 32-bit channels. We only handle
             * two channels in each iteration, so we only care about X/Y.
             */
            unsigned mask32 = 0;
            if (mask & WRITEMASK_X)
               mask32 |= WRITEMASK_XY;
            if (mask & WRITEMASK_Y)
               mask32 |= WRITEMASK_ZW;

            /* If the mask does not include any of the channels X or Y there
             * is nothing to do in this iteration. Move on to the next couple
             * of 64-bit channels.
             */
            if (!mask32) {
               mask >>= 2;
               imm_offset++;
               continue;
            }

            srcs[header_regs++] = brw_imm_ud(mask32 << 16);
            opcode = indirect_offset.file != BAD_FILE ?
               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
         } else {
            /* Full writemask: no mask DWord needed. */
            opcode = indirect_offset.file != BAD_FILE ?
               SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
               SHADER_OPCODE_URB_WRITE_SIMD8;
         }

         for (unsigned i = 0; i < iter_components; i++) {
            if (!(mask & (1 << (i + first_component))))
               continue;

            if (!is_64bit) {
               srcs[header_regs + i + first_component] = offset(value, bld, i);
            } else {
               /* We need to shuffle the 64-bit data to match the layout
                * expected by our 32-bit URB write messages. We use a temporary
                * for that.
                */
               unsigned channel = iter * 2 + i;
               fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1);

               srcs[header_regs + (i + first_component) * 2] = dest;
               srcs[header_regs + (i + first_component) * 2 + 1] =
                  offset(dest, bld, 1);
            }
         }

         unsigned mlen =
            header_regs + (is_64bit ? 2 * iter_components : iter_components) +
            (is_64bit ? 2 * first_component : first_component);
         fs_reg payload =
            bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
         bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs);

         fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
         inst->offset = imm_offset;
         inst->mlen = mlen;

         /* If this is a 64-bit attribute, select the next two 64-bit channels
          * to be handled in the next iteration.
          */
         if (is_64bit) {
            mask >>= 2;
            imm_offset++;
         }
      }
      break;
   }

   default:
      nir_emit_intrinsic(bld, instr);
      break;
   }
}
2956b8e80941Smrg
/**
 * Emit FS IR for a NIR intrinsic executed in the tessellation evaluation
 * (domain) shader stage.
 *
 * Inputs with small constant offsets are read from pushed URB data in the
 * ATTR file; larger or indirect offsets fall back to URB read messages.
 * 64-bit data is read two components per message and reassembled with
 * shuffle_from_32bit_read().  Non-TES intrinsics are forwarded to
 * nir_emit_intrinsic().
 */
void
fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
                                   nir_intrinsic_instr *instr)
{
   assert(stage == MESA_SHADER_TESS_EVAL);
   struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);

   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   switch (instr->intrinsic) {
   case nir_intrinsic_load_primitive_id:
      /* Primitive ID is delivered in g0.1 of the thread payload. */
      bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
      break;
   case nir_intrinsic_load_tess_coord:
      /* gl_TessCoord is part of the payload in g1-3 */
      for (unsigned i = 0; i < 3; i++) {
         bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0)));
      }
      break;

   case nir_intrinsic_load_input:
   case nir_intrinsic_load_per_vertex_input: {
      fs_reg indirect_offset = get_indirect_offset(instr);
      unsigned imm_offset = instr->const_index[0];
      unsigned first_component = nir_intrinsic_component(instr);

      /* 64-bit components occupy two 32-bit channels each. */
      if (type_sz(dest.type) == 8) {
         first_component = first_component / 2;
      }

      fs_inst *inst;
      if (indirect_offset.file == BAD_FILE) {
         /* Arbitrarily only push up to 32 vec4 slots worth of data,
          * which is 16 registers (since each holds 2 vec4 slots).
          */
         unsigned slot_count = 1;
         if (type_sz(dest.type) == 8 && instr->num_components > 2)
            slot_count++;

         const unsigned max_push_slots = 32;
         if (imm_offset + slot_count <= max_push_slots) {
            /* Pushed input: read straight from the ATTR file.  Each
             * register holds two vec4 slots, hence the /2 and %2 math.
             */
            fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type);
            for (int i = 0; i < instr->num_components; i++) {
               unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) +
                  i + first_component;
               bld.MOV(offset(dest, bld, i), component(src, comp));
            }

            /* Grow the pushed-URB window to cover this slot. */
            tes_prog_data->base.urb_read_length =
               MAX2(tes_prog_data->base.urb_read_length,
                    DIV_ROUND_UP(imm_offset + slot_count, 2));
         } else {
            /* Replicate the patch handle to all enabled channels */
            const fs_reg srcs[] = {
               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)
            };
            fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
            bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);

            if (first_component != 0) {
               /* Read extra leading components, then copy out the ones
                * we actually want.
                */
               unsigned read_components =
                  instr->num_components + first_component;
               fs_reg tmp = bld.vgrf(dest.type, read_components);
               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
                               patch_handle);
               inst->size_written = read_components * REG_SIZE;
               for (unsigned i = 0; i < instr->num_components; i++) {
                  bld.MOV(offset(dest, bld, i),
                          offset(tmp, bld, i + first_component));
               }
            } else {
               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest,
                               patch_handle);
               inst->size_written = instr->num_components * REG_SIZE;
            }
            inst->mlen = 1;
            inst->offset = imm_offset;
         }
      } else {
         /* Indirect indexing - use per-slot offsets as well. */

         /* We can only read two double components with each URB read, so
          * we send two read messages in that case, each one loading up to
          * two double components.
          */
         unsigned num_iterations = 1;
         unsigned num_components = instr->num_components;
         fs_reg orig_dest = dest;
         if (type_sz(dest.type) == 8) {
            if (instr->num_components > 2) {
               num_iterations = 2;
               num_components = 2;
            }
            /* Read into a temporary; the shuffle below rebuilds 64-bit
             * values into the real destination.
             */
            fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type);
            dest = tmp;
         }

         for (unsigned iter = 0; iter < num_iterations; iter++) {
            const fs_reg srcs[] = {
               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
               indirect_offset
            };
            fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
            bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);

            if (first_component != 0) {
               unsigned read_components =
                   num_components + first_component;
               fs_reg tmp = bld.vgrf(dest.type, read_components);
               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
                               payload);
               for (unsigned i = 0; i < num_components; i++) {
                  bld.MOV(offset(dest, bld, i),
                          offset(tmp, bld, i + first_component));
               }
            } else {
               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest,
                               payload);
            }
            inst->mlen = 2;
            inst->offset = imm_offset;
            inst->size_written = (num_components + first_component) *
                                 inst->dst.component_size(inst->exec_size);

            /* If we are reading 64-bit data using 32-bit read messages we need
             * build proper 64-bit data elements by shuffling the low and high
             * 32-bit components around like we do for other things like UBOs
             * or SSBOs.
             */
            if (type_sz(dest.type) == 8) {
               shuffle_from_32bit_read(bld,
                                       offset(orig_dest, bld, iter * 2),
                                       retype(dest, BRW_REGISTER_TYPE_D),
                                       0, num_components);
            }

            /* If we are loading double data and we need a second read message
             * adjust the offset
             */
            if (num_iterations > 1) {
               num_components = instr->num_components - 2;
               imm_offset++;
            }
         }
      }
      break;
   }
   default:
      nir_emit_intrinsic(bld, instr);
      break;
   }
}
3111b8e80941Smrg
3112b8e80941Smrgvoid
3113b8e80941Smrgfs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
3114b8e80941Smrg                                  nir_intrinsic_instr *instr)
3115b8e80941Smrg{
3116b8e80941Smrg   assert(stage == MESA_SHADER_GEOMETRY);
3117b8e80941Smrg   fs_reg indirect_offset;
3118b8e80941Smrg
3119b8e80941Smrg   fs_reg dest;
3120b8e80941Smrg   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3121b8e80941Smrg      dest = get_nir_dest(instr->dest);
3122b8e80941Smrg
3123b8e80941Smrg   switch (instr->intrinsic) {
3124b8e80941Smrg   case nir_intrinsic_load_primitive_id:
3125b8e80941Smrg      assert(stage == MESA_SHADER_GEOMETRY);
3126b8e80941Smrg      assert(brw_gs_prog_data(prog_data)->include_primitive_id);
3127b8e80941Smrg      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
3128b8e80941Smrg              retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
3129b8e80941Smrg      break;
3130b8e80941Smrg
3131b8e80941Smrg   case nir_intrinsic_load_input:
3132b8e80941Smrg      unreachable("load_input intrinsics are invalid for the GS stage");
3133b8e80941Smrg
3134b8e80941Smrg   case nir_intrinsic_load_per_vertex_input:
3135b8e80941Smrg      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
3136b8e80941Smrg                         instr->src[1], instr->num_components,
3137b8e80941Smrg                         nir_intrinsic_component(instr));
3138b8e80941Smrg      break;
3139b8e80941Smrg
3140b8e80941Smrg   case nir_intrinsic_emit_vertex_with_counter:
3141b8e80941Smrg      emit_gs_vertex(instr->src[0], instr->const_index[0]);
3142b8e80941Smrg      break;
3143b8e80941Smrg
3144b8e80941Smrg   case nir_intrinsic_end_primitive_with_counter:
3145b8e80941Smrg      emit_gs_end_primitive(instr->src[0]);
3146b8e80941Smrg      break;
3147b8e80941Smrg
3148b8e80941Smrg   case nir_intrinsic_set_vertex_count:
3149b8e80941Smrg      bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
3150b8e80941Smrg      break;
3151b8e80941Smrg
3152b8e80941Smrg   case nir_intrinsic_load_invocation_id: {
3153b8e80941Smrg      fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
3154b8e80941Smrg      assert(val.file != BAD_FILE);
3155b8e80941Smrg      dest.type = val.type;
3156b8e80941Smrg      bld.MOV(dest, val);
3157b8e80941Smrg      break;
3158b8e80941Smrg   }
3159b8e80941Smrg
3160b8e80941Smrg   default:
3161b8e80941Smrg      nir_emit_intrinsic(bld, instr);
3162b8e80941Smrg      break;
3163b8e80941Smrg   }
3164b8e80941Smrg}
3165b8e80941Smrg
3166b8e80941Smrg/**
3167b8e80941Smrg * Fetch the current render target layer index.
3168b8e80941Smrg */
3169b8e80941Smrgstatic fs_reg
3170b8e80941Smrgfetch_render_target_array_index(const fs_builder &bld)
3171b8e80941Smrg{
3172b8e80941Smrg   if (bld.shader->devinfo->gen >= 6) {
3173b8e80941Smrg      /* The render target array index is provided in the thread payload as
3174b8e80941Smrg       * bits 26:16 of r0.0.
3175b8e80941Smrg       */
3176b8e80941Smrg      const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD);
3177b8e80941Smrg      bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
3178b8e80941Smrg              brw_imm_uw(0x7ff));
3179b8e80941Smrg      return idx;
3180b8e80941Smrg   } else {
3181b8e80941Smrg      /* Pre-SNB we only ever render into the first layer of the framebuffer
3182b8e80941Smrg       * since layered rendering is not implemented.
3183b8e80941Smrg       */
3184b8e80941Smrg      return brw_imm_ud(0);
3185b8e80941Smrg   }
3186b8e80941Smrg}
3187b8e80941Smrg
3188b8e80941Smrg/**
3189b8e80941Smrg * Fake non-coherent framebuffer read implemented using TXF to fetch from the
3190b8e80941Smrg * framebuffer at the current fragment coordinates and sample index.
3191b8e80941Smrg */
3192b8e80941Smrgfs_inst *
3193b8e80941Smrgfs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
3194b8e80941Smrg                                      unsigned target)
3195b8e80941Smrg{
3196b8e80941Smrg   const struct gen_device_info *devinfo = bld.shader->devinfo;
3197b8e80941Smrg
3198b8e80941Smrg   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
3199b8e80941Smrg   const brw_wm_prog_key *wm_key =
3200b8e80941Smrg      reinterpret_cast<const brw_wm_prog_key *>(key);
3201b8e80941Smrg   assert(!wm_key->coherent_fb_fetch);
3202b8e80941Smrg   const struct brw_wm_prog_data *wm_prog_data =
3203b8e80941Smrg      brw_wm_prog_data(stage_prog_data);
3204b8e80941Smrg
3205b8e80941Smrg   /* Calculate the surface index relative to the start of the texture binding
3206b8e80941Smrg    * table block, since that's what the texturing messages expect.
3207b8e80941Smrg    */
3208b8e80941Smrg   const unsigned surface = target +
3209b8e80941Smrg      wm_prog_data->binding_table.render_target_read_start -
3210b8e80941Smrg      wm_prog_data->base.binding_table.texture_start;
3211b8e80941Smrg
3212b8e80941Smrg   /* Calculate the fragment coordinates. */
3213b8e80941Smrg   const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
3214b8e80941Smrg   bld.MOV(offset(coords, bld, 0), pixel_x);
3215b8e80941Smrg   bld.MOV(offset(coords, bld, 1), pixel_y);
3216b8e80941Smrg   bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld));
3217b8e80941Smrg
3218b8e80941Smrg   /* Calculate the sample index and MCS payload when multisampling.  Luckily
3219b8e80941Smrg    * the MCS fetch message behaves deterministically for UMS surfaces, so it
3220b8e80941Smrg    * shouldn't be necessary to recompile based on whether the framebuffer is
3221b8e80941Smrg    * CMS or UMS.
3222b8e80941Smrg    */
3223b8e80941Smrg   if (wm_key->multisample_fbo &&
3224b8e80941Smrg       nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
3225b8e80941Smrg      nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
3226b8e80941Smrg
3227b8e80941Smrg   const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
3228b8e80941Smrg   const fs_reg mcs = wm_key->multisample_fbo ?
3229b8e80941Smrg      emit_mcs_fetch(coords, 3, brw_imm_ud(surface), fs_reg()) : fs_reg();
3230b8e80941Smrg
3231b8e80941Smrg   /* Use either a normal or a CMS texel fetch message depending on whether
3232b8e80941Smrg    * the framebuffer is single or multisample.  On SKL+ use the wide CMS
3233b8e80941Smrg    * message just in case the framebuffer uses 16x multisampling, it should
3234b8e80941Smrg    * be equivalent to the normal CMS fetch for lower multisampling modes.
3235b8e80941Smrg    */
3236b8e80941Smrg   const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL :
3237b8e80941Smrg                     devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
3238b8e80941Smrg                     SHADER_OPCODE_TXF_CMS_LOGICAL;
3239b8e80941Smrg
3240b8e80941Smrg   /* Emit the instruction. */
3241b8e80941Smrg   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
3242b8e80941Smrg   srcs[TEX_LOGICAL_SRC_COORDINATE]       = coords;
3243b8e80941Smrg   srcs[TEX_LOGICAL_SRC_LOD]              = brw_imm_ud(0);
3244b8e80941Smrg   srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX]     = sample;
3245b8e80941Smrg   srcs[TEX_LOGICAL_SRC_MCS]              = mcs;
3246b8e80941Smrg   srcs[TEX_LOGICAL_SRC_SURFACE]          = brw_imm_ud(surface);
3247b8e80941Smrg   srcs[TEX_LOGICAL_SRC_SAMPLER]          = brw_imm_ud(0);
3248b8e80941Smrg   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3);
3249b8e80941Smrg   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS]  = brw_imm_ud(0);
3250b8e80941Smrg
3251b8e80941Smrg   fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
3252b8e80941Smrg   inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3253b8e80941Smrg
3254b8e80941Smrg   return inst;
3255b8e80941Smrg}
3256b8e80941Smrg
3257b8e80941Smrg/**
3258b8e80941Smrg * Actual coherent framebuffer read implemented using the native render target
3259b8e80941Smrg * read message.  Requires SKL+.
3260b8e80941Smrg */
3261b8e80941Smrgstatic fs_inst *
3262b8e80941Smrgemit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
3263b8e80941Smrg{
3264b8e80941Smrg   assert(bld.shader->devinfo->gen >= 9);
3265b8e80941Smrg   fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
3266b8e80941Smrg   inst->target = target;
3267b8e80941Smrg   inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
3268b8e80941Smrg
3269b8e80941Smrg   return inst;
3270b8e80941Smrg}
3271b8e80941Smrg
3272b8e80941Smrgstatic fs_reg
3273b8e80941Smrgalloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
3274b8e80941Smrg{
3275b8e80941Smrg   if (n && regs[0].file != BAD_FILE) {
3276b8e80941Smrg      return regs[0];
3277b8e80941Smrg
3278b8e80941Smrg   } else {
3279b8e80941Smrg      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size);
3280b8e80941Smrg
3281b8e80941Smrg      for (unsigned i = 0; i < n; i++)
3282b8e80941Smrg         regs[i] = tmp;
3283b8e80941Smrg
3284b8e80941Smrg      return tmp;
3285b8e80941Smrg   }
3286b8e80941Smrg}
3287b8e80941Smrg
3288b8e80941Smrgstatic fs_reg
3289b8e80941Smrgalloc_frag_output(fs_visitor *v, unsigned location)
3290b8e80941Smrg{
3291b8e80941Smrg   assert(v->stage == MESA_SHADER_FRAGMENT);
3292b8e80941Smrg   const brw_wm_prog_key *const key =
3293b8e80941Smrg      reinterpret_cast<const brw_wm_prog_key *>(v->key);
3294b8e80941Smrg   const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
3295b8e80941Smrg   const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
3296b8e80941Smrg
3297b8e80941Smrg   if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1))
3298b8e80941Smrg      return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
3299b8e80941Smrg
3300b8e80941Smrg   else if (l == FRAG_RESULT_COLOR)
3301b8e80941Smrg      return alloc_temporary(v->bld, 4, v->outputs,
3302b8e80941Smrg                             MAX2(key->nr_color_regions, 1));
3303b8e80941Smrg
3304b8e80941Smrg   else if (l == FRAG_RESULT_DEPTH)
3305b8e80941Smrg      return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
3306b8e80941Smrg
3307b8e80941Smrg   else if (l == FRAG_RESULT_STENCIL)
3308b8e80941Smrg      return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
3309b8e80941Smrg
3310b8e80941Smrg   else if (l == FRAG_RESULT_SAMPLE_MASK)
3311b8e80941Smrg      return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
3312b8e80941Smrg
3313b8e80941Smrg   else if (l >= FRAG_RESULT_DATA0 &&
3314b8e80941Smrg            l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
3315b8e80941Smrg      return alloc_temporary(v->bld, 4,
3316b8e80941Smrg                             &v->outputs[l - FRAG_RESULT_DATA0], 1);
3317b8e80941Smrg
3318b8e80941Smrg   else
3319b8e80941Smrg      unreachable("Invalid location");
3320b8e80941Smrg}
3321b8e80941Smrg
3322b8e80941Smrgvoid
3323b8e80941Smrgfs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
3324b8e80941Smrg                                  nir_intrinsic_instr *instr)
3325b8e80941Smrg{
3326b8e80941Smrg   assert(stage == MESA_SHADER_FRAGMENT);
3327b8e80941Smrg
3328b8e80941Smrg   fs_reg dest;
3329b8e80941Smrg   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3330b8e80941Smrg      dest = get_nir_dest(instr->dest);
3331b8e80941Smrg
3332b8e80941Smrg   switch (instr->intrinsic) {
3333b8e80941Smrg   case nir_intrinsic_load_front_face:
3334b8e80941Smrg      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
3335b8e80941Smrg              *emit_frontfacing_interpolation());
3336b8e80941Smrg      break;
3337b8e80941Smrg
3338b8e80941Smrg   case nir_intrinsic_load_sample_pos: {
3339b8e80941Smrg      fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
3340b8e80941Smrg      assert(sample_pos.file != BAD_FILE);
3341b8e80941Smrg      dest.type = sample_pos.type;
3342b8e80941Smrg      bld.MOV(dest, sample_pos);
3343b8e80941Smrg      bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
3344b8e80941Smrg      break;
3345b8e80941Smrg   }
3346b8e80941Smrg
3347b8e80941Smrg   case nir_intrinsic_load_layer_id:
3348b8e80941Smrg      dest.type = BRW_REGISTER_TYPE_UD;
3349b8e80941Smrg      bld.MOV(dest, fetch_render_target_array_index(bld));
3350b8e80941Smrg      break;
3351b8e80941Smrg
3352b8e80941Smrg   case nir_intrinsic_load_helper_invocation:
3353b8e80941Smrg   case nir_intrinsic_load_sample_mask_in:
3354b8e80941Smrg   case nir_intrinsic_load_sample_id: {
3355b8e80941Smrg      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3356b8e80941Smrg      fs_reg val = nir_system_values[sv];
3357b8e80941Smrg      assert(val.file != BAD_FILE);
3358b8e80941Smrg      dest.type = val.type;
3359b8e80941Smrg      bld.MOV(dest, val);
3360b8e80941Smrg      break;
3361b8e80941Smrg   }
3362b8e80941Smrg
3363b8e80941Smrg   case nir_intrinsic_store_output: {
3364b8e80941Smrg      const fs_reg src = get_nir_src(instr->src[0]);
3365b8e80941Smrg      const unsigned store_offset = nir_src_as_uint(instr->src[1]);
3366b8e80941Smrg      const unsigned location = nir_intrinsic_base(instr) +
3367b8e80941Smrg         SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION);
3368b8e80941Smrg      const fs_reg new_dest = retype(alloc_frag_output(this, location),
3369b8e80941Smrg                                     src.type);
3370b8e80941Smrg
3371b8e80941Smrg      for (unsigned j = 0; j < instr->num_components; j++)
3372b8e80941Smrg         bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
3373b8e80941Smrg                 offset(src, bld, j));
3374b8e80941Smrg
3375b8e80941Smrg      break;
3376b8e80941Smrg   }
3377b8e80941Smrg
3378b8e80941Smrg   case nir_intrinsic_load_output: {
3379b8e80941Smrg      const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
3380b8e80941Smrg                                   BRW_NIR_FRAG_OUTPUT_LOCATION);
3381b8e80941Smrg      assert(l >= FRAG_RESULT_DATA0);
3382b8e80941Smrg      const unsigned load_offset = nir_src_as_uint(instr->src[0]);
3383b8e80941Smrg      const unsigned target = l - FRAG_RESULT_DATA0 + load_offset;
3384b8e80941Smrg      const fs_reg tmp = bld.vgrf(dest.type, 4);
3385b8e80941Smrg
3386b8e80941Smrg      if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
3387b8e80941Smrg         emit_coherent_fb_read(bld, tmp, target);
3388b8e80941Smrg      else
3389b8e80941Smrg         emit_non_coherent_fb_read(bld, tmp, target);
3390b8e80941Smrg
3391b8e80941Smrg      for (unsigned j = 0; j < instr->num_components; j++) {
3392b8e80941Smrg         bld.MOV(offset(dest, bld, j),
3393b8e80941Smrg                 offset(tmp, bld, nir_intrinsic_component(instr) + j));
3394b8e80941Smrg      }
3395b8e80941Smrg
3396b8e80941Smrg      break;
3397b8e80941Smrg   }
3398b8e80941Smrg
3399b8e80941Smrg   case nir_intrinsic_discard:
3400b8e80941Smrg   case nir_intrinsic_discard_if: {
3401b8e80941Smrg      /* We track our discarded pixels in f0.1.  By predicating on it, we can
3402b8e80941Smrg       * update just the flag bits that aren't yet discarded.  If there's no
3403b8e80941Smrg       * condition, we emit a CMP of g0 != g0, so all currently executing
3404b8e80941Smrg       * channels will get turned off.
3405b8e80941Smrg       */
3406b8e80941Smrg      fs_inst *cmp;
3407b8e80941Smrg      if (instr->intrinsic == nir_intrinsic_discard_if) {
3408b8e80941Smrg         cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
3409b8e80941Smrg                       brw_imm_d(0), BRW_CONDITIONAL_Z);
3410b8e80941Smrg      } else {
3411b8e80941Smrg         fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
3412b8e80941Smrg                                       BRW_REGISTER_TYPE_UW));
3413b8e80941Smrg         cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
3414b8e80941Smrg      }
3415b8e80941Smrg      cmp->predicate = BRW_PREDICATE_NORMAL;
3416b8e80941Smrg      cmp->flag_subreg = 1;
3417b8e80941Smrg
3418b8e80941Smrg      if (devinfo->gen >= 6) {
3419b8e80941Smrg         emit_discard_jump();
3420b8e80941Smrg      }
3421b8e80941Smrg
3422b8e80941Smrg      limit_dispatch_width(16, "Fragment discard not implemented in SIMD32 mode.");
3423b8e80941Smrg      break;
3424b8e80941Smrg   }
3425b8e80941Smrg
3426b8e80941Smrg   case nir_intrinsic_load_input: {
3427b8e80941Smrg      /* load_input is only used for flat inputs */
3428b8e80941Smrg      unsigned base = nir_intrinsic_base(instr);
3429b8e80941Smrg      unsigned comp = nir_intrinsic_component(instr);
3430b8e80941Smrg      unsigned num_components = instr->num_components;
3431b8e80941Smrg      fs_reg orig_dest = dest;
3432b8e80941Smrg      enum brw_reg_type type = dest.type;
3433b8e80941Smrg
3434b8e80941Smrg      /* Special case fields in the VUE header */
3435b8e80941Smrg      if (base == VARYING_SLOT_LAYER)
3436b8e80941Smrg         comp = 1;
3437b8e80941Smrg      else if (base == VARYING_SLOT_VIEWPORT)
3438b8e80941Smrg         comp = 2;
3439b8e80941Smrg
3440b8e80941Smrg      if (nir_dest_bit_size(instr->dest) == 64) {
3441b8e80941Smrg         /* const_index is in 32-bit type size units that could not be aligned
3442b8e80941Smrg          * with DF. We need to read the double vector as if it was a float
3443b8e80941Smrg          * vector of twice the number of components to fetch the right data.
3444b8e80941Smrg          */
3445b8e80941Smrg         type = BRW_REGISTER_TYPE_F;
3446b8e80941Smrg         num_components *= 2;
3447b8e80941Smrg         dest = bld.vgrf(type, num_components);
3448b8e80941Smrg      }
3449b8e80941Smrg
3450b8e80941Smrg      for (unsigned int i = 0; i < num_components; i++) {
3451b8e80941Smrg         bld.MOV(offset(retype(dest, type), bld, i),
3452b8e80941Smrg                 retype(component(interp_reg(base, comp + i), 3), type));
3453b8e80941Smrg      }
3454b8e80941Smrg
3455b8e80941Smrg      if (nir_dest_bit_size(instr->dest) == 64) {
3456b8e80941Smrg         shuffle_from_32bit_read(bld, orig_dest, dest, 0,
3457b8e80941Smrg                                 instr->num_components);
3458b8e80941Smrg      }
3459b8e80941Smrg      break;
3460b8e80941Smrg   }
3461b8e80941Smrg
3462b8e80941Smrg   case nir_intrinsic_load_barycentric_pixel:
3463b8e80941Smrg   case nir_intrinsic_load_barycentric_centroid:
3464b8e80941Smrg   case nir_intrinsic_load_barycentric_sample:
3465b8e80941Smrg      /* Do nothing - load_interpolated_input handling will handle it later. */
3466b8e80941Smrg      break;
3467b8e80941Smrg
3468b8e80941Smrg   case nir_intrinsic_load_barycentric_at_sample: {
3469b8e80941Smrg      const glsl_interp_mode interpolation =
3470b8e80941Smrg         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3471b8e80941Smrg
3472b8e80941Smrg      if (nir_src_is_const(instr->src[0])) {
3473b8e80941Smrg         unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4;
3474b8e80941Smrg
3475b8e80941Smrg         emit_pixel_interpolater_send(bld,
3476b8e80941Smrg                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3477b8e80941Smrg                                      dest,
3478b8e80941Smrg                                      fs_reg(), /* src */
3479b8e80941Smrg                                      brw_imm_ud(msg_data),
3480b8e80941Smrg                                      interpolation);
3481b8e80941Smrg      } else {
3482b8e80941Smrg         const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
3483b8e80941Smrg                                          BRW_REGISTER_TYPE_UD);
3484b8e80941Smrg
3485b8e80941Smrg         if (nir_src_is_dynamically_uniform(instr->src[0])) {
3486b8e80941Smrg            const fs_reg sample_id = bld.emit_uniformize(sample_src);
3487b8e80941Smrg            const fs_reg msg_data = vgrf(glsl_type::uint_type);
3488b8e80941Smrg            bld.exec_all().group(1, 0)
3489b8e80941Smrg               .SHL(msg_data, sample_id, brw_imm_ud(4u));
3490b8e80941Smrg            emit_pixel_interpolater_send(bld,
3491b8e80941Smrg                                         FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3492b8e80941Smrg                                         dest,
3493b8e80941Smrg                                         fs_reg(), /* src */
3494b8e80941Smrg                                         msg_data,
3495b8e80941Smrg                                         interpolation);
3496b8e80941Smrg         } else {
3497b8e80941Smrg            /* Make a loop that sends a message to the pixel interpolater
3498b8e80941Smrg             * for the sample number in each live channel. If there are
3499b8e80941Smrg             * multiple channels with the same sample number then these
3500b8e80941Smrg             * will be handled simultaneously with a single interation of
3501b8e80941Smrg             * the loop.
3502b8e80941Smrg             */
3503b8e80941Smrg            bld.emit(BRW_OPCODE_DO);
3504b8e80941Smrg
3505b8e80941Smrg            /* Get the next live sample number into sample_id_reg */
3506b8e80941Smrg            const fs_reg sample_id = bld.emit_uniformize(sample_src);
3507b8e80941Smrg
3508b8e80941Smrg            /* Set the flag register so that we can perform the send
3509b8e80941Smrg             * message on all channels that have the same sample number
3510b8e80941Smrg             */
3511b8e80941Smrg            bld.CMP(bld.null_reg_ud(),
3512b8e80941Smrg                    sample_src, sample_id,
3513b8e80941Smrg                    BRW_CONDITIONAL_EQ);
3514b8e80941Smrg            const fs_reg msg_data = vgrf(glsl_type::uint_type);
3515b8e80941Smrg            bld.exec_all().group(1, 0)
3516b8e80941Smrg               .SHL(msg_data, sample_id, brw_imm_ud(4u));
3517b8e80941Smrg            fs_inst *inst =
3518b8e80941Smrg               emit_pixel_interpolater_send(bld,
3519b8e80941Smrg                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
3520b8e80941Smrg                                            dest,
3521b8e80941Smrg                                            fs_reg(), /* src */
3522b8e80941Smrg                                            component(msg_data, 0),
3523b8e80941Smrg                                            interpolation);
3524b8e80941Smrg            set_predicate(BRW_PREDICATE_NORMAL, inst);
3525b8e80941Smrg
3526b8e80941Smrg            /* Continue the loop if there are any live channels left */
3527b8e80941Smrg            set_predicate_inv(BRW_PREDICATE_NORMAL,
3528b8e80941Smrg                              true, /* inverse */
3529b8e80941Smrg                              bld.emit(BRW_OPCODE_WHILE));
3530b8e80941Smrg         }
3531b8e80941Smrg      }
3532b8e80941Smrg      break;
3533b8e80941Smrg   }
3534b8e80941Smrg
3535b8e80941Smrg   case nir_intrinsic_load_barycentric_at_offset: {
3536b8e80941Smrg      const glsl_interp_mode interpolation =
3537b8e80941Smrg         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
3538b8e80941Smrg
3539b8e80941Smrg      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3540b8e80941Smrg
3541b8e80941Smrg      if (const_offset) {
3542b8e80941Smrg         assert(nir_src_bit_size(instr->src[0]) == 32);
3543b8e80941Smrg         unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf;
3544b8e80941Smrg         unsigned off_y = MIN2((int)(const_offset[1].f32 * 16), 7) & 0xf;
3545b8e80941Smrg
3546b8e80941Smrg         emit_pixel_interpolater_send(bld,
3547b8e80941Smrg                                      FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
3548b8e80941Smrg                                      dest,
3549b8e80941Smrg                                      fs_reg(), /* src */
3550b8e80941Smrg                                      brw_imm_ud(off_x | (off_y << 4)),
3551b8e80941Smrg                                      interpolation);
3552b8e80941Smrg      } else {
3553b8e80941Smrg         fs_reg src = vgrf(glsl_type::ivec2_type);
3554b8e80941Smrg         fs_reg offset_src = retype(get_nir_src(instr->src[0]),
3555b8e80941Smrg                                    BRW_REGISTER_TYPE_F);
3556b8e80941Smrg         for (int i = 0; i < 2; i++) {
3557b8e80941Smrg            fs_reg temp = vgrf(glsl_type::float_type);
3558b8e80941Smrg            bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
3559b8e80941Smrg            fs_reg itemp = vgrf(glsl_type::int_type);
3560b8e80941Smrg            /* float to int */
3561b8e80941Smrg            bld.MOV(itemp, temp);
3562b8e80941Smrg
3563b8e80941Smrg            /* Clamp the upper end of the range to +7/16.
3564b8e80941Smrg             * ARB_gpu_shader5 requires that we support a maximum offset
3565b8e80941Smrg             * of +0.5, which isn't representable in a S0.4 value -- if
3566b8e80941Smrg             * we didn't clamp it, we'd end up with -8/16, which is the
3567b8e80941Smrg             * opposite of what the shader author wanted.
3568b8e80941Smrg             *
3569b8e80941Smrg             * This is legal due to ARB_gpu_shader5's quantization
3570b8e80941Smrg             * rules:
3571b8e80941Smrg             *
3572b8e80941Smrg             * "Not all values of <offset> may be supported; x and y
3573b8e80941Smrg             * offsets may be rounded to fixed-point values with the
3574b8e80941Smrg             * number of fraction bits given by the
3575b8e80941Smrg             * implementation-dependent constant
3576b8e80941Smrg             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
3577b8e80941Smrg             */
3578b8e80941Smrg            set_condmod(BRW_CONDITIONAL_L,
3579b8e80941Smrg                        bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
3580b8e80941Smrg         }
3581b8e80941Smrg
3582b8e80941Smrg         const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
3583b8e80941Smrg         emit_pixel_interpolater_send(bld,
3584b8e80941Smrg                                      opcode,
3585b8e80941Smrg                                      dest,
3586b8e80941Smrg                                      src,
3587b8e80941Smrg                                      brw_imm_ud(0u),
3588b8e80941Smrg                                      interpolation);
3589b8e80941Smrg      }
3590b8e80941Smrg      break;
3591b8e80941Smrg   }
3592b8e80941Smrg
3593b8e80941Smrg   case nir_intrinsic_load_interpolated_input: {
3594b8e80941Smrg      if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
3595b8e80941Smrg         emit_fragcoord_interpolation(dest);
3596b8e80941Smrg         break;
3597b8e80941Smrg      }
3598b8e80941Smrg
3599b8e80941Smrg      assert(instr->src[0].ssa &&
3600b8e80941Smrg             instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
3601b8e80941Smrg      nir_intrinsic_instr *bary_intrinsic =
3602b8e80941Smrg         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
3603b8e80941Smrg      nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
3604b8e80941Smrg      enum glsl_interp_mode interp_mode =
3605b8e80941Smrg         (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
3606b8e80941Smrg      fs_reg dst_xy;
3607b8e80941Smrg
3608b8e80941Smrg      if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
3609b8e80941Smrg          bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
3610b8e80941Smrg         /* Use the result of the PI message */
3611b8e80941Smrg         dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
3612b8e80941Smrg      } else {
3613b8e80941Smrg         /* Use the delta_xy values computed from the payload */
3614b8e80941Smrg         enum brw_barycentric_mode bary =
3615b8e80941Smrg            brw_barycentric_mode(interp_mode, bary_intrin);
3616b8e80941Smrg
3617b8e80941Smrg         dst_xy = this->delta_xy[bary];
3618b8e80941Smrg      }
3619b8e80941Smrg
3620b8e80941Smrg      for (unsigned int i = 0; i < instr->num_components; i++) {
3621b8e80941Smrg         fs_reg interp =
3622b8e80941Smrg            component(interp_reg(nir_intrinsic_base(instr),
3623b8e80941Smrg                                 nir_intrinsic_component(instr) + i), 0);
3624b8e80941Smrg         interp.type = BRW_REGISTER_TYPE_F;
3625b8e80941Smrg         dest.type = BRW_REGISTER_TYPE_F;
3626b8e80941Smrg
3627b8e80941Smrg         if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
3628b8e80941Smrg            fs_reg tmp = vgrf(glsl_type::float_type);
3629b8e80941Smrg            bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
3630b8e80941Smrg            bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
3631b8e80941Smrg         } else {
3632b8e80941Smrg            bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
3633b8e80941Smrg         }
3634b8e80941Smrg      }
3635b8e80941Smrg      break;
3636b8e80941Smrg   }
3637b8e80941Smrg
3638b8e80941Smrg   default:
3639b8e80941Smrg      nir_emit_intrinsic(bld, instr);
3640b8e80941Smrg      break;
3641b8e80941Smrg   }
3642b8e80941Smrg}
3643b8e80941Smrg
3644b8e80941Smrgstatic int
3645b8e80941Smrgget_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src)
3646b8e80941Smrg{
3647b8e80941Smrg   if (nir_src_is_const(instr->src[src])) {
3648b8e80941Smrg      int64_t add_val = nir_src_as_int(instr->src[src]);
3649b8e80941Smrg      if (add_val == 1)
3650b8e80941Smrg         return BRW_AOP_INC;
3651b8e80941Smrg      else if (add_val == -1)
3652b8e80941Smrg         return BRW_AOP_DEC;
3653b8e80941Smrg   }
3654b8e80941Smrg
3655b8e80941Smrg   return BRW_AOP_ADD;
3656b8e80941Smrg}
3657b8e80941Smrg
3658b8e80941Smrgvoid
3659b8e80941Smrgfs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
3660b8e80941Smrg                                  nir_intrinsic_instr *instr)
3661b8e80941Smrg{
3662b8e80941Smrg   assert(stage == MESA_SHADER_COMPUTE);
3663b8e80941Smrg   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
3664b8e80941Smrg
3665b8e80941Smrg   fs_reg dest;
3666b8e80941Smrg   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3667b8e80941Smrg      dest = get_nir_dest(instr->dest);
3668b8e80941Smrg
3669b8e80941Smrg   switch (instr->intrinsic) {
3670b8e80941Smrg   case nir_intrinsic_barrier:
3671b8e80941Smrg      emit_barrier();
3672b8e80941Smrg      cs_prog_data->uses_barrier = true;
3673b8e80941Smrg      break;
3674b8e80941Smrg
3675b8e80941Smrg   case nir_intrinsic_load_subgroup_id:
3676b8e80941Smrg      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id);
3677b8e80941Smrg      break;
3678b8e80941Smrg
3679b8e80941Smrg   case nir_intrinsic_load_local_invocation_id:
3680b8e80941Smrg   case nir_intrinsic_load_work_group_id: {
3681b8e80941Smrg      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
3682b8e80941Smrg      fs_reg val = nir_system_values[sv];
3683b8e80941Smrg      assert(val.file != BAD_FILE);
3684b8e80941Smrg      dest.type = val.type;
3685b8e80941Smrg      for (unsigned i = 0; i < 3; i++)
3686b8e80941Smrg         bld.MOV(offset(dest, bld, i), offset(val, bld, i));
3687b8e80941Smrg      break;
3688b8e80941Smrg   }
3689b8e80941Smrg
3690b8e80941Smrg   case nir_intrinsic_load_num_work_groups: {
3691b8e80941Smrg      const unsigned surface =
3692b8e80941Smrg         cs_prog_data->binding_table.work_groups_start;
3693b8e80941Smrg
3694b8e80941Smrg      cs_prog_data->uses_num_work_groups = true;
3695b8e80941Smrg
3696b8e80941Smrg      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3697b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface);
3698b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3699b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); /* num components */
3700b8e80941Smrg
3701b8e80941Smrg      /* Read the 3 GLuint components of gl_NumWorkGroups */
3702b8e80941Smrg      for (unsigned i = 0; i < 3; i++) {
3703b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(i << 2);
3704b8e80941Smrg         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3705b8e80941Smrg                  offset(dest, bld, i), srcs, SURFACE_LOGICAL_NUM_SRCS);
3706b8e80941Smrg      }
3707b8e80941Smrg      break;
3708b8e80941Smrg   }
3709b8e80941Smrg
3710b8e80941Smrg   case nir_intrinsic_shared_atomic_add:
3711b8e80941Smrg      nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr);
3712b8e80941Smrg      break;
3713b8e80941Smrg   case nir_intrinsic_shared_atomic_imin:
3714b8e80941Smrg      nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
3715b8e80941Smrg      break;
3716b8e80941Smrg   case nir_intrinsic_shared_atomic_umin:
3717b8e80941Smrg      nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
3718b8e80941Smrg      break;
3719b8e80941Smrg   case nir_intrinsic_shared_atomic_imax:
3720b8e80941Smrg      nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
3721b8e80941Smrg      break;
3722b8e80941Smrg   case nir_intrinsic_shared_atomic_umax:
3723b8e80941Smrg      nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
3724b8e80941Smrg      break;
3725b8e80941Smrg   case nir_intrinsic_shared_atomic_and:
3726b8e80941Smrg      nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
3727b8e80941Smrg      break;
3728b8e80941Smrg   case nir_intrinsic_shared_atomic_or:
3729b8e80941Smrg      nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
3730b8e80941Smrg      break;
3731b8e80941Smrg   case nir_intrinsic_shared_atomic_xor:
3732b8e80941Smrg      nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
3733b8e80941Smrg      break;
3734b8e80941Smrg   case nir_intrinsic_shared_atomic_exchange:
3735b8e80941Smrg      nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
3736b8e80941Smrg      break;
3737b8e80941Smrg   case nir_intrinsic_shared_atomic_comp_swap:
3738b8e80941Smrg      nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
3739b8e80941Smrg      break;
3740b8e80941Smrg   case nir_intrinsic_shared_atomic_fmin:
3741b8e80941Smrg      nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr);
3742b8e80941Smrg      break;
3743b8e80941Smrg   case nir_intrinsic_shared_atomic_fmax:
3744b8e80941Smrg      nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr);
3745b8e80941Smrg      break;
3746b8e80941Smrg   case nir_intrinsic_shared_atomic_fcomp_swap:
3747b8e80941Smrg      nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr);
3748b8e80941Smrg      break;
3749b8e80941Smrg
3750b8e80941Smrg   case nir_intrinsic_load_shared: {
3751b8e80941Smrg      assert(devinfo->gen >= 7);
3752b8e80941Smrg      assert(stage == MESA_SHADER_COMPUTE);
3753b8e80941Smrg
3754b8e80941Smrg      const unsigned bit_size = nir_dest_bit_size(instr->dest);
3755b8e80941Smrg      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3756b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
3757b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]);
3758b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3759b8e80941Smrg
3760b8e80941Smrg      /* Make dest unsigned because that's what the temporary will be */
3761b8e80941Smrg      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3762b8e80941Smrg
3763b8e80941Smrg      /* Read the vector */
3764b8e80941Smrg      if (nir_intrinsic_align(instr) >= 4) {
3765b8e80941Smrg         assert(nir_dest_bit_size(instr->dest) == 32);
3766b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3767b8e80941Smrg         fs_inst *inst =
3768b8e80941Smrg            bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
3769b8e80941Smrg                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
3770b8e80941Smrg         inst->size_written = instr->num_components * dispatch_width * 4;
3771b8e80941Smrg      } else {
3772b8e80941Smrg         assert(nir_dest_bit_size(instr->dest) <= 32);
3773b8e80941Smrg         assert(nir_dest_num_components(instr->dest) == 1);
3774b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3775b8e80941Smrg
3776b8e80941Smrg         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
3777b8e80941Smrg         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
3778b8e80941Smrg                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
3779b8e80941Smrg         bld.MOV(dest, read_result);
3780b8e80941Smrg      }
3781b8e80941Smrg      break;
3782b8e80941Smrg   }
3783b8e80941Smrg
3784b8e80941Smrg   case nir_intrinsic_store_shared: {
3785b8e80941Smrg      assert(devinfo->gen >= 7);
3786b8e80941Smrg      assert(stage == MESA_SHADER_COMPUTE);
3787b8e80941Smrg
3788b8e80941Smrg      const unsigned bit_size = nir_src_bit_size(instr->src[0]);
3789b8e80941Smrg      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3790b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
3791b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
3792b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
3793b8e80941Smrg
3794b8e80941Smrg      fs_reg data = get_nir_src(instr->src[0]);
3795b8e80941Smrg      data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
3796b8e80941Smrg
3797b8e80941Smrg      assert(nir_intrinsic_write_mask(instr) ==
3798b8e80941Smrg             (1u << instr->num_components) - 1);
3799b8e80941Smrg      if (nir_intrinsic_align(instr) >= 4) {
3800b8e80941Smrg         assert(nir_src_bit_size(instr->src[0]) == 32);
3801b8e80941Smrg         assert(nir_src_num_components(instr->src[0]) <= 4);
3802b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
3803b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
3804b8e80941Smrg         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
3805b8e80941Smrg                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3806b8e80941Smrg      } else {
3807b8e80941Smrg         assert(nir_src_bit_size(instr->src[0]) <= 32);
3808b8e80941Smrg         assert(nir_src_num_components(instr->src[0]) == 1);
3809b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
3810b8e80941Smrg
3811b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
3812b8e80941Smrg         bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);
3813b8e80941Smrg
3814b8e80941Smrg         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
3815b8e80941Smrg                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
3816b8e80941Smrg      }
3817b8e80941Smrg      break;
3818b8e80941Smrg   }
3819b8e80941Smrg
3820b8e80941Smrg   default:
3821b8e80941Smrg      nir_emit_intrinsic(bld, instr);
3822b8e80941Smrg      break;
3823b8e80941Smrg   }
3824b8e80941Smrg}
3825b8e80941Smrg
3826b8e80941Smrgstatic fs_reg
3827b8e80941Smrgbrw_nir_reduction_op_identity(const fs_builder &bld,
3828b8e80941Smrg                              nir_op op, brw_reg_type type)
3829b8e80941Smrg{
3830b8e80941Smrg   nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8);
3831b8e80941Smrg   switch (type_sz(type)) {
3832b8e80941Smrg   case 2:
3833b8e80941Smrg      assert(type != BRW_REGISTER_TYPE_HF);
3834b8e80941Smrg      return retype(brw_imm_uw(value.u16), type);
3835b8e80941Smrg   case 4:
3836b8e80941Smrg      return retype(brw_imm_ud(value.u32), type);
3837b8e80941Smrg   case 8:
3838b8e80941Smrg      if (type == BRW_REGISTER_TYPE_DF)
3839b8e80941Smrg         return setup_imm_df(bld, value.f64);
3840b8e80941Smrg      else
3841b8e80941Smrg         return retype(brw_imm_u64(value.u64), type);
3842b8e80941Smrg   default:
3843b8e80941Smrg      unreachable("Invalid type size");
3844b8e80941Smrg   }
3845b8e80941Smrg}
3846b8e80941Smrg
3847b8e80941Smrgstatic opcode
3848b8e80941Smrgbrw_op_for_nir_reduction_op(nir_op op)
3849b8e80941Smrg{
3850b8e80941Smrg   switch (op) {
3851b8e80941Smrg   case nir_op_iadd: return BRW_OPCODE_ADD;
3852b8e80941Smrg   case nir_op_fadd: return BRW_OPCODE_ADD;
3853b8e80941Smrg   case nir_op_imul: return BRW_OPCODE_MUL;
3854b8e80941Smrg   case nir_op_fmul: return BRW_OPCODE_MUL;
3855b8e80941Smrg   case nir_op_imin: return BRW_OPCODE_SEL;
3856b8e80941Smrg   case nir_op_umin: return BRW_OPCODE_SEL;
3857b8e80941Smrg   case nir_op_fmin: return BRW_OPCODE_SEL;
3858b8e80941Smrg   case nir_op_imax: return BRW_OPCODE_SEL;
3859b8e80941Smrg   case nir_op_umax: return BRW_OPCODE_SEL;
3860b8e80941Smrg   case nir_op_fmax: return BRW_OPCODE_SEL;
3861b8e80941Smrg   case nir_op_iand: return BRW_OPCODE_AND;
3862b8e80941Smrg   case nir_op_ior:  return BRW_OPCODE_OR;
3863b8e80941Smrg   case nir_op_ixor: return BRW_OPCODE_XOR;
3864b8e80941Smrg   default:
3865b8e80941Smrg      unreachable("Invalid reduction operation");
3866b8e80941Smrg   }
3867b8e80941Smrg}
3868b8e80941Smrg
3869b8e80941Smrgstatic brw_conditional_mod
3870b8e80941Smrgbrw_cond_mod_for_nir_reduction_op(nir_op op)
3871b8e80941Smrg{
3872b8e80941Smrg   switch (op) {
3873b8e80941Smrg   case nir_op_iadd: return BRW_CONDITIONAL_NONE;
3874b8e80941Smrg   case nir_op_fadd: return BRW_CONDITIONAL_NONE;
3875b8e80941Smrg   case nir_op_imul: return BRW_CONDITIONAL_NONE;
3876b8e80941Smrg   case nir_op_fmul: return BRW_CONDITIONAL_NONE;
3877b8e80941Smrg   case nir_op_imin: return BRW_CONDITIONAL_L;
3878b8e80941Smrg   case nir_op_umin: return BRW_CONDITIONAL_L;
3879b8e80941Smrg   case nir_op_fmin: return BRW_CONDITIONAL_L;
3880b8e80941Smrg   case nir_op_imax: return BRW_CONDITIONAL_GE;
3881b8e80941Smrg   case nir_op_umax: return BRW_CONDITIONAL_GE;
3882b8e80941Smrg   case nir_op_fmax: return BRW_CONDITIONAL_GE;
3883b8e80941Smrg   case nir_op_iand: return BRW_CONDITIONAL_NONE;
3884b8e80941Smrg   case nir_op_ior:  return BRW_CONDITIONAL_NONE;
3885b8e80941Smrg   case nir_op_ixor: return BRW_CONDITIONAL_NONE;
3886b8e80941Smrg   default:
3887b8e80941Smrg      unreachable("Invalid reduction operation");
3888b8e80941Smrg   }
3889b8e80941Smrg}
3890b8e80941Smrg
3891b8e80941Smrgfs_reg
3892b8e80941Smrgfs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld,
3893b8e80941Smrg                                          nir_intrinsic_instr *instr)
3894b8e80941Smrg{
3895b8e80941Smrg   fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD);
3896b8e80941Smrg
3897b8e80941Smrg   if (stage_prog_data->binding_table.image_start > 0) {
3898b8e80941Smrg      if (image.file == BRW_IMMEDIATE_VALUE) {
3899b8e80941Smrg         image.d += stage_prog_data->binding_table.image_start;
3900b8e80941Smrg      } else {
3901b8e80941Smrg         bld.ADD(image, image,
3902b8e80941Smrg                 brw_imm_d(stage_prog_data->binding_table.image_start));
3903b8e80941Smrg      }
3904b8e80941Smrg   }
3905b8e80941Smrg
3906b8e80941Smrg   return bld.emit_uniformize(image);
3907b8e80941Smrg}
3908b8e80941Smrg
3909b8e80941Smrgfs_reg
3910b8e80941Smrgfs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld,
3911b8e80941Smrg                                         nir_intrinsic_instr *instr)
3912b8e80941Smrg{
3913b8e80941Smrg   /* SSBO stores are weird in that their index is in src[1] */
3914b8e80941Smrg   const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0;
3915b8e80941Smrg
3916b8e80941Smrg   fs_reg surf_index;
3917b8e80941Smrg   if (nir_src_is_const(instr->src[src])) {
3918b8e80941Smrg      unsigned index = stage_prog_data->binding_table.ssbo_start +
3919b8e80941Smrg                       nir_src_as_uint(instr->src[src]);
3920b8e80941Smrg      surf_index = brw_imm_ud(index);
3921b8e80941Smrg   } else {
3922b8e80941Smrg      surf_index = vgrf(glsl_type::uint_type);
3923b8e80941Smrg      bld.ADD(surf_index, get_nir_src(instr->src[src]),
3924b8e80941Smrg              brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
3925b8e80941Smrg   }
3926b8e80941Smrg
3927b8e80941Smrg   return bld.emit_uniformize(surf_index);
3928b8e80941Smrg}
3929b8e80941Smrg
3930b8e80941Smrgstatic unsigned
3931b8e80941Smrgimage_intrinsic_coord_components(nir_intrinsic_instr *instr)
3932b8e80941Smrg{
3933b8e80941Smrg   switch (nir_intrinsic_image_dim(instr)) {
3934b8e80941Smrg   case GLSL_SAMPLER_DIM_1D:
3935b8e80941Smrg      return 1 + nir_intrinsic_image_array(instr);
3936b8e80941Smrg   case GLSL_SAMPLER_DIM_2D:
3937b8e80941Smrg   case GLSL_SAMPLER_DIM_RECT:
3938b8e80941Smrg      return 2 + nir_intrinsic_image_array(instr);
3939b8e80941Smrg   case GLSL_SAMPLER_DIM_3D:
3940b8e80941Smrg   case GLSL_SAMPLER_DIM_CUBE:
3941b8e80941Smrg      return 3;
3942b8e80941Smrg   case GLSL_SAMPLER_DIM_BUF:
3943b8e80941Smrg      return 1;
3944b8e80941Smrg   case GLSL_SAMPLER_DIM_MS:
3945b8e80941Smrg      return 2 + nir_intrinsic_image_array(instr);
3946b8e80941Smrg   default:
3947b8e80941Smrg      unreachable("Invalid image dimension");
3948b8e80941Smrg   }
3949b8e80941Smrg}
3950b8e80941Smrg
3951b8e80941Smrgvoid
3952b8e80941Smrgfs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
3953b8e80941Smrg{
3954b8e80941Smrg   fs_reg dest;
3955b8e80941Smrg   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
3956b8e80941Smrg      dest = get_nir_dest(instr->dest);
3957b8e80941Smrg
3958b8e80941Smrg   switch (instr->intrinsic) {
3959b8e80941Smrg   case nir_intrinsic_image_load:
3960b8e80941Smrg   case nir_intrinsic_image_store:
3961b8e80941Smrg   case nir_intrinsic_image_atomic_add:
3962b8e80941Smrg   case nir_intrinsic_image_atomic_min:
3963b8e80941Smrg   case nir_intrinsic_image_atomic_max:
3964b8e80941Smrg   case nir_intrinsic_image_atomic_and:
3965b8e80941Smrg   case nir_intrinsic_image_atomic_or:
3966b8e80941Smrg   case nir_intrinsic_image_atomic_xor:
3967b8e80941Smrg   case nir_intrinsic_image_atomic_exchange:
3968b8e80941Smrg   case nir_intrinsic_image_atomic_comp_swap:
3969b8e80941Smrg   case nir_intrinsic_bindless_image_load:
3970b8e80941Smrg   case nir_intrinsic_bindless_image_store:
3971b8e80941Smrg   case nir_intrinsic_bindless_image_atomic_add:
3972b8e80941Smrg   case nir_intrinsic_bindless_image_atomic_min:
3973b8e80941Smrg   case nir_intrinsic_bindless_image_atomic_max:
3974b8e80941Smrg   case nir_intrinsic_bindless_image_atomic_and:
3975b8e80941Smrg   case nir_intrinsic_bindless_image_atomic_or:
3976b8e80941Smrg   case nir_intrinsic_bindless_image_atomic_xor:
3977b8e80941Smrg   case nir_intrinsic_bindless_image_atomic_exchange:
3978b8e80941Smrg   case nir_intrinsic_bindless_image_atomic_comp_swap: {
3979b8e80941Smrg      if (stage == MESA_SHADER_FRAGMENT &&
3980b8e80941Smrg          instr->intrinsic != nir_intrinsic_image_load)
3981b8e80941Smrg         brw_wm_prog_data(prog_data)->has_side_effects = true;
3982b8e80941Smrg
3983b8e80941Smrg      /* Get some metadata from the image intrinsic. */
3984b8e80941Smrg      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
3985b8e80941Smrg      const GLenum format = nir_intrinsic_format(instr);
3986b8e80941Smrg
3987b8e80941Smrg      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
3988b8e80941Smrg
3989b8e80941Smrg      switch (instr->intrinsic) {
3990b8e80941Smrg      case nir_intrinsic_image_load:
3991b8e80941Smrg      case nir_intrinsic_image_store:
3992b8e80941Smrg      case nir_intrinsic_image_atomic_add:
3993b8e80941Smrg      case nir_intrinsic_image_atomic_min:
3994b8e80941Smrg      case nir_intrinsic_image_atomic_max:
3995b8e80941Smrg      case nir_intrinsic_image_atomic_and:
3996b8e80941Smrg      case nir_intrinsic_image_atomic_or:
3997b8e80941Smrg      case nir_intrinsic_image_atomic_xor:
3998b8e80941Smrg      case nir_intrinsic_image_atomic_exchange:
3999b8e80941Smrg      case nir_intrinsic_image_atomic_comp_swap:
4000b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4001b8e80941Smrg            get_nir_image_intrinsic_image(bld, instr);
4002b8e80941Smrg         break;
4003b8e80941Smrg
4004b8e80941Smrg      default:
4005b8e80941Smrg         /* Bindless */
4006b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] =
4007b8e80941Smrg            bld.emit_uniformize(get_nir_src(instr->src[0]));
4008b8e80941Smrg         break;
4009b8e80941Smrg      }
4010b8e80941Smrg
4011b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4012b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] =
4013b8e80941Smrg         brw_imm_ud(image_intrinsic_coord_components(instr));
4014b8e80941Smrg
4015b8e80941Smrg      /* Emit an image load, store or atomic op. */
4016b8e80941Smrg      if (instr->intrinsic == nir_intrinsic_image_load ||
4017b8e80941Smrg          instr->intrinsic == nir_intrinsic_bindless_image_load) {
4018b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4019b8e80941Smrg         fs_inst *inst =
4020b8e80941Smrg            bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
4021b8e80941Smrg                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4022b8e80941Smrg         inst->size_written = instr->num_components * dispatch_width * 4;
4023b8e80941Smrg      } else if (instr->intrinsic == nir_intrinsic_image_store ||
4024b8e80941Smrg                 instr->intrinsic == nir_intrinsic_bindless_image_store) {
4025b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4026b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]);
4027b8e80941Smrg         bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
4028b8e80941Smrg                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4029b8e80941Smrg      } else {
4030b8e80941Smrg         int op;
4031b8e80941Smrg         unsigned num_srcs = info->num_srcs;
4032b8e80941Smrg
4033b8e80941Smrg         switch (instr->intrinsic) {
4034b8e80941Smrg         case nir_intrinsic_image_atomic_add:
4035b8e80941Smrg         case nir_intrinsic_bindless_image_atomic_add:
4036b8e80941Smrg            assert(num_srcs == 4);
4037b8e80941Smrg
4038b8e80941Smrg            op = get_op_for_atomic_add(instr, 3);
4039b8e80941Smrg
4040b8e80941Smrg            if (op != BRW_AOP_ADD)
4041b8e80941Smrg               num_srcs = 3;
4042b8e80941Smrg            break;
4043b8e80941Smrg         case nir_intrinsic_image_atomic_min:
4044b8e80941Smrg         case nir_intrinsic_bindless_image_atomic_min:
4045b8e80941Smrg            assert(format == GL_R32UI || format == GL_R32I);
4046b8e80941Smrg            op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN;
4047b8e80941Smrg            break;
4048b8e80941Smrg         case nir_intrinsic_image_atomic_max:
4049b8e80941Smrg         case nir_intrinsic_bindless_image_atomic_max:
4050b8e80941Smrg            assert(format == GL_R32UI || format == GL_R32I);
4051b8e80941Smrg            op = (format == GL_R32I) ? BRW_AOP_IMAX : BRW_AOP_UMAX;
4052b8e80941Smrg            break;
4053b8e80941Smrg         case nir_intrinsic_image_atomic_and:
4054b8e80941Smrg         case nir_intrinsic_bindless_image_atomic_and:
4055b8e80941Smrg            op = BRW_AOP_AND;
4056b8e80941Smrg            break;
4057b8e80941Smrg         case nir_intrinsic_image_atomic_or:
4058b8e80941Smrg         case nir_intrinsic_bindless_image_atomic_or:
4059b8e80941Smrg            op = BRW_AOP_OR;
4060b8e80941Smrg            break;
4061b8e80941Smrg         case nir_intrinsic_image_atomic_xor:
4062b8e80941Smrg         case nir_intrinsic_bindless_image_atomic_xor:
4063b8e80941Smrg            op = BRW_AOP_XOR;
4064b8e80941Smrg            break;
4065b8e80941Smrg         case nir_intrinsic_image_atomic_exchange:
4066b8e80941Smrg         case nir_intrinsic_bindless_image_atomic_exchange:
4067b8e80941Smrg            op = BRW_AOP_MOV;
4068b8e80941Smrg            break;
4069b8e80941Smrg         case nir_intrinsic_image_atomic_comp_swap:
4070b8e80941Smrg         case nir_intrinsic_bindless_image_atomic_comp_swap:
4071b8e80941Smrg            op = BRW_AOP_CMPWR;
4072b8e80941Smrg            break;
4073b8e80941Smrg         default:
4074b8e80941Smrg            unreachable("Not reachable.");
4075b8e80941Smrg         }
4076b8e80941Smrg
4077b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
4078b8e80941Smrg
4079b8e80941Smrg         fs_reg data;
4080b8e80941Smrg         if (num_srcs >= 4)
4081b8e80941Smrg            data = get_nir_src(instr->src[3]);
4082b8e80941Smrg         if (num_srcs >= 5) {
4083b8e80941Smrg            fs_reg tmp = bld.vgrf(data.type, 2);
4084b8e80941Smrg            fs_reg sources[2] = { data, get_nir_src(instr->src[4]) };
4085b8e80941Smrg            bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
4086b8e80941Smrg            data = tmp;
4087b8e80941Smrg         }
4088b8e80941Smrg         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
4089b8e80941Smrg
4090b8e80941Smrg         bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
4091b8e80941Smrg                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4092b8e80941Smrg      }
4093b8e80941Smrg      break;
4094b8e80941Smrg   }
4095b8e80941Smrg
4096b8e80941Smrg   case nir_intrinsic_image_size:
4097b8e80941Smrg   case nir_intrinsic_bindless_image_size: {
4098b8e80941Smrg      /* Unlike the [un]typed load and store opcodes, the TXS that this turns
4099b8e80941Smrg       * into will handle the binding table index for us in the geneerator.
4100b8e80941Smrg       * Incidentally, this means that we can handle bindless with exactly the
4101b8e80941Smrg       * same code.
4102b8e80941Smrg       */
4103b8e80941Smrg      fs_reg image = retype(get_nir_src_imm(instr->src[0]),
4104b8e80941Smrg                            BRW_REGISTER_TYPE_UD);
4105b8e80941Smrg      image = bld.emit_uniformize(image);
4106b8e80941Smrg
4107b8e80941Smrg      fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
4108b8e80941Smrg      if (instr->intrinsic == nir_intrinsic_image_size)
4109b8e80941Smrg         srcs[TEX_LOGICAL_SRC_SURFACE] = image;
4110b8e80941Smrg      else
4111b8e80941Smrg         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
4112b8e80941Smrg      srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
4113b8e80941Smrg      srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
4114b8e80941Smrg      srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
4115b8e80941Smrg
4116b8e80941Smrg      /* Since the image size is always uniform, we can just emit a SIMD8
4117b8e80941Smrg       * query instruction and splat the result out.
4118b8e80941Smrg       */
4119b8e80941Smrg      const fs_builder ubld = bld.exec_all().group(8, 0);
4120b8e80941Smrg
4121b8e80941Smrg      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4122b8e80941Smrg      fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL,
4123b8e80941Smrg                                tmp, srcs, ARRAY_SIZE(srcs));
4124b8e80941Smrg      inst->size_written = 4 * REG_SIZE;
4125b8e80941Smrg
4126b8e80941Smrg      for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
4127b8e80941Smrg         if (c == 2 && nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE) {
4128b8e80941Smrg            bld.emit(SHADER_OPCODE_INT_QUOTIENT,
4129b8e80941Smrg                     offset(retype(dest, tmp.type), bld, c),
4130b8e80941Smrg                     component(offset(tmp, ubld, c), 0), brw_imm_ud(6));
4131b8e80941Smrg         } else {
4132b8e80941Smrg            bld.MOV(offset(retype(dest, tmp.type), bld, c),
4133b8e80941Smrg                    component(offset(tmp, ubld, c), 0));
4134b8e80941Smrg         }
4135b8e80941Smrg      }
4136b8e80941Smrg      break;
4137b8e80941Smrg   }
4138b8e80941Smrg
4139b8e80941Smrg   case nir_intrinsic_image_load_raw_intel: {
4140b8e80941Smrg      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4141b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4142b8e80941Smrg         get_nir_image_intrinsic_image(bld, instr);
4143b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4144b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4145b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4146b8e80941Smrg
4147b8e80941Smrg      fs_inst *inst =
4148b8e80941Smrg         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
4149b8e80941Smrg                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
4150b8e80941Smrg      inst->size_written = instr->num_components * dispatch_width * 4;
4151b8e80941Smrg      break;
4152b8e80941Smrg   }
4153b8e80941Smrg
4154b8e80941Smrg   case nir_intrinsic_image_store_raw_intel: {
4155b8e80941Smrg      if (stage == MESA_SHADER_FRAGMENT)
4156b8e80941Smrg         brw_wm_prog_data(prog_data)->has_side_effects = true;
4157b8e80941Smrg
4158b8e80941Smrg      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
4159b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
4160b8e80941Smrg         get_nir_image_intrinsic_image(bld, instr);
4161b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
4162b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]);
4163b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
4164b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
4165b8e80941Smrg
4166b8e80941Smrg      bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
4167b8e80941Smrg               fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
4168b8e80941Smrg      break;
4169b8e80941Smrg   }
4170b8e80941Smrg
4171b8e80941Smrg   case nir_intrinsic_group_memory_barrier:
4172b8e80941Smrg   case nir_intrinsic_memory_barrier_shared:
4173b8e80941Smrg   case nir_intrinsic_memory_barrier_atomic_counter:
4174b8e80941Smrg   case nir_intrinsic_memory_barrier_buffer:
4175b8e80941Smrg   case nir_intrinsic_memory_barrier_image:
4176b8e80941Smrg   case nir_intrinsic_memory_barrier: {
4177b8e80941Smrg      const fs_builder ubld = bld.group(8, 0);
4178b8e80941Smrg      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4179b8e80941Smrg      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
4180b8e80941Smrg                brw_vec8_grf(0, 0), brw_imm_ud(0))
4181b8e80941Smrg         ->size_written = 2 * REG_SIZE;
4182b8e80941Smrg      break;
4183b8e80941Smrg   }
4184b8e80941Smrg
4185b8e80941Smrg   case nir_intrinsic_shader_clock: {
4186b8e80941Smrg      /* We cannot do anything if there is an event, so ignore it for now */
4187b8e80941Smrg      const fs_reg shader_clock = get_timestamp(bld);
4188b8e80941Smrg      const fs_reg srcs[] = { component(shader_clock, 0),
4189b8e80941Smrg                              component(shader_clock, 1) };
4190b8e80941Smrg      bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
4191b8e80941Smrg      break;
4192b8e80941Smrg   }
4193b8e80941Smrg
4194b8e80941Smrg   case nir_intrinsic_image_samples:
4195b8e80941Smrg      /* The driver does not support multi-sampled images. */
4196b8e80941Smrg      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
4197b8e80941Smrg      break;
4198b8e80941Smrg
4199b8e80941Smrg   case nir_intrinsic_load_uniform: {
4200b8e80941Smrg      /* Offsets are in bytes but they should always aligned to
4201b8e80941Smrg       * the type size
4202b8e80941Smrg       */
4203b8e80941Smrg      assert(instr->const_index[0] % 4 == 0 ||
4204b8e80941Smrg             instr->const_index[0] % type_sz(dest.type) == 0);
4205b8e80941Smrg
4206b8e80941Smrg      fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
4207b8e80941Smrg
4208b8e80941Smrg      if (nir_src_is_const(instr->src[0])) {
4209b8e80941Smrg         unsigned load_offset = nir_src_as_uint(instr->src[0]);
4210b8e80941Smrg         assert(load_offset % type_sz(dest.type) == 0);
4211b8e80941Smrg         /* For 16-bit types we add the module of the const_index[0]
4212b8e80941Smrg          * offset to access to not 32-bit aligned element
4213b8e80941Smrg          */
4214b8e80941Smrg         src.offset = load_offset + instr->const_index[0] % 4;
4215b8e80941Smrg
4216b8e80941Smrg         for (unsigned j = 0; j < instr->num_components; j++) {
4217b8e80941Smrg            bld.MOV(offset(dest, bld, j), offset(src, bld, j));
4218b8e80941Smrg         }
4219b8e80941Smrg      } else {
4220b8e80941Smrg         fs_reg indirect = retype(get_nir_src(instr->src[0]),
4221b8e80941Smrg                                  BRW_REGISTER_TYPE_UD);
4222b8e80941Smrg
4223b8e80941Smrg         /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4224b8e80941Smrg          * go past the end of the uniform.  In order to keep the n'th
4225b8e80941Smrg          * component from running past, we subtract off the size of all but
4226b8e80941Smrg          * one component of the vector.
4227b8e80941Smrg          */
4228b8e80941Smrg         assert(instr->const_index[1] >=
4229b8e80941Smrg                instr->num_components * (int) type_sz(dest.type));
4230b8e80941Smrg         unsigned read_size = instr->const_index[1] -
4231b8e80941Smrg            (instr->num_components - 1) * type_sz(dest.type);
4232b8e80941Smrg
4233b8e80941Smrg         bool supports_64bit_indirects =
4234b8e80941Smrg            !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo);
4235b8e80941Smrg
4236b8e80941Smrg         if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
4237b8e80941Smrg            for (unsigned j = 0; j < instr->num_components; j++) {
4238b8e80941Smrg               bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4239b8e80941Smrg                        offset(dest, bld, j), offset(src, bld, j),
4240b8e80941Smrg                        indirect, brw_imm_ud(read_size));
4241b8e80941Smrg            }
4242b8e80941Smrg         } else {
4243b8e80941Smrg            const unsigned num_mov_indirects =
4244b8e80941Smrg               type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
4245b8e80941Smrg            /* We read a little bit less per MOV INDIRECT, as they are now
4246b8e80941Smrg             * 32-bits ones instead of 64-bit. Fix read_size then.
4247b8e80941Smrg             */
4248b8e80941Smrg            const unsigned read_size_32bit = read_size -
4249b8e80941Smrg                (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
4250b8e80941Smrg            for (unsigned j = 0; j < instr->num_components; j++) {
4251b8e80941Smrg               for (unsigned i = 0; i < num_mov_indirects; i++) {
4252b8e80941Smrg                  bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4253b8e80941Smrg                           subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
4254b8e80941Smrg                           subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
4255b8e80941Smrg                           indirect, brw_imm_ud(read_size_32bit));
4256b8e80941Smrg               }
4257b8e80941Smrg            }
4258b8e80941Smrg         }
4259b8e80941Smrg      }
4260b8e80941Smrg      break;
4261b8e80941Smrg   }
4262b8e80941Smrg
4263b8e80941Smrg   case nir_intrinsic_load_ubo: {
4264b8e80941Smrg      fs_reg surf_index;
4265b8e80941Smrg      if (nir_src_is_const(instr->src[0])) {
4266b8e80941Smrg         const unsigned index = stage_prog_data->binding_table.ubo_start +
4267b8e80941Smrg                                nir_src_as_uint(instr->src[0]);
4268b8e80941Smrg         surf_index = brw_imm_ud(index);
4269b8e80941Smrg      } else {
4270b8e80941Smrg         /* The block index is not a constant. Evaluate the index expression
4271b8e80941Smrg          * per-channel and add the base UBO index; we have to select a value
4272b8e80941Smrg          * from any live channel.
4273b8e80941Smrg          */
4274b8e80941Smrg         surf_index = vgrf(glsl_type::uint_type);
4275b8e80941Smrg         bld.ADD(surf_index, get_nir_src(instr->src[0]),
4276b8e80941Smrg                 brw_imm_ud(stage_prog_data->binding_table.ubo_start));
4277b8e80941Smrg         surf_index = bld.emit_uniformize(surf_index);
4278b8e80941Smrg      }
4279b8e80941Smrg
4280b8e80941Smrg      if (!nir_src_is_const(instr->src[1])) {
4281b8e80941Smrg         fs_reg base_offset = retype(get_nir_src(instr->src[1]),
4282b8e80941Smrg                                     BRW_REGISTER_TYPE_UD);
4283b8e80941Smrg
4284b8e80941Smrg         for (int i = 0; i < instr->num_components; i++)
4285b8e80941Smrg            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
4286b8e80941Smrg                                       base_offset, i * type_sz(dest.type));
4287b8e80941Smrg      } else {
4288b8e80941Smrg         /* Even if we are loading doubles, a pull constant load will load
4289b8e80941Smrg          * a 32-bit vec4, so should only reserve vgrf space for that. If we
4290b8e80941Smrg          * need to load a full dvec4 we will have to emit 2 loads. This is
4291b8e80941Smrg          * similar to demote_pull_constants(), except that in that case we
4292b8e80941Smrg          * see individual accesses to each component of the vector and then
4293b8e80941Smrg          * we let CSE deal with duplicate loads. Here we see a vector access
4294b8e80941Smrg          * and we have to split it if necessary.
4295b8e80941Smrg          */
4296b8e80941Smrg         const unsigned type_size = type_sz(dest.type);
         const unsigned load_offset = nir_src_as_uint(instr->src[1]);

         /* See if we've selected this as a push constant candidate */
         if (nir_src_is_const(instr->src[0])) {
            const unsigned ubo_block = nir_src_as_uint(instr->src[0]);
            /* Push ranges are tracked in 32-byte (256-bit) granules. */
            const unsigned offset_256b = load_offset / 32;

            fs_reg push_reg;
            for (int i = 0; i < 4; i++) {
               const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
               if (range->block == ubo_block &&
                   offset_256b >= range->start &&
                   offset_256b < range->start + range->length) {

                  /* Hit: read straight out of the pushed-uniform file. */
                  push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
                  push_reg.offset = load_offset - 32 * range->start;
                  break;
               }
            }

            if (push_reg.file != BAD_FILE) {
               /* type_size is presumably the per-component byte size set up
                * earlier in this case — TODO confirm against the lines above
                * this view.
                */
               for (unsigned i = 0; i < instr->num_components; i++) {
                  bld.MOV(offset(dest, bld, i),
                          byte_offset(push_reg, i * type_size));
               }
               break;
            }
         }

         /* Not pushed: fall back to pull-constant loads, one block-aligned
          * cacheline fetch at a time.  surf_index comes from the surface
          * setup earlier in this case (outside this view).
          */
         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
         const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
         const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);

         for (unsigned c = 0; c < instr->num_components;) {
            const unsigned base = load_offset + c * type_size;
            /* Number of usable components in the next block-aligned load. */
            const unsigned count = MIN2(instr->num_components - c,
                                        (block_sz - base % block_sz) / type_size);

            /* Load the whole 64B block containing `base`... */
            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                      packed_consts, surf_index,
                      brw_imm_ud(base & ~(block_sz - 1)));

            /* ...then view it at the intra-block offset with the dest type. */
            const fs_reg consts =
               retype(byte_offset(packed_consts, base & (block_sz - 1)),
                      dest.type);

            for (unsigned d = 0; d < count; d++)
               bld.MOV(offset(dest, bld, c + d), component(consts, d));

            c += count;
         }
      }
      break;
   }
4352b8e80941Smrg
   case nir_intrinsic_load_global: {
      /* A64 (64-bit address) messages are only available on gen8+. */
      assert(devinfo->gen >= 8);

      if (nir_intrinsic_align(instr) >= 4) {
         /* Dword-aligned: one untyped A64 read covers all components. */
         assert(nir_dest_bit_size(instr->dest) == 32);
         fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL,
                                  dest,
                                  get_nir_src(instr->src[0]), /* Address */
                                  fs_reg(), /* No source data */
                                  brw_imm_ud(instr->num_components));
         inst->size_written = instr->num_components *
                              inst->dst.component_size(inst->exec_size);
      } else {
         /* Sub-dword alignment: read a single component via the
          * byte-scattered message into a UD temporary, then convert to the
          * destination's type.
          */
         const unsigned bit_size = nir_dest_bit_size(instr->dest);
         assert(bit_size <= 32);
         assert(nir_dest_num_components(instr->dest) == 1);
         brw_reg_type data_type =
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
         fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL,
                  tmp,
                  get_nir_src(instr->src[0]), /* Address */
                  fs_reg(), /* No source data */
                  brw_imm_ud(bit_size));
         bld.MOV(retype(dest, data_type), tmp);
      }
      break;
   }

   case nir_intrinsic_store_global:
      /* A64 (64-bit address) messages are only available on gen8+. */
      assert(devinfo->gen >= 8);

      /* Memory writes mean the fragment shader can't be discarded early. */
      if (stage == MESA_SHADER_FRAGMENT)
         brw_wm_prog_data(prog_data)->has_side_effects = true;

      if (nir_intrinsic_align(instr) >= 4) {
         /* Dword-aligned: one untyped A64 write covers all components. */
         assert(nir_src_bit_size(instr->src[0]) == 32);
         bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL,
                  fs_reg(),
                  get_nir_src(instr->src[1]), /* Address */
                  get_nir_src(instr->src[0]), /* Data */
                  brw_imm_ud(instr->num_components));
      } else {
         /* Sub-dword alignment: widen the scalar data to a UD temporary and
          * use the byte-scattered write.
          */
         const unsigned bit_size = nir_src_bit_size(instr->src[0]);
         assert(bit_size <= 32);
         assert(nir_src_num_components(instr->src[0]) == 1);
         brw_reg_type data_type =
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);
         fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type));
         bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL,
                  fs_reg(),
                  get_nir_src(instr->src[1]), /* Address */
                  tmp, /* Data */
                  brw_imm_ud(nir_src_bit_size(instr->src[0])));
      }
      break;
4410b8e80941Smrg
   /* Global (A64) atomics: map each NIR intrinsic onto the corresponding
    * BRW atomic opcode.  For atomic_add, get_op_for_atomic_add inspects the
    * instruction (the argument is presumably the index of the data source —
    * compare the SSBO case below, which passes 2) to pick between plain add
    * and the inc/dec forms.
    */
   case nir_intrinsic_global_atomic_add:
      nir_emit_global_atomic(bld, get_op_for_atomic_add(instr, 1), instr);
      break;
   case nir_intrinsic_global_atomic_imin:
      nir_emit_global_atomic(bld, BRW_AOP_IMIN, instr);
      break;
   case nir_intrinsic_global_atomic_umin:
      nir_emit_global_atomic(bld, BRW_AOP_UMIN, instr);
      break;
   case nir_intrinsic_global_atomic_imax:
      nir_emit_global_atomic(bld, BRW_AOP_IMAX, instr);
      break;
   case nir_intrinsic_global_atomic_umax:
      nir_emit_global_atomic(bld, BRW_AOP_UMAX, instr);
      break;
   case nir_intrinsic_global_atomic_and:
      nir_emit_global_atomic(bld, BRW_AOP_AND, instr);
      break;
   case nir_intrinsic_global_atomic_or:
      nir_emit_global_atomic(bld, BRW_AOP_OR, instr);
      break;
   case nir_intrinsic_global_atomic_xor:
      nir_emit_global_atomic(bld, BRW_AOP_XOR, instr);
      break;
   case nir_intrinsic_global_atomic_exchange:
      nir_emit_global_atomic(bld, BRW_AOP_MOV, instr);
      break;
   case nir_intrinsic_global_atomic_comp_swap:
      nir_emit_global_atomic(bld, BRW_AOP_CMPWR, instr);
      break;
   /* Float atomics use a separate emitter for the float opcodes. */
   case nir_intrinsic_global_atomic_fmin:
      nir_emit_global_atomic_float(bld, BRW_AOP_FMIN, instr);
      break;
   case nir_intrinsic_global_atomic_fmax:
      nir_emit_global_atomic_float(bld, BRW_AOP_FMAX, instr);
      break;
   case nir_intrinsic_global_atomic_fcomp_swap:
      nir_emit_global_atomic_float(bld, BRW_AOP_FCMPWR, instr);
      break;
4450b8e80941Smrg
   case nir_intrinsic_load_ssbo: {
      /* Surface-based (binding table) SSBO access requires gen7+. */
      assert(devinfo->gen >= 7);

      const unsigned bit_size = nir_dest_bit_size(instr->dest);
      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
         get_nir_ssbo_intrinsic_index(bld, instr);
      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);

      /* Make dest unsigned because that's what the temporary will be */
      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);

      /* Read the vector */
      if (nir_intrinsic_align(instr) >= 4) {
         /* Dword-aligned: a single untyped surface read for all components. */
         assert(nir_dest_bit_size(instr->dest) == 32);
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
         fs_inst *inst =
            bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                     dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
         inst->size_written = instr->num_components * dispatch_width * 4;
      } else {
         /* Sub-dword alignment: byte-scattered read of one component into a
          * UD temporary, then MOV-convert into the destination.
          */
         assert(nir_dest_bit_size(instr->dest) <= 32);
         assert(nir_dest_num_components(instr->dest) == 1);
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);

         fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
                  read_result, srcs, SURFACE_LOGICAL_NUM_SRCS);
         bld.MOV(dest, read_result);
      }
      break;
   }

   case nir_intrinsic_store_ssbo: {
      /* Surface-based (binding table) SSBO access requires gen7+. */
      assert(devinfo->gen >= 7);

      /* Memory writes mean the fragment shader can't be discarded early. */
      if (stage == MESA_SHADER_FRAGMENT)
         brw_wm_prog_data(prog_data)->has_side_effects = true;

      const unsigned bit_size = nir_src_bit_size(instr->src[0]);
      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
      srcs[SURFACE_LOGICAL_SRC_SURFACE] =
         get_nir_ssbo_intrinsic_index(bld, instr);
      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]);
      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);

      fs_reg data = get_nir_src(instr->src[0]);
      data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);

      /* Partial writemasks are expected to have been lowered away before we
       * get here; we only handle full-vector stores.
       */
      assert(nir_intrinsic_write_mask(instr) ==
             (1u << instr->num_components) - 1);
      if (nir_intrinsic_align(instr) >= 4) {
         /* Dword-aligned: one untyped surface write for the whole vector. */
         assert(nir_src_bit_size(instr->src[0]) == 32);
         assert(nir_src_num_components(instr->src[0]) <= 4);
         srcs[SURFACE_LOGICAL_SRC_DATA] = data;
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components);
         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
      } else {
         /* Sub-dword alignment: widen the scalar to UD and byte-scatter it. */
         assert(nir_src_bit_size(instr->src[0]) <= 32)
;
         assert(nir_src_num_components(instr->src[0]) == 1);
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);

         srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);

         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
      }
      break;
   }
4523b8e80941Smrg
   case nir_intrinsic_store_output: {
      fs_reg src = get_nir_src(instr->src[0]);

      /* src[1] is a constant vec4 offset from the output's base location. */
      unsigned store_offset = nir_src_as_uint(instr->src[1]);
      unsigned num_components = instr->num_components;
      unsigned first_component = nir_intrinsic_component(instr);
      if (nir_src_bit_size(instr->src[0]) == 64) {
         /* 64-bit data is stored as pairs of 32-bit components. */
         src = shuffle_for_32bit_write(bld, src, 0, num_components);
         num_components *= 2;
      }

      /* outputs[] is indexed by the output's base (const_index[0]); the
       * vec4 offset is scaled to component granularity (x4).
       */
      fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
                                      4 * store_offset), src.type);
      for (unsigned j = 0; j < num_components; j++) {
         bld.MOV(offset(new_dest, bld, j + first_component),
                 offset(src, bld, j));
      }
      break;
   }
4543b8e80941Smrg
   /* SSBO atomics: map each NIR intrinsic onto the corresponding BRW atomic
    * opcode.  For atomic_add, get_op_for_atomic_add inspects the instruction
    * (2 is presumably the index of the data source for SSBO atomics —
    * compare the global case above, which passes 1) to pick between plain
    * add and the inc/dec forms.
    */
   case nir_intrinsic_ssbo_atomic_add:
      nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr);
      break;
   case nir_intrinsic_ssbo_atomic_imin:
      nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
      break;
   case nir_intrinsic_ssbo_atomic_umin:
      nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
      break;
   case nir_intrinsic_ssbo_atomic_imax:
      nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
      break;
   case nir_intrinsic_ssbo_atomic_umax:
      nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
      break;
   case nir_intrinsic_ssbo_atomic_and:
      nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
      break;
   case nir_intrinsic_ssbo_atomic_or:
      nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
      break;
   case nir_intrinsic_ssbo_atomic_xor:
      nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
      break;
   case nir_intrinsic_ssbo_atomic_exchange:
      nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
      break;
   case nir_intrinsic_ssbo_atomic_comp_swap:
      nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
      break;
   /* Float atomics use a separate emitter for the float opcodes. */
   case nir_intrinsic_ssbo_atomic_fmin:
      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr);
      break;
   case nir_intrinsic_ssbo_atomic_fmax:
      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr);
      break;
   case nir_intrinsic_ssbo_atomic_fcomp_swap:
      nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr);
      break;
4583b8e80941Smrg
   case nir_intrinsic_get_buffer_size: {
      assert(nir_src_num_components(instr->src[0]) == 1);
      /* Non-constant SSBO indices fall back to slot 0. */
      unsigned ssbo_index = nir_src_is_const(instr->src[0]) ?
                            nir_src_as_uint(instr->src[0]) : 0;

      /* A resinfo's sampler message is used to get the buffer size.  The
       * SIMD8's writeback message consists of four registers and SIMD16's
       * writeback message consists of 8 destination registers (two per each
       * component).  Because we are only interested on the first channel of
       * the first returned component, where resinfo returns the buffer size
       * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
       * the dispatch width.
       */
      const fs_builder ubld = bld.exec_all().group(8, 0);
      fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);

      /* Set LOD = 0 */
      ubld.MOV(src_payload, brw_imm_d(0));

      const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
      fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
                                src_payload, brw_imm_ud(index));
      inst->header_size = 0;
      inst->mlen = 1;
      inst->size_written = 4 * REG_SIZE;

      /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
       *
       * "Out-of-bounds checking is always performed at a DWord granularity. If
       * any part of the DWord is out-of-bounds then the whole DWord is
       * considered out-of-bounds."
       *
       * This implies that types with size smaller than 4-bytes need to be
       * padded if they don't complete the last dword of the buffer. But as we
       * need to maintain the original size we need to reverse the padding
       * calculation to return the correct size to know the number of elements
       * of an unsized array. As we stored in the last two bits of the surface
       * size the needed padding for the buffer, we calculate here the
       * original buffer_size reversing the surface_size calculation:
       *
       * surface_size = isl_align(buffer_size, 4) +
       *                (isl_align(buffer_size) - buffer_size)
       *
       * buffer_size = surface_size & ~3 - surface_size & 3
       */

      fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);

      /* buffer_size = (surface_size & ~3) - (surface_size & 3) */
      ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
      ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
      ubld.ADD(buffer_size, size_aligned4, negate(size_padding));

      /* Broadcast channel 0 of the SIMD8 result to all channels of dest. */
      bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
      break;
   }
4642b8e80941Smrg
   case nir_intrinsic_load_subgroup_invocation:
      /* Channel index was precomputed into the system-values array. */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
              nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
      break;

   /* The subgroup mask intrinsics are expected to have been lowered before
    * reaching the backend, so hitting them here is a bug.
    */
   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_lt_mask:
      unreachable("not reached")
;
4654b8e80941Smrg
   case nir_intrinsic_vote_any: {
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag
       * with the identity value for the logical operation (0 for ANY/OR).
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                         brw_imm_ud(0));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
      }
      /* Per-channel truth of the source goes into the flag register. */
      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half.  Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
                                           BRW_PREDICATE_ALIGN1_ANY32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      /* Broadcast the scalar boolean (0 / -1) to every channel. */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }
   case nir_intrinsic_vote_all: {
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag
       * with the identity value for the logical operation (all-ones for
       * ALL/AND).
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                         brw_imm_ud(0xffffffff));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
      }
      /* Per-channel truth of the source goes into the flag register. */
      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half.  Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
                                           BRW_PREDICATE_ALIGN1_ALL32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      /* Broadcast the scalar boolean (0 / -1) to every channel. */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }
   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq: {
      fs_reg value = get_nir_src(instr->src[0]);
      if (instr->intrinsic == nir_intrinsic_vote_feq) {
         /* Compare as floats so NaN/-0.0 semantics apply; 8-bit has no
          * float type, so fall back to byte compares for it.
          */
         const unsigned bit_size = nir_src_bit_size(instr->src[0]);
         value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
      }

      /* All channels equal <=> every channel equals the first live one. */
      fs_reg uniformized = bld.emit_uniformize(value);
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag
       * with the identity value for the logical operation (all-ones for
       * ALL/AND).
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                         brw_imm_ud(0xffffffff));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
      }
      bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32.  In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half.  Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
                                           BRW_PREDICATE_ALIGN1_ALL32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      /* Broadcast the scalar boolean (0 / -1) to every channel. */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }
4760b8e80941Smrg
   case nir_intrinsic_ballot: {
      const fs_reg value = retype(get_nir_src(instr->src[0]),
                                  BRW_REGISTER_TYPE_UD);
      struct brw_reg flag = brw_flag_reg(0, 0);
      /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
       * as f0.0.  This is a problem for fragment programs as we currently use
       * f0.1 for discards.  Fortunately, we don't support SIMD32 fragment
       * programs yet so this isn't a problem.  When we do, something will
       * have to change.
       */
      if (dispatch_width == 32)
         flag.type = BRW_REGISTER_TYPE_UD;

      /* Clear the flag so dead channels read as 0 in the ballot, then set
       * one flag bit per channel whose value is non-zero.
       */
      bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u));
      bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ);

      if (instr->dest.ssa.bit_size > 32) {
         dest.type = BRW_REGISTER_TYPE_UQ;
      } else {
         dest.type = BRW_REGISTER_TYPE_UD;
      }
      /* The flag register itself is the ballot bitmask. */
      bld.MOV(dest, flag);
      break;
   }
4785b8e80941Smrg
   case nir_intrinsic_read_invocation: {
      const fs_reg value = get_nir_src(instr->src[0]);
      const fs_reg invocation = get_nir_src(instr->src[1]);
      fs_reg tmp = bld.vgrf(value.type);

      /* Uniformize the (possibly divergent) channel index, broadcast that
       * channel's value into tmp, then splat channel 0 to all channels.
       */
      bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value,
                          bld.emit_uniformize(invocation));

      bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0)));
      break;
   }

   case nir_intrinsic_read_first_invocation: {
      /* emit_uniformize yields the value of the first live channel. */
      const fs_reg value = get_nir_src(instr->src[0]);
      bld.MOV(retype(dest, value.type), bld.emit_uniformize(value));
      break;
   }

   case nir_intrinsic_shuffle: {
      /* Per-channel indexed shuffle; handled entirely by the opcode. */
      const fs_reg value = get_nir_src(instr->src[0]);
      const fs_reg index = get_nir_src(instr->src[1]);

      bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index);
      break;
   }

   case nir_intrinsic_first_invocation: {
      /* Find the lowest live channel index and splat it to all channels. */
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp);
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
              fs_reg(component(tmp, 0)));
      break;
   }

   case nir_intrinsic_quad_broadcast: {
      const fs_reg value = get_nir_src(instr->src[0]);
      const unsigned index = nir_src_as_uint(instr->src[1]);

      /* Broadcast lane `index` within each cluster of 4 channels. */
      bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type),
               value, brw_imm_ud(index), brw_imm_ud(4));
      break;
   }
4828b8e80941Smrg
4829b8e80941Smrg   case nir_intrinsic_quad_swap_horizontal: {
4830b8e80941Smrg      const fs_reg value = get_nir_src(instr->src[0]);
4831b8e80941Smrg      const fs_reg tmp = bld.vgrf(value.type);
4832b8e80941Smrg      if (devinfo->gen <= 7) {
4833b8e80941Smrg         /* The hardware doesn't seem to support these crazy regions with
4834b8e80941Smrg          * compressed instructions on gen7 and earlier so we fall back to
4835b8e80941Smrg          * using quad swizzles.  Fortunately, we don't support 64-bit
4836b8e80941Smrg          * anything in Vulkan on gen7.
4837b8e80941Smrg          */
4838b8e80941Smrg         assert(nir_src_bit_size(instr->src[0]) == 32);
4839b8e80941Smrg         const fs_builder ubld = bld.exec_all();
4840b8e80941Smrg         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
4841b8e80941Smrg                   brw_imm_ud(BRW_SWIZZLE4(1,0,3,2)));
4842b8e80941Smrg         bld.MOV(retype(dest, value.type), tmp);
4843b8e80941Smrg      } else {
4844b8e80941Smrg         const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);
4845b8e80941Smrg
4846b8e80941Smrg         const fs_reg src_left = horiz_stride(value, 2);
4847b8e80941Smrg         const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
4848b8e80941Smrg         const fs_reg tmp_left = horiz_stride(tmp, 2);
4849b8e80941Smrg         const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);
4850b8e80941Smrg
4851b8e80941Smrg         ubld.MOV(tmp_left, src_right);
4852b8e80941Smrg         ubld.MOV(tmp_right, src_left);
4853b8e80941Smrg
4854b8e80941Smrg      }
4855b8e80941Smrg      bld.MOV(retype(dest, value.type), tmp);
4856b8e80941Smrg      break;
4857b8e80941Smrg   }
4858b8e80941Smrg
   case nir_intrinsic_quad_swap_vertical: {
      /* Swap the two rows of each quad: (0,1,2,3) -> (2,3,0,1). */
      const fs_reg value = get_nir_src(instr->src[0]);
      if (nir_src_bit_size(instr->src[0]) == 32) {
         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
         const fs_reg tmp = bld.vgrf(value.type);
         const fs_builder ubld = bld.exec_all();
         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
                   brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
         bld.MOV(retype(dest, value.type), tmp);
      } else {
         /* For larger data types, we have to either emit dispatch_width many
          * MOVs or else fall back to doing indirects.
          */
         /* channel ^ 2 flips bit 1: 0<->2, 1<->3 within each quad. */
         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
                      brw_imm_w(0x2));
         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
      }
      break;
   }

   case nir_intrinsic_quad_swap_diagonal: {
      /* Swap diagonally-opposite channels of each quad: (0,1,2,3)->(3,2,1,0). */
      const fs_reg value = get_nir_src(instr->src[0]);
      if (nir_src_bit_size(instr->src[0]) == 32) {
         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
         const fs_reg tmp = bld.vgrf(value.type);
         const fs_builder ubld = bld.exec_all();
         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
                   brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
         bld.MOV(retype(dest, value.type), tmp);
      } else {
         /* For larger data types, we have to either emit dispatch_width many
          * MOVs or else fall back to doing indirects.
          */
         /* channel ^ 3 flips both quad bits: 0<->3, 1<->2. */
         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
                      brw_imm_w(0x3));
         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
      }
      break;
   }
4900b8e80941Smrg
4901b8e80941Smrg   case nir_intrinsic_reduce: {
4902b8e80941Smrg      fs_reg src = get_nir_src(instr->src[0]);
4903b8e80941Smrg      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4904b8e80941Smrg      unsigned cluster_size = nir_intrinsic_cluster_size(instr);
4905b8e80941Smrg      if (cluster_size == 0 || cluster_size > dispatch_width)
4906b8e80941Smrg         cluster_size = dispatch_width;
4907b8e80941Smrg
4908b8e80941Smrg      /* Figure out the source type */
4909b8e80941Smrg      src.type = brw_type_for_nir_type(devinfo,
4910b8e80941Smrg         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4911b8e80941Smrg                        nir_src_bit_size(instr->src[0])));
4912b8e80941Smrg
4913b8e80941Smrg      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4914b8e80941Smrg      opcode brw_op = brw_op_for_nir_reduction_op(redop);
4915b8e80941Smrg      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4916b8e80941Smrg
4917b8e80941Smrg      /* Set up a register for all of our scratching around and initialize it
4918b8e80941Smrg       * to reduction operation's identity value.
4919b8e80941Smrg       */
4920b8e80941Smrg      fs_reg scan = bld.vgrf(src.type);
4921b8e80941Smrg      bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4922b8e80941Smrg
4923b8e80941Smrg      bld.emit_scan(brw_op, scan, cluster_size, cond_mod);
4924b8e80941Smrg
4925b8e80941Smrg      dest.type = src.type;
4926b8e80941Smrg      if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
4927b8e80941Smrg         /* In this case, CLUSTER_BROADCAST instruction isn't needed because
4928b8e80941Smrg          * the distance between clusters is at least 2 GRFs.  In this case,
4929b8e80941Smrg          * we don't need the weird striding of the CLUSTER_BROADCAST
4930b8e80941Smrg          * instruction and can just do regular MOVs.
4931b8e80941Smrg          */
4932b8e80941Smrg         assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
4933b8e80941Smrg         const unsigned groups =
4934b8e80941Smrg            (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
4935b8e80941Smrg         const unsigned group_size = dispatch_width / groups;
4936b8e80941Smrg         for (unsigned i = 0; i < groups; i++) {
4937b8e80941Smrg            const unsigned cluster = (i * group_size) / cluster_size;
4938b8e80941Smrg            const unsigned comp = cluster * cluster_size + (cluster_size - 1);
4939b8e80941Smrg            bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
4940b8e80941Smrg                                         component(scan, comp));
4941b8e80941Smrg         }
4942b8e80941Smrg      } else {
4943b8e80941Smrg         bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
4944b8e80941Smrg                  brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
4945b8e80941Smrg      }
4946b8e80941Smrg      break;
4947b8e80941Smrg   }
4948b8e80941Smrg
4949b8e80941Smrg   case nir_intrinsic_inclusive_scan:
4950b8e80941Smrg   case nir_intrinsic_exclusive_scan: {
4951b8e80941Smrg      fs_reg src = get_nir_src(instr->src[0]);
4952b8e80941Smrg      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
4953b8e80941Smrg
4954b8e80941Smrg      /* Figure out the source type */
4955b8e80941Smrg      src.type = brw_type_for_nir_type(devinfo,
4956b8e80941Smrg         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
4957b8e80941Smrg                        nir_src_bit_size(instr->src[0])));
4958b8e80941Smrg
4959b8e80941Smrg      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
4960b8e80941Smrg      opcode brw_op = brw_op_for_nir_reduction_op(redop);
4961b8e80941Smrg      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);
4962b8e80941Smrg
4963b8e80941Smrg      /* Set up a register for all of our scratching around and initialize it
4964b8e80941Smrg       * to reduction operation's identity value.
4965b8e80941Smrg       */
4966b8e80941Smrg      fs_reg scan = bld.vgrf(src.type);
4967b8e80941Smrg      const fs_builder allbld = bld.exec_all();
4968b8e80941Smrg      allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);
4969b8e80941Smrg
4970b8e80941Smrg      if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
4971b8e80941Smrg         /* Exclusive scan is a bit harder because we have to do an annoying
4972b8e80941Smrg          * shift of the contents before we can begin.  To make things worse,
4973b8e80941Smrg          * we can't do this with a normal stride; we have to use indirects.
4974b8e80941Smrg          */
4975b8e80941Smrg         fs_reg shifted = bld.vgrf(src.type);
4976b8e80941Smrg         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
4977b8e80941Smrg         allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
4978b8e80941Smrg                         brw_imm_w(-1));
4979b8e80941Smrg         allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
4980b8e80941Smrg         allbld.group(1, 0).MOV(component(shifted, 0), identity);
4981b8e80941Smrg         scan = shifted;
4982b8e80941Smrg      }
4983b8e80941Smrg
4984b8e80941Smrg      bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);
4985b8e80941Smrg
4986b8e80941Smrg      bld.MOV(retype(dest, src.type), scan);
4987b8e80941Smrg      break;
4988b8e80941Smrg   }
4989b8e80941Smrg
4990b8e80941Smrg   case nir_intrinsic_begin_invocation_interlock: {
4991b8e80941Smrg      const fs_builder ubld = bld.group(8, 0);
4992b8e80941Smrg      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4993b8e80941Smrg
4994b8e80941Smrg      ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0))
4995b8e80941Smrg         ->size_written = 2 * REG_SIZE;
4996b8e80941Smrg      break;
4997b8e80941Smrg   }
4998b8e80941Smrg
4999b8e80941Smrg   case nir_intrinsic_end_invocation_interlock: {
5000b8e80941Smrg      /* For endInvocationInterlock(), we need to insert a memory fence which
5001b8e80941Smrg       * stalls in the shader until the memory transactions prior to that
5002b8e80941Smrg       * fence are complete.  This ensures that the shader does not end before
5003b8e80941Smrg       * any writes from its critical section have landed.  Otherwise, you can
5004b8e80941Smrg       * end up with a case where the next invocation on that pixel properly
5005b8e80941Smrg       * stalls for previous FS invocation on its pixel to complete but
5006b8e80941Smrg       * doesn't actually wait for the dataport memory transactions from that
5007b8e80941Smrg       * thread to land before submitting its own.
5008b8e80941Smrg       */
5009b8e80941Smrg      const fs_builder ubld = bld.group(8, 0);
5010b8e80941Smrg      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
5011b8e80941Smrg      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
5012b8e80941Smrg                brw_vec8_grf(0, 0), brw_imm_ud(1))
5013b8e80941Smrg         ->size_written = 2 * REG_SIZE;
5014b8e80941Smrg      break;
5015b8e80941Smrg   }
5016b8e80941Smrg
5017b8e80941Smrg   default:
5018b8e80941Smrg      unreachable("unknown intrinsic");
5019b8e80941Smrg   }
5020b8e80941Smrg}
5021b8e80941Smrg
5022b8e80941Smrgvoid
5023b8e80941Smrgfs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
5024b8e80941Smrg                                 int op, nir_intrinsic_instr *instr)
5025b8e80941Smrg{
5026b8e80941Smrg   if (stage == MESA_SHADER_FRAGMENT)
5027b8e80941Smrg      brw_wm_prog_data(prog_data)->has_side_effects = true;
5028b8e80941Smrg
5029b8e80941Smrg   /* The BTI untyped atomic messages only support 32-bit atomics.  If you
5030b8e80941Smrg    * just look at the big table of messages in the Vol 7 of the SKL PRM, they
5031b8e80941Smrg    * appear to exist.  However, if you look at Vol 2a, there are no message
5032b8e80941Smrg    * descriptors provided for Qword atomic ops except for A64 messages.
5033b8e80941Smrg    */
5034b8e80941Smrg   assert(nir_dest_bit_size(instr->dest) == 32);
5035b8e80941Smrg
5036b8e80941Smrg   fs_reg dest;
5037b8e80941Smrg   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5038b8e80941Smrg      dest = get_nir_dest(instr->dest);
5039b8e80941Smrg
5040b8e80941Smrg   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5041b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
5042b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
5043b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5044b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5045b8e80941Smrg
5046b8e80941Smrg   fs_reg data;
5047b8e80941Smrg   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5048b8e80941Smrg      data = get_nir_src(instr->src[2]);
5049b8e80941Smrg
5050b8e80941Smrg   if (op == BRW_AOP_CMPWR) {
5051b8e80941Smrg      fs_reg tmp = bld.vgrf(data.type, 2);
5052b8e80941Smrg      fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
5053b8e80941Smrg      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5054b8e80941Smrg      data = tmp;
5055b8e80941Smrg   }
5056b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5057b8e80941Smrg
5058b8e80941Smrg   /* Emit the actual atomic operation */
5059b8e80941Smrg
5060b8e80941Smrg   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
5061b8e80941Smrg            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5062b8e80941Smrg}
5063b8e80941Smrg
5064b8e80941Smrgvoid
5065b8e80941Smrgfs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld,
5066b8e80941Smrg                                       int op, nir_intrinsic_instr *instr)
5067b8e80941Smrg{
5068b8e80941Smrg   if (stage == MESA_SHADER_FRAGMENT)
5069b8e80941Smrg      brw_wm_prog_data(prog_data)->has_side_effects = true;
5070b8e80941Smrg
5071b8e80941Smrg   fs_reg dest;
5072b8e80941Smrg   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5073b8e80941Smrg      dest = get_nir_dest(instr->dest);
5074b8e80941Smrg
5075b8e80941Smrg   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5076b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr);
5077b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]);
5078b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5079b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5080b8e80941Smrg
5081b8e80941Smrg   fs_reg data = get_nir_src(instr->src[2]);
5082b8e80941Smrg   if (op == BRW_AOP_FCMPWR) {
5083b8e80941Smrg      fs_reg tmp = bld.vgrf(data.type, 2);
5084b8e80941Smrg      fs_reg sources[2] = { data, get_nir_src(instr->src[3]) };
5085b8e80941Smrg      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5086b8e80941Smrg      data = tmp;
5087b8e80941Smrg   }
5088b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5089b8e80941Smrg
5090b8e80941Smrg   /* Emit the actual atomic operation */
5091b8e80941Smrg
5092b8e80941Smrg   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
5093b8e80941Smrg            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5094b8e80941Smrg}
5095b8e80941Smrg
5096b8e80941Smrgvoid
5097b8e80941Smrgfs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
5098b8e80941Smrg                                   int op, nir_intrinsic_instr *instr)
5099b8e80941Smrg{
5100b8e80941Smrg   fs_reg dest;
5101b8e80941Smrg   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5102b8e80941Smrg      dest = get_nir_dest(instr->dest);
5103b8e80941Smrg
5104b8e80941Smrg   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5105b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
5106b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5107b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5108b8e80941Smrg
5109b8e80941Smrg   fs_reg data;
5110b8e80941Smrg   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5111b8e80941Smrg      data = get_nir_src(instr->src[1]);
5112b8e80941Smrg   if (op == BRW_AOP_CMPWR) {
5113b8e80941Smrg      fs_reg tmp = bld.vgrf(data.type, 2);
5114b8e80941Smrg      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5115b8e80941Smrg      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5116b8e80941Smrg      data = tmp;
5117b8e80941Smrg   }
5118b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5119b8e80941Smrg
5120b8e80941Smrg   /* Get the offset */
5121b8e80941Smrg   if (nir_src_is_const(instr->src[0])) {
5122b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5123b8e80941Smrg         brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
5124b8e80941Smrg   } else {
5125b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
5126b8e80941Smrg      bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5127b8e80941Smrg	      retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
5128b8e80941Smrg	      brw_imm_ud(instr->const_index[0]));
5129b8e80941Smrg   }
5130b8e80941Smrg
5131b8e80941Smrg   /* Emit the actual atomic operation operation */
5132b8e80941Smrg
5133b8e80941Smrg   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
5134b8e80941Smrg            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5135b8e80941Smrg}
5136b8e80941Smrg
5137b8e80941Smrgvoid
5138b8e80941Smrgfs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
5139b8e80941Smrg                                         int op, nir_intrinsic_instr *instr)
5140b8e80941Smrg{
5141b8e80941Smrg   fs_reg dest;
5142b8e80941Smrg   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5143b8e80941Smrg      dest = get_nir_dest(instr->dest);
5144b8e80941Smrg
5145b8e80941Smrg   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
5146b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
5147b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
5148b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
5149b8e80941Smrg
5150b8e80941Smrg   fs_reg data = get_nir_src(instr->src[1]);
5151b8e80941Smrg   if (op == BRW_AOP_FCMPWR) {
5152b8e80941Smrg      fs_reg tmp = bld.vgrf(data.type, 2);
5153b8e80941Smrg      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5154b8e80941Smrg      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5155b8e80941Smrg      data = tmp;
5156b8e80941Smrg   }
5157b8e80941Smrg   srcs[SURFACE_LOGICAL_SRC_DATA] = data;
5158b8e80941Smrg
5159b8e80941Smrg   /* Get the offset */
5160b8e80941Smrg   if (nir_src_is_const(instr->src[0])) {
5161b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
5162b8e80941Smrg         brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
5163b8e80941Smrg   } else {
5164b8e80941Smrg      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
5165b8e80941Smrg      bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
5166b8e80941Smrg	      retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
5167b8e80941Smrg	      brw_imm_ud(instr->const_index[0]));
5168b8e80941Smrg   }
5169b8e80941Smrg
5170b8e80941Smrg   /* Emit the actual atomic operation operation */
5171b8e80941Smrg
5172b8e80941Smrg   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
5173b8e80941Smrg            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
5174b8e80941Smrg}
5175b8e80941Smrg
5176b8e80941Smrgvoid
5177b8e80941Smrgfs_visitor::nir_emit_global_atomic(const fs_builder &bld,
5178b8e80941Smrg                                   int op, nir_intrinsic_instr *instr)
5179b8e80941Smrg{
5180b8e80941Smrg   if (stage == MESA_SHADER_FRAGMENT)
5181b8e80941Smrg      brw_wm_prog_data(prog_data)->has_side_effects = true;
5182b8e80941Smrg
5183b8e80941Smrg   fs_reg dest;
5184b8e80941Smrg   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
5185b8e80941Smrg      dest = get_nir_dest(instr->dest);
5186b8e80941Smrg
5187b8e80941Smrg   fs_reg addr = get_nir_src(instr->src[0]);
5188b8e80941Smrg
5189b8e80941Smrg   fs_reg data;
5190b8e80941Smrg   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
5191b8e80941Smrg      data = get_nir_src(instr->src[1]);
5192b8e80941Smrg
5193b8e80941Smrg   if (op == BRW_AOP_CMPWR) {
5194b8e80941Smrg      fs_reg tmp = bld.vgrf(data.type, 2);
5195b8e80941Smrg      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5196b8e80941Smrg      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5197b8e80941Smrg      data = tmp;
5198b8e80941Smrg   }
5199b8e80941Smrg
5200b8e80941Smrg   if (nir_dest_bit_size(instr->dest) == 64) {
5201b8e80941Smrg      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL,
5202b8e80941Smrg               dest, addr, data, brw_imm_ud(op));
5203b8e80941Smrg   } else {
5204b8e80941Smrg      assert(nir_dest_bit_size(instr->dest) == 32);
5205b8e80941Smrg      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
5206b8e80941Smrg               dest, addr, data, brw_imm_ud(op));
5207b8e80941Smrg   }
5208b8e80941Smrg}
5209b8e80941Smrg
5210b8e80941Smrgvoid
5211b8e80941Smrgfs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
5212b8e80941Smrg                                         int op, nir_intrinsic_instr *instr)
5213b8e80941Smrg{
5214b8e80941Smrg   if (stage == MESA_SHADER_FRAGMENT)
5215b8e80941Smrg      brw_wm_prog_data(prog_data)->has_side_effects = true;
5216b8e80941Smrg
5217b8e80941Smrg   assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
5218b8e80941Smrg   fs_reg dest = get_nir_dest(instr->dest);
5219b8e80941Smrg
5220b8e80941Smrg   fs_reg addr = get_nir_src(instr->src[0]);
5221b8e80941Smrg
5222b8e80941Smrg   assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC);
5223b8e80941Smrg   fs_reg data = get_nir_src(instr->src[1]);
5224b8e80941Smrg
5225b8e80941Smrg   if (op == BRW_AOP_FCMPWR) {
5226b8e80941Smrg      fs_reg tmp = bld.vgrf(data.type, 2);
5227b8e80941Smrg      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
5228b8e80941Smrg      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
5229b8e80941Smrg      data = tmp;
5230b8e80941Smrg   }
5231b8e80941Smrg
5232b8e80941Smrg   bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
5233b8e80941Smrg            dest, addr, data, brw_imm_ud(op));
5234b8e80941Smrg}
5235b8e80941Smrg
/**
 * Translate a NIR texture instruction into a logical BRW sampler message.
 *
 * Collects the instruction's sources into the TEX_LOGICAL_SRC_* array,
 * selects the logical opcode for the texture operation, emits it, and
 * finally copies the returned payload into the NIR destination.
 */
void
fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
{
   unsigned texture = instr->texture_index;
   unsigned sampler = instr->sampler_index;

   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];

   srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
   srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);

   int lod_components = 0;

   /* The hardware requires a LOD for buffer textures */
   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
      srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);

   /* Bits ORed into the message header offset: constant texel offsets and
    * (for gather4) the channel select.
    */
   uint32_t header_bits = 0;
   for (unsigned i = 0; i < instr->num_srcs; i++) {
      fs_reg src = get_nir_src(instr->src[i].src);
      switch (instr->src[i].src_type) {
      case nir_tex_src_bias:
         srcs[TEX_LOGICAL_SRC_LOD] =
            retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_comparator:
         srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_coord:
         /* Texel-fetch style ops take integer coordinates; everything else
          * takes floats.
          */
         switch (instr->op) {
         case nir_texop_txf:
         case nir_texop_txf_ms:
         case nir_texop_txf_ms_mcs:
         case nir_texop_samples_identical:
            srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
            break;
         default:
            srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
            break;
         }
         break;
      case nir_tex_src_ddx:
         srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
         /* ddx determines the gradient component count; ddy is assumed to
          * match.
          */
         lod_components = nir_tex_instr_src_size(instr, i);
         break;
      case nir_tex_src_ddy:
         srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_lod:
         /* The LOD register type depends on the operation: unsigned for
          * txs, signed for txf, float otherwise.
          */
         switch (instr->op) {
         case nir_texop_txs:
            srcs[TEX_LOGICAL_SRC_LOD] =
               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
            break;
         case nir_texop_txf:
            srcs[TEX_LOGICAL_SRC_LOD] =
               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
            break;
         default:
            srcs[TEX_LOGICAL_SRC_LOD] =
               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
            break;
         }
         break;
      case nir_tex_src_min_lod:
         srcs[TEX_LOGICAL_SRC_MIN_LOD] =
            retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_ms_index:
         srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
         break;

      case nir_tex_src_offset: {
         /* Constant offsets go in the message header; non-constant ones
          * (only possible for tg4) go in a separate source.
          */
         uint32_t offset_bits = 0;
         if (brw_texture_offset(instr, i, &offset_bits)) {
            header_bits |= offset_bits;
         } else {
            srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
               retype(src, BRW_REGISTER_TYPE_D);
         }
         break;
      }

      case nir_tex_src_projector:
         unreachable("should be lowered");

      case nir_tex_src_texture_offset: {
         /* Emit code to evaluate the actual indexing expression */
         fs_reg tmp = vgrf(glsl_type::uint_type);
         bld.ADD(tmp, src, brw_imm_ud(texture));
         srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
         break;
      }

      case nir_tex_src_sampler_offset: {
         /* Emit code to evaluate the actual indexing expression */
         fs_reg tmp = vgrf(glsl_type::uint_type);
         bld.ADD(tmp, src, brw_imm_ud(sampler));
         srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
         break;
      }

      case nir_tex_src_texture_handle:
         /* A handle replaces (and is mutually exclusive with) the surface
          * index.
          */
         assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
         srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();
         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
         break;

      case nir_tex_src_sampler_handle:
         /* Likewise for the sampler index. */
         assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
         srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();
         srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
         break;

      case nir_tex_src_ms_mcs:
         assert(instr->op == nir_texop_txf_ms);
         srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
         break;

      case nir_tex_src_plane: {
         /* Redirect the surface index to the binding-table slot for the
          * requested plane of a planar (e.g. YUV) surface.
          */
         const uint32_t plane = nir_src_as_uint(instr->src[i].src);
         const uint32_t texture_index =
            instr->texture_index +
            stage_prog_data->binding_table.plane_start[plane] -
            stage_prog_data->binding_table.texture_start;

         srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
         break;
      }

      default:
         unreachable("unknown texture source");
      }
   }

   /* If no MCS source was supplied for a multisample fetch, fetch it
    * ourselves when the surface uses compressed multisample layout;
    * otherwise pass an immediate zero (no MCS).
    */
   if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
       (instr->op == nir_texop_txf_ms ||
        instr->op == nir_texop_samples_identical)) {
      if (devinfo->gen >= 7 &&
          key_tex->compressed_multisample_layout_mask & (1 << texture)) {
         srcs[TEX_LOGICAL_SRC_MCS] =
            emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
                           instr->coord_components,
                           srcs[TEX_LOGICAL_SRC_SURFACE],
                           srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
      } else {
         srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
      }
   }

   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);

   /* Map the NIR texture op to the logical sampler opcode. */
   enum opcode opcode;
   switch (instr->op) {
   case nir_texop_tex:
      opcode = SHADER_OPCODE_TEX_LOGICAL;
      break;
   case nir_texop_txb:
      opcode = FS_OPCODE_TXB_LOGICAL;
      break;
   case nir_texop_txl:
      opcode = SHADER_OPCODE_TXL_LOGICAL;
      break;
   case nir_texop_txd:
      opcode = SHADER_OPCODE_TXD_LOGICAL;
      break;
   case nir_texop_txf:
      opcode = SHADER_OPCODE_TXF_LOGICAL;
      break;
   case nir_texop_txf_ms:
      /* 16x MSAA surfaces need the wide CMS_W variant. */
      if ((key_tex->msaa_16 & (1 << sampler)))
         opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
      else
         opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
      break;
   case nir_texop_txf_ms_mcs:
      opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
      break;
   case nir_texop_query_levels:
   case nir_texop_txs:
      opcode = SHADER_OPCODE_TXS_LOGICAL;
      break;
   case nir_texop_lod:
      opcode = SHADER_OPCODE_LOD_LOGICAL;
      break;
   case nir_texop_tg4:
      if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
         opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
      else
         opcode = SHADER_OPCODE_TG4_LOGICAL;
      break;
   case nir_texop_texture_samples:
      opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
      break;
   case nir_texop_samples_identical: {
      /* samples_identical is resolved here (no sampler message): all
       * samples are identical iff the MCS value is zero.
       */
      fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);

      /* If mcs is an immediate value, it means there is no MCS.  In that case
       * just return false.
       */
      if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
         bld.MOV(dst, brw_imm_ud(0u));
      } else if ((key_tex->msaa_16 & (1 << sampler))) {
         /* 16x MSAA MCS is two dwords; OR them together before comparing. */
         fs_reg tmp = vgrf(glsl_type::uint_type);
         bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
                offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
         bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
      } else {
         bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
                 BRW_CONDITIONAL_EQ);
      }
      return;
   }
   default:
      unreachable("unknown texture opcode");
   }

   /* gather4's channel select lives in the message header. */
   if (instr->op == nir_texop_tg4) {
      if (instr->component == 1 &&
          key_tex->gather_channel_quirk_mask & (1 << texture)) {
         /* gather4 sampler is broken for green channel on RG32F --
          * we must ask for blue instead.
          */
         header_bits |= 2 << 16;
      } else {
         header_bits |= instr->component << 16;
      }
   }

   fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
   fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
   inst->offset = header_bits;

   /* On Gen9+ the sampler can skip trailing unread response components, so
    * shrink size_written to the components actually read when known.
    */
   const unsigned dest_size = nir_tex_instr_dest_size(instr);
   if (devinfo->gen >= 9 &&
       instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
      unsigned write_mask = instr->dest.is_ssa ?
                            nir_ssa_def_components_read(&instr->dest.ssa):
                            (1 << dest_size) - 1;
      assert(write_mask != 0); /* dead code should have been eliminated */
      inst->size_written = util_last_bit(write_mask) *
                           inst->dst.component_size(inst->exec_size);
   } else {
      inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
   }

   if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
      inst->shadow_compare = true;

   if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
      emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);

   /* Gather the per-component destination registers, with op-specific
    * fixups, then copy them to the NIR destination.
    */
   fs_reg nir_dest[4];
   for (unsigned i = 0; i < dest_size; i++)
      nir_dest[i] = offset(dst, bld, i);

   if (instr->op == nir_texop_query_levels) {
      /* # levels is in .w */
      nir_dest[0] = offset(dst, bld, 3);
   } else if (instr->op == nir_texop_txs &&
              dest_size >= 3 && devinfo->gen < 7) {
      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
      fs_reg depth = offset(dst, bld, 2);
      nir_dest[2] = vgrf(glsl_type::int_type);
      bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
   }

   bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
}
5506b8e80941Smrg
5507b8e80941Smrgvoid
5508b8e80941Smrgfs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
5509b8e80941Smrg{
5510b8e80941Smrg   switch (instr->type) {
5511b8e80941Smrg   case nir_jump_break:
5512b8e80941Smrg      bld.emit(BRW_OPCODE_BREAK);
5513b8e80941Smrg      break;
5514b8e80941Smrg   case nir_jump_continue:
5515b8e80941Smrg      bld.emit(BRW_OPCODE_CONTINUE);
5516b8e80941Smrg      break;
5517b8e80941Smrg   case nir_jump_return:
5518b8e80941Smrg   default:
5519b8e80941Smrg      unreachable("unknown jump");
5520b8e80941Smrg   }
5521b8e80941Smrg}
5522b8e80941Smrg
5523b8e80941Smrg/*
5524b8e80941Smrg * This helper takes a source register and un/shuffles it into the destination
5525b8e80941Smrg * register.
5526b8e80941Smrg *
5527b8e80941Smrg * If source type size is smaller than destination type size the operation
5528b8e80941Smrg * needed is a component shuffle. The opposite case would be an unshuffle. If
5529b8e80941Smrg * source/destination type size is equal a shuffle is done that would be
5530b8e80941Smrg * equivalent to a simple MOV.
5531b8e80941Smrg *
5532b8e80941Smrg * For example, if source is a 16-bit type and destination is 32-bit. A 3
5533b8e80941Smrg * components .xyz 16-bit vector on SIMD8 would be.
5534b8e80941Smrg *
5535b8e80941Smrg *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
5536b8e80941Smrg *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
5537b8e80941Smrg *
5538b8e80941Smrg * This helper will return the following 2 32-bit components with the 16-bit
5539b8e80941Smrg * values shuffled:
5540b8e80941Smrg *
5541b8e80941Smrg *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
5542b8e80941Smrg *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
5543b8e80941Smrg *
5544b8e80941Smrg * For unshuffle, the example would be the opposite, a 64-bit type source
5545b8e80941Smrg * and a 32-bit destination. A 2 component .xy 64-bit vector on SIMD8
5546b8e80941Smrg * would be:
5547b8e80941Smrg *
5548b8e80941Smrg *    | x1l   x1h | x2l   x2h | x3l   x3h | x4l   x4h |
5549b8e80941Smrg *    | x5l   x5h | x6l   x6h | x7l   x7h | x8l   x8h |
5550b8e80941Smrg *    | y1l   y1h | y2l   y2h | y3l   y3h | y4l   y4h |
5551b8e80941Smrg *    | y5l   y5h | y6l   y6h | y7l   y7h | y8l   y8h |
5552b8e80941Smrg *
5553b8e80941Smrg * The returned result would be the following 4 32-bit components unshuffled:
5554b8e80941Smrg *
5555b8e80941Smrg *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
5556b8e80941Smrg *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
5557b8e80941Smrg *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
5558b8e80941Smrg *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
5559b8e80941Smrg *
5560b8e80941Smrg * - Source and destination register must not be overlapped.
5561b8e80941Smrg * - components units are measured in terms of the smaller type between
5562b8e80941Smrg *   source and destination because we are un/shuffling the smaller
5563b8e80941Smrg *   components from/into the bigger ones.
5564b8e80941Smrg * - first_component parameter allows skipping source components.
5565b8e80941Smrg */
void
shuffle_src_to_dst(const fs_builder &bld,
                   const fs_reg &dst,
                   const fs_reg &src,
                   uint32_t first_component,
                   uint32_t components)
{
   if (type_sz(src.type) == type_sz(dst.type)) {
      /* Equal type sizes: the "shuffle" degenerates into a plain
       * per-component MOV, retyped so no conversion happens.
       */
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() * components,
         offset(src, bld, first_component),
         type_sz(src.type) * bld.dispatch_width() * components));
      for (unsigned i = 0; i < components; i++) {
         bld.MOV(retype(offset(dst, bld, i), src.type),
                 offset(src, bld, i + first_component));
      }
   } else if (type_sz(src.type) < type_sz(dst.type)) {
      /* Source is shuffled into destination */
      /* size_ratio small components get packed into each dst component. */
      unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() *
         DIV_ROUND_UP(components, size_ratio),
         offset(src, bld, first_component),
         type_sz(src.type) * bld.dispatch_width() * components));

      /* Use an integer type of the source's size so the MOVs are raw bit
       * copies rather than format conversions.
       */
      brw_reg_type shuffle_type =
         brw_reg_type_from_bit_size(8 * type_sz(src.type),
                                    BRW_REGISTER_TYPE_D);
      for (unsigned i = 0; i < components; i++) {
         /* Select the (i % size_ratio)-th sub-word of dst component
          * i / size_ratio as the landing spot for src component i.
          */
         fs_reg shuffle_component_i =
            subscript(offset(dst, bld, i / size_ratio),
                      shuffle_type, i % size_ratio);
         bld.MOV(shuffle_component_i,
                 retype(offset(src, bld, i + first_component), shuffle_type));
      }
   } else {
      /* Source is unshuffled into destination */
      /* Each src component splits into size_ratio small dst components. */
      unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() * components,
         offset(src, bld, first_component / size_ratio),
         type_sz(src.type) * bld.dispatch_width() *
         DIV_ROUND_UP(components + (first_component % size_ratio),
                      size_ratio)));

      /* Integer type of the destination's size: raw bit copies, no
       * conversion.
       */
      brw_reg_type shuffle_type =
         brw_reg_type_from_bit_size(8 * type_sz(dst.type),
                                    BRW_REGISTER_TYPE_D);
      for (unsigned i = 0; i < components; i++) {
         /* Read sub-word (first_component + i) % size_ratio out of the
          * large src component it lives in.
          */
         fs_reg shuffle_component_i =
            subscript(offset(src, bld, (first_component + i) / size_ratio),
                      shuffle_type, (first_component + i) % size_ratio);
         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
                 shuffle_component_i);
      }
   }
}
5623b8e80941Smrg
5624b8e80941Smrgvoid
5625b8e80941Smrgshuffle_from_32bit_read(const fs_builder &bld,
5626b8e80941Smrg                        const fs_reg &dst,
5627b8e80941Smrg                        const fs_reg &src,
5628b8e80941Smrg                        uint32_t first_component,
5629b8e80941Smrg                        uint32_t components)
5630b8e80941Smrg{
5631b8e80941Smrg   assert(type_sz(src.type) == 4);
5632b8e80941Smrg
5633b8e80941Smrg   /* This function takes components in units of the destination type while
5634b8e80941Smrg    * shuffle_src_to_dst takes components in units of the smallest type
5635b8e80941Smrg    */
5636b8e80941Smrg   if (type_sz(dst.type) > 4) {
5637b8e80941Smrg      assert(type_sz(dst.type) == 8);
5638b8e80941Smrg      first_component *= 2;
5639b8e80941Smrg      components *= 2;
5640b8e80941Smrg   }
5641b8e80941Smrg
5642b8e80941Smrg   shuffle_src_to_dst(bld, dst, src, first_component, components);
5643b8e80941Smrg}
5644b8e80941Smrg
5645b8e80941Smrgfs_reg
5646b8e80941Smrgshuffle_for_32bit_write(const fs_builder &bld,
5647b8e80941Smrg                        const fs_reg &src,
5648b8e80941Smrg                        uint32_t first_component,
5649b8e80941Smrg                        uint32_t components)
5650b8e80941Smrg{
5651b8e80941Smrg   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
5652b8e80941Smrg                         DIV_ROUND_UP (components * type_sz(src.type), 4));
5653b8e80941Smrg   /* This function takes components in units of the source type while
5654b8e80941Smrg    * shuffle_src_to_dst takes components in units of the smallest type
5655b8e80941Smrg    */
5656b8e80941Smrg   if (type_sz(src.type) > 4) {
5657b8e80941Smrg      assert(type_sz(src.type) == 8);
5658b8e80941Smrg      first_component *= 2;
5659b8e80941Smrg      components *= 2;
5660b8e80941Smrg   }
5661b8e80941Smrg
5662b8e80941Smrg   shuffle_src_to_dst(bld, dst, src, first_component, components);
5663b8e80941Smrg
5664b8e80941Smrg   return dst;
5665b8e80941Smrg}
5666b8e80941Smrg
5667b8e80941Smrgfs_reg
5668b8e80941Smrgsetup_imm_df(const fs_builder &bld, double v)
5669b8e80941Smrg{
5670b8e80941Smrg   const struct gen_device_info *devinfo = bld.shader->devinfo;
5671b8e80941Smrg   assert(devinfo->gen >= 7);
5672b8e80941Smrg
5673b8e80941Smrg   if (devinfo->gen >= 8)
5674b8e80941Smrg      return brw_imm_df(v);
5675b8e80941Smrg
5676b8e80941Smrg   /* gen7.5 does not support DF immediates straighforward but the DIM
5677b8e80941Smrg    * instruction allows to set the 64-bit immediate value.
5678b8e80941Smrg    */
5679b8e80941Smrg   if (devinfo->is_haswell) {
5680b8e80941Smrg      const fs_builder ubld = bld.exec_all().group(1, 0);
5681b8e80941Smrg      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
5682b8e80941Smrg      ubld.DIM(dst, brw_imm_df(v));
5683b8e80941Smrg      return component(dst, 0);
5684b8e80941Smrg   }
5685b8e80941Smrg
5686b8e80941Smrg   /* gen7 does not support DF immediates, so we generate a 64-bit constant by
5687b8e80941Smrg    * writing the low 32-bit of the constant to suboffset 0 of a VGRF and
5688b8e80941Smrg    * the high 32-bit to suboffset 4 and then applying a stride of 0.
5689b8e80941Smrg    *
5690b8e80941Smrg    * Alternatively, we could also produce a normal VGRF (without stride 0)
5691b8e80941Smrg    * by writing to all the channels in the VGRF, however, that would hit the
5692b8e80941Smrg    * gen7 bug where we have to split writes that span more than 1 register
5693b8e80941Smrg    * into instructions with a width of 4 (otherwise the write to the second
5694b8e80941Smrg    * register written runs into an execmask hardware bug) which isn't very
5695b8e80941Smrg    * nice.
5696b8e80941Smrg    */
5697b8e80941Smrg   union {
5698b8e80941Smrg      double d;
5699b8e80941Smrg      struct {
5700b8e80941Smrg         uint32_t i1;
5701b8e80941Smrg         uint32_t i2;
5702b8e80941Smrg      };
5703b8e80941Smrg   } di;
5704b8e80941Smrg
5705b8e80941Smrg   di.d = v;
5706b8e80941Smrg
5707b8e80941Smrg   const fs_builder ubld = bld.exec_all().group(1, 0);
5708b8e80941Smrg   const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
5709b8e80941Smrg   ubld.MOV(tmp, brw_imm_ud(di.i1));
5710b8e80941Smrg   ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));
5711b8e80941Smrg
5712b8e80941Smrg   return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
5713b8e80941Smrg}
5714b8e80941Smrg
5715b8e80941Smrgfs_reg
5716b8e80941Smrgsetup_imm_b(const fs_builder &bld, int8_t v)
5717b8e80941Smrg{
5718b8e80941Smrg   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
5719b8e80941Smrg   bld.MOV(tmp, brw_imm_w(v));
5720b8e80941Smrg   return tmp;
5721b8e80941Smrg}
5722b8e80941Smrg
5723b8e80941Smrgfs_reg
5724b8e80941Smrgsetup_imm_ub(const fs_builder &bld, uint8_t v)
5725b8e80941Smrg{
5726b8e80941Smrg   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
5727b8e80941Smrg   bld.MOV(tmp, brw_imm_uw(v));
5728b8e80941Smrg   return tmp;
5729b8e80941Smrg}
5730