1/* Copyright © 2011 Intel Corporation
2 *
3 * Permission is hereby granted, free of charge, to any person obtaining a
4 * copy of this software and associated documentation files (the "Software"),
5 * to deal in the Software without restriction, including without limitation
6 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
7 * and/or sell copies of the Software, and to permit persons to whom the
8 * Software is furnished to do so, subject to the following conditions:
9 *
10 * The above copyright notice and this permission notice (including the next
11 * paragraph) shall be included in all copies or substantial portions of the
12 * Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
20 * IN THE SOFTWARE.
21 */
22
23#include "brw_vec4.h"
24#include "brw_cfg.h"
25#include "brw_eu.h"
26#include "dev/gen_debug.h"
27
28using namespace brw;
29
30static void
31generate_math1_gen4(struct brw_codegen *p,
32                    vec4_instruction *inst,
33                    struct brw_reg dst,
34                    struct brw_reg src)
35{
36   gen4_math(p,
37	     dst,
38	     brw_math_function(inst->opcode),
39	     inst->base_mrf,
40	     src,
41	     BRW_MATH_PRECISION_FULL);
42}
43
44static void
45check_gen6_math_src_arg(struct brw_reg src)
46{
47   /* Source swizzles are ignored. */
48   assert(!src.abs);
49   assert(!src.negate);
50   assert(src.swizzle == BRW_SWIZZLE_XYZW);
51}
52
/* Emit a Gen6+ math instruction for one or two sources.  Gen6 math must
 * execute in Align1 mode, so the default access mode is switched around
 * the instruction and restored to Align16 afterwards.
 */
static void
generate_math_gen6(struct brw_codegen *p,
                   vec4_instruction *inst,
                   struct brw_reg dst,
                   struct brw_reg src0,
                   struct brw_reg src1)
{
   /* Can't do writemask because math can't be align16. */
   assert(dst.writemask == WRITEMASK_XYZW);
   /* Source swizzles are ignored. */
   check_gen6_math_src_arg(src0);
   /* src1 may be an immediate (or null for single-source math); only GRF
    * sources carry the modifiers/swizzles we need to check.
    */
   if (src1.file == BRW_GENERAL_REGISTER_FILE)
      check_gen6_math_src_arg(src1);

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1);
   brw_set_default_access_mode(p, BRW_ALIGN_16);
}
71
72static void
73generate_math2_gen4(struct brw_codegen *p,
74                    vec4_instruction *inst,
75                    struct brw_reg dst,
76                    struct brw_reg src0,
77                    struct brw_reg src1)
78{
79   /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
80    * "Message Payload":
81    *
82    * "Operand0[7].  For the INT DIV functions, this operand is the
83    *  denominator."
84    *  ...
85    * "Operand1[7].  For the INT DIV functions, this operand is the
86    *  numerator."
87    */
88   bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
89   struct brw_reg &op0 = is_int_div ? src1 : src0;
90   struct brw_reg &op1 = is_int_div ? src0 : src1;
91
92   brw_push_insn_state(p);
93   brw_set_default_saturate(p, false);
94   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
95   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1);
96   brw_pop_insn_state(p);
97
98   gen4_math(p,
99	     dst,
100	     brw_math_function(inst->opcode),
101	     inst->base_mrf,
102	     op0,
103	     BRW_MATH_PRECISION_FULL);
104}
105
/* Emit a sampler message for a vec4 texture instruction: select the
 * hardware message type from the IR opcode and generation, build the
 * message header if required, and send either a direct brw_SAMPLE (both
 * surface and sampler indices immediate) or an indirect send with the
 * combined index computed into a0.0.
 */
static void
generate_tex(struct brw_codegen *p,
             struct brw_vue_prog_data *prog_data,
             gl_shader_stage stage,
             vec4_instruction *inst,
             struct brw_reg dst,
             struct brw_reg src,
             struct brw_reg surface_index,
             struct brw_reg sampler_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int msg_type = -1;

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
	 }
	 break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
	 break;
      case SHADER_OPCODE_TXF:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
	 break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_TXS:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
	 break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
	 unreachable("should not get here: invalid vec4 texture opcode");
      }
   } else {
      /* Gen4: SIMD4x2 message encodings, with per-opcode expected message
       * lengths asserted against what the visitor set up.
       */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
	    assert(inst->mlen == 3);
	 } else {
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
	    assert(inst->mlen == 2);
	 }
	 break;
      case SHADER_OPCODE_TXD:
	 /* There is no sample_d_c message; comparisons are done manually. */
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
	 assert(inst->mlen == 4);
	 break;
      case SHADER_OPCODE_TXF:
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
	 assert(inst->mlen == 2);
	 break;
      case SHADER_OPCODE_TXS:
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
	 assert(inst->mlen == 2);
	 break;
      default:
	 unreachable("should not get here: invalid vec4 texture opcode");
      }
   }

   assert(msg_type != -1);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset, we need
    * to set it up explicitly and load the offset bitfield.  Otherwise, we can
    * use an implied move from g0 to the first message register.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = brw_vec8_grf(0, 0);
      } else {
         struct brw_reg header =
            retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
         uint32_t dw2 = 0;

         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

         brw_set_default_access_mode(p, BRW_ALIGN_1);

         if (inst->offset)
            /* Set the texel offset bits in DWord 2. */
            dw2 = inst->offset;

         if (devinfo->gen >= 9)
            /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D,
             * based on bit 22 in the header.
             */
            dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2;

         /* The VS, DS, and FS stages have the g0.2 payload delivered as 0,
          * so header0.2 is 0 when g0 is copied.  The HS and GS stages do
          * not, so we must set it to 0 to avoid setting undesirable bits
          * in the message header.
          */
         if (dw2 ||
             stage == MESA_SHADER_TESS_CTRL ||
             stage == MESA_SHADER_GEOMETRY) {
            brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2));
         }

         brw_adjust_sampler_state_pointer(p, header, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   /* Pick the sampler return format from the destination type; anything
    * that isn't explicitly integer samples back as float.
    */
   uint32_t return_format;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Gather operations use a separate binding table section. */
   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->base.binding_table.gather_texture_start
         : prog_data->base.binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 dst,
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 1, /* response length */
                 inst->mlen,
                 inst->header_size != 0,
                 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                 return_format);
   } else {
      /* Non-constant sampler index. */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* Build the combined (sampler << 8) | surface index into a0.0. */
      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
         /* Same register for both: index * 0x101 replicates it into both
          * byte positions in one instruction.
          */
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      } else {
         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
         } else {
            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
            brw_OR(p, addr, addr, surface_reg);
         }
      }
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      if (inst->base_mrf != -1)
         gen6_resolve_implied_move(p, &src, inst->base_mrf);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr,
         brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
         brw_sampler_desc(devinfo,
                          0 /* surface */,
                          0 /* sampler */,
                          msg_type,
                          BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                          return_format),
         false /* EOT */);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }
}
341
342static void
343generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
344{
345   brw_urb_WRITE(p,
346		 brw_null_reg(), /* dest */
347		 inst->base_mrf, /* starting mrf reg nr */
348		 brw_vec8_grf(0, 0), /* src */
349                 inst->urb_write_flags,
350		 inst->mlen,
351		 0,		/* response len */
352		 inst->offset,	/* urb destination offset */
353		 BRW_URB_SWIZZLE_INTERLEAVE);
354}
355
356static void
357generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
358{
359   struct brw_reg src = brw_message_reg(inst->base_mrf);
360   brw_urb_WRITE(p,
361                 brw_null_reg(), /* dest */
362                 inst->base_mrf, /* starting mrf reg nr */
363                 src,
364                 inst->urb_write_flags,
365                 inst->mlen,
366                 0,             /* response len */
367                 inst->offset,  /* urb destination offset */
368                 BRW_URB_SWIZZLE_INTERLEAVE);
369}
370
/* Emit a GS URB write that also allocates a new URB entry.  The message
 * writes back the newly allocated handle into the temporary in src[0];
 * that handle is then copied into component 0 of dst.
 */
static void
generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst)
{
   struct brw_reg src = brw_message_reg(inst->base_mrf);

   /* We pass the temporary passed in src0 as the writeback register */
   brw_urb_WRITE(p,
                 inst->src[0].as_brw_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 src,
                 BRW_URB_WRITE_ALLOCATE_COMPLETE,
                 inst->mlen,
                 1, /* response len */
                 inst->offset,  /* urb destination offset */
                 BRW_URB_SWIZZLE_INTERLEAVE);

   /* Now put allocated urb handle in dst.0 */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0),
           get_element_ud(inst->src[0].as_brw_reg(), 0));
   brw_pop_insn_state(p);
}
395
396static void
397generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
398{
399   struct brw_reg src = brw_message_reg(inst->base_mrf);
400   brw_urb_WRITE(p,
401                 brw_null_reg(), /* dest */
402                 inst->base_mrf, /* starting mrf reg nr */
403                 src,
404                 BRW_URB_WRITE_EOT | inst->urb_write_flags,
405                 inst->mlen,
406                 0,              /* response len */
407                 0,              /* urb destination offset */
408                 BRW_URB_SWIZZLE_INTERLEAVE);
409}
410
/* Fill the slot-0/slot-1 URB write offsets (DWords 3 and 4 of the message
 * header in dst) by multiplying the per-invocation offsets in src0 by the
 * immediate scale in src1.
 */
static void
generate_gs_set_write_offset(struct brw_codegen *p,
                             struct brw_reg dst,
                             struct brw_reg src0,
                             struct brw_reg src1)
{
   /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.3):
    *
    *     Slot 0 Offset. This field, after adding to the Global Offset field
    *     in the message descriptor, specifies the offset (in 256-bit units)
    *     from the start of the URB entry, as referenced by URB Handle 0, at
    *     which the data will be accessed.
    *
    * Similar text describes DWORD M0.4, which is slot 1 offset.
    *
    * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
    * of the register for geometry shader invocations 0 and 1) by the
    * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
    *
    * We can do this with the following EU instruction:
    *
    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW   { Align1 WE_all }
    */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   /* src1 must be a UD immediate that fits in 16 bits so it can be used
    * as the UW source of the MUL below.
    */
   assert(p->devinfo->gen >= 7 &&
          src1.file == BRW_IMMEDIATE_VALUE &&
          src1.type == BRW_REGISTER_TYPE_UD &&
          src1.ud <= USHRT_MAX);
   if (src0.file == BRW_IMMEDIATE_VALUE) {
      /* Both factors are known at compile time: fold the multiply and
       * emit a single immediate MOV instead.
       */
      brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3),
              brw_imm_ud(src0.ud * src1.ud));
   } else {
      brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
              retype(src1, BRW_REGISTER_TYPE_UW));
   }
   brw_pop_insn_state(p);
}
451
/* Store the GS emitted-vertex count where the thread-end message expects
 * it: a full MRF on Gen8+, or packed words within the header on Gen7.
 */
static void
generate_gs_set_vertex_count(struct brw_codegen *p,
                             struct brw_reg dst,
                             struct brw_reg src)
{
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   if (p->devinfo->gen >= 8) {
      /* Move the vertex count into the second MRF for the EOT write. */
      brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD),
              src);
   } else {
      /* If we think of the src and dst registers as composed of 8 DWORDs each,
       * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
       * them to WORDs, and then pack them into DWORD 2 of dst.
       *
       * It's easier to get the EU to do this if we think of the src and dst
       * registers as composed of 16 WORDS each; then, we want to pick up the
       * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
       * of dst.
       *
       * We can do that by the following EU instruction:
       *
       *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_MOV(p,
              suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
              stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
   }
   brw_pop_insn_state(p);
}
485
/* Emit a Gen6 streamed-vertex-buffer (transform feedback) write.  Vertex
 * data from src0 is staged into the message register in dst, then sent to
 * the SO binding selected by inst->sol_binding.  On the final write, src1
 * receives the write-commit and is then read back to stall on it.
 */
static void
generate_gs_svb_write(struct brw_codegen *p,
                      struct brw_vue_prog_data *prog_data,
                      vec4_instruction *inst,
                      struct brw_reg dst,
                      struct brw_reg src0,
                      struct brw_reg src1)
{
   int binding = inst->sol_binding;
   bool final_write = inst->sol_final_write;

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_4);
   /* Copy Vertex data into M0.x */
   brw_MOV(p, stride(dst, 4, 4, 1),
           stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1));
   brw_pop_insn_state(p);

   brw_push_insn_state(p);
   /* Send SVB Write */
   brw_svb_write(p,
                 final_write ? src1 : brw_null_reg(), /* dest == src1 */
                 1, /* msg_reg_nr */
                 dst, /* src0 == previous dst */
                 BRW_GEN6_SOL_BINDING_START + binding, /* binding_table_index */
                 final_write); /* send_commit_msg */

   /* Finally, wait for the write commit to occur so that we can proceed to
    * other things safely.
    *
    * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
    *
    *   The write commit does not modify the destination register, but
    *   merely clears the dependency associated with the destination
    *   register. Thus, a simple “mov” instruction using the register as a
    *   source is sufficient to wait for the write commit to occur.
    */
   if (final_write) {
      brw_MOV(p, src1, src1);
   }
   brw_pop_insn_state(p);
}
528
529static void
530generate_gs_svb_set_destination_index(struct brw_codegen *p,
531                                      vec4_instruction *inst,
532                                      struct brw_reg dst,
533                                      struct brw_reg src)
534{
535   int vertex = inst->sol_vertex;
536   brw_push_insn_state(p);
537   brw_set_default_access_mode(p, BRW_ALIGN_1);
538   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
539   brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex));
540   brw_pop_insn_state(p);
541}
542
543static void
544generate_gs_set_dword_2(struct brw_codegen *p,
545                        struct brw_reg dst,
546                        struct brw_reg src)
547{
548   brw_push_insn_state(p);
549   brw_set_default_access_mode(p, BRW_ALIGN_1);
550   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
551   brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
552   brw_pop_insn_state(p);
553}
554
555static void
556generate_gs_prepare_channel_masks(struct brw_codegen *p,
557                                  struct brw_reg dst)
558{
559   /* We want to left shift just DWORD 4 (the x component belonging to the
560    * second geometry shader invocation) by 4 bits.  So generate the
561    * instruction:
562    *
563    *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
564    */
565   dst = suboffset(vec1(dst), 4);
566   brw_push_insn_state(p);
567   brw_set_default_access_mode(p, BRW_ALIGN_1);
568   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
569   brw_SHL(p, dst, dst, brw_imm_ud(4));
570   brw_pop_insn_state(p);
571}
572
/* Combine the per-invocation channel enables from src into the channel
 * mask bits (15:8 of DWord 5) of the URB write header in dst.
 */
static void
generate_gs_set_channel_masks(struct brw_codegen *p,
                              struct brw_reg dst,
                              struct brw_reg src)
{
   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.5):
    *
    *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
    *
    *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
    *        DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
    *        Vertex 0 DATA[7].  This bit is ANDed with the corresponding
    *        channel enable to determine the final channel enable.  For the
    *        URB_READ_OWORD & URB_READ_HWORD messages, when final channel
    *        enable is 1 it indicates that Vertex 1 DATA [3] will be included
    *        in the writeback message.  For the URB_WRITE_OWORD &
    *        URB_WRITE_HWORD messages, when final channel enable is 1 it
    *        indicates that Vertex 1 DATA [3] will be written to the surface.
    *
    *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
    *        1: Vertex DATA [3] / Vertex 0 DATA[7] channel included
    *
    *     14 Vertex 1 DATA [2] Channel Mask
    *     13 Vertex 1 DATA [1] Channel Mask
    *     12 Vertex 1 DATA [0] Channel Mask
    *     11 Vertex 0 DATA [3] Channel Mask
    *     10 Vertex 0 DATA [2] Channel Mask
    *      9 Vertex 0 DATA [1] Channel Mask
    *      8 Vertex 0 DATA [0] Channel Mask
    *
    * (This is from a section of the PRM that is agnostic to the particular
    * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to
    * geometry shader invocations 0 and 1, respectively).  Since we have the
    * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0,
    * and the enable flags for geometry shader invocation 1 in bits 7:0 of
    * DWORD 4, we just need to OR them together and store the result in bits
    * 15:8 of DWORD 5.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 32 bytes each; then, we want to pick up the
    * contents of bytes 0 and 16 from src, OR them together, and store them in
    * byte 21.
    *
    * We can do that by the following EU instruction:
    *
    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
    *
    * Note: this relies on the source register having zeros in (a) bits 7:4 of
    * DWORD 0 and (b) bits 3:0 of DWORD 4.  We can rely on (b) because the
    * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which
    * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to
    * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to
    * contain valid channel mask values (which are in the range 0x0-0xf).
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UB);
   src = retype(src, BRW_REGISTER_TYPE_UB);
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16));
   brw_pop_insn_state(p);
}
636
637static void
638generate_gs_get_instance_id(struct brw_codegen *p,
639                            struct brw_reg dst)
640{
641   /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT
642    * and store into dst.0 & dst.4. So generate the instruction:
643    *
644    *     shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q }
645    */
646   brw_push_insn_state(p);
647   brw_set_default_access_mode(p, BRW_ALIGN_1);
648   dst = retype(dst, BRW_REGISTER_TYPE_UD);
649   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
650   brw_SHR(p, dst, stride(r0, 1, 4, 0),
651           brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
652   brw_pop_insn_state(p);
653}
654
/* Pack the primitive/vertex counts for the FF_SYNC header: src0 goes into
 * bits 16:31 of dst.0 and src1 into bits 0:15.
 *
 * NOTE(review): component 0 of src2 is clobbered as a scratch register by
 * the masking of src1 below.
 */
static void
generate_gs_ff_sync_set_primitives(struct brw_codegen *p,
                                   struct brw_reg dst,
                                   struct brw_reg src0,
                                   struct brw_reg src1,
                                   struct brw_reg src2)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   /* Save src0 data in 16:31 bits of dst.0 */
   brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0),
           brw_imm_ud(0xffffu));
   brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16));
   /* Save src1 data in 0:15 bits of dst.0 */
   brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0),
           brw_imm_ud(0xffffu));
   brw_OR(p, suboffset(vec1(dst), 0),
          suboffset(vec1(dst), 0),
          suboffset(vec1(src2), 0));
   brw_pop_insn_state(p);
}
676
/* Emit the Gen4/5 GS FF_SYNC message: fill in the header counts, allocate
 * a URB handle into dst, then stash the handle back in the header for the
 * subsequent URB writes.
 */
static void
generate_gs_ff_sync(struct brw_codegen *p,
                    vec4_instruction *inst,
                    struct brw_reg dst,
                    struct brw_reg src0,
                    struct brw_reg src1)
{
   /* This opcode uses an implied MRF register for:
    *  - the header of the ff_sync message. And as such it is expected to be
    *    initialized to r0 before calling here.
    *  - the destination where we will write the allocated URB handle.
    */
   struct brw_reg header =
      retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);

   /* Overwrite dword 0 of the header (SO vertices to write) and
    * dword 1 (number of primitives written).
    */
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0));
   brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0));
   brw_pop_insn_state(p);

   /* Allocate URB handle in dst */
   brw_ff_sync(p,
               dst,
               0,
               header,
               1, /* allocate */
               1, /* response length */
               0 /* eot */);

   /* Now put allocated urb handle in header.0 */
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0));

   /* src1 is not an immediate when we use transform feedback */
   if (src1.file != BRW_IMMEDIATE_VALUE) {
      /* Copy the second writeback dword back into src1's register. */
      brw_set_default_exec_size(p, BRW_EXECUTE_4);
      brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1));
   }

   brw_pop_insn_state(p);
}
725
726static void
727generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
728{
729   /* In gen6, PrimitiveID is delivered in R0.1 of the payload */
730   struct brw_reg src = brw_vec8_grf(0, 0);
731   brw_push_insn_state(p);
732   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
733   brw_set_default_access_mode(p, BRW_ALIGN_1);
734   brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1));
735   brw_pop_insn_state(p);
736}
737
/* Derive the TCS instance IDs for the two SIMD4x2 halves from the payload
 * "Instance Count" field, writing 2i into dst.0 and 2i+1 into dst.4.
 */
static void
generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;

   /* "Instance Count" comes as part of the payload in r0.2 bits 23:17.
    *
    * Since we operate in SIMD4x2 mode, we need run half as many threads
    * as necessary.  So we assign (2i + 1, 2i) as the thread counts.  We
    * shift right by one less to accomplish the multiplication by two.
    */
   dst = retype(dst, BRW_REGISTER_TYPE_UD);
   struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   /* IVB/BYT place the field one bit lower than later parts. */
   const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
   const int shift = ivb ? 16 : 17;

   /* Isolate the field, shift by one less than its position (so the
    * result is already doubled), then derive the odd ID in dst.4.
    */
   brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask));
   brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0),
           brw_imm_ud(shift - 1));
   brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1));

   brw_pop_insn_state(p);
}
766
/* Emit a TCS URB write as a raw SEND to the URB shared function, using
 * urb_header as the message source.  EOT writes terminate the thread;
 * non-EOT writes use per-slot offsets with interleaved swizzling.
 */
static void
generate_tcs_urb_write(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg urb_header)
{
   const struct gen_device_info *devinfo = p->devinfo;

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, brw_null_reg());
   brw_set_src0(p, send, urb_header);
   brw_set_desc(p, send, brw_message_desc(devinfo, inst->mlen, 0, true));

   brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD);
   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
   if (inst->urb_write_flags & BRW_URB_WRITE_EOT) {
      brw_inst_set_eot(devinfo, send, 1);
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
      brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
   }

   /* what happens to swizzles? */
}
791
792
static void
generate_tcs_input_urb_offsets(struct brw_codegen *p,
                               struct brw_reg dst,
                               struct brw_reg vertex,
                               struct brw_reg offset)
{
   /* Generates an URB read/write message header for HS/DS operation.
    * Inputs are a vertex index, and a byte offset from the beginning of
    * the vertex. */

   /* If `vertex` is not an immediate, we clobber a0.0 */

   assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE);
   assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D);

   assert(dst.file == BRW_GENERAL_REGISTER_FILE);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   /* Start from a zeroed header. */
   brw_MOV(p, dst, brw_imm_ud(0));

   /* m0.5 bits 8-15 are channel enables */
   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));

   /* m0.0-0.1: URB handles */
   if (vertex.file == BRW_IMMEDIATE_VALUE) {
      uint32_t vertex_index = vertex.ud;
      /* ICP handles are laid out 8 per GRF starting at g1; compute the
       * GRF and subregister holding this vertex's handle directly.
       */
      struct brw_reg index_reg = brw_vec1_grf(
            1 + (vertex_index >> 3), vertex_index & 7);

      brw_MOV(p, vec2(get_element_ud(dst, 0)),
              retype(index_reg, BRW_REGISTER_TYPE_UD));
   } else {
      /* Use indirect addressing.  ICP Handles are DWords (single channels
       * of a register) and start at g1.0.
       *
       * In order to start our region at g1.0, we add 8 to the vertex index,
       * effectively skipping over the 8 channels in g0.0.  This gives us a
       * DWord offset to the ICP Handle.
       *
       * Indirect addressing works in terms of bytes, so we then multiply
       * the DWord offset by 4 (by shifting left by 2).
       */
      struct brw_reg addr = brw_address_reg(0);

      /* bottom half: m0.0 = g[1.0 + vertex.0]UD */
      brw_ADD(p, addr, retype(get_element_ud(vertex, 0), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0x8));
      brw_SHL(p, addr, addr, brw_imm_uw(2));
      brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0));

      /* top half: m0.1 = g[1.0 + vertex.4]UD */
      brw_ADD(p, addr, retype(get_element_ud(vertex, 4), BRW_REGISTER_TYPE_UW),
              brw_imm_uw(0x8));
      brw_SHL(p, addr, addr, brw_imm_uw(2));
      brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0));
   }

   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
   if (offset.file != ARF)
      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));

   brw_pop_insn_state(p);
}
858
859
860static void
861generate_tcs_output_urb_offsets(struct brw_codegen *p,
862                                struct brw_reg dst,
863                                struct brw_reg write_mask,
864                                struct brw_reg offset)
865{
866   /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. */
867   assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE);
868
869   assert(write_mask.file == BRW_IMMEDIATE_VALUE);
870   assert(write_mask.type == BRW_REGISTER_TYPE_UD);
871
872   brw_push_insn_state(p);
873
874   brw_set_default_access_mode(p, BRW_ALIGN_1);
875   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
876   brw_MOV(p, dst, brw_imm_ud(0));
877
878   unsigned mask = write_mask.ud;
879
880   /* m0.5 bits 15:12 and 11:8 are channel enables */
881   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12)));
882
883   /* HS patch URB handle is delivered in r0.0 */
884   struct brw_reg urb_handle = brw_vec1_grf(0, 0);
885
886   /* m0.0-0.1: URB handles */
887   brw_MOV(p, vec2(get_element_ud(dst, 0)),
888           retype(urb_handle, BRW_REGISTER_TYPE_UD));
889
890   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
891   if (offset.file != ARF)
892      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
893
894   brw_pop_insn_state(p);
895}
896
897static void
898generate_tes_create_input_read_header(struct brw_codegen *p,
899                                      struct brw_reg dst)
900{
901   brw_push_insn_state(p);
902   brw_set_default_access_mode(p, BRW_ALIGN_1);
903   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
904
905   /* Initialize the register to 0 */
906   brw_MOV(p, dst, brw_imm_ud(0));
907
908   /* Enable all the channels in m0.5 bits 15:8 */
909   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
910
911   /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1.  For safety,
912    * mask out irrelevant "Reserved" bits, as they're not marked MBZ.
913    */
914   brw_AND(p, vec2(get_element_ud(dst, 0)),
915           retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD),
916           brw_imm_ud(0x1fff));
917   brw_pop_insn_state(p);
918}
919
920static void
921generate_tes_add_indirect_urb_offset(struct brw_codegen *p,
922                                     struct brw_reg dst,
923                                     struct brw_reg header,
924                                     struct brw_reg offset)
925{
926   brw_push_insn_state(p);
927   brw_set_default_access_mode(p, BRW_ALIGN_1);
928   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
929
930   brw_MOV(p, dst, header);
931
932   /* Uniforms will have a stride <0;4,1>, and we need to convert to <0;1,0>.
933    * Other values get <4;1,0>.
934    */
935   struct brw_reg restrided_offset;
936   if (offset.vstride == BRW_VERTICAL_STRIDE_0 &&
937       offset.width == BRW_WIDTH_4 &&
938       offset.hstride == BRW_HORIZONTAL_STRIDE_1) {
939      restrided_offset = stride(offset, 0, 1, 0);
940   } else {
941      restrided_offset = stride(offset, 4, 1, 0);
942   }
943
944   /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
945   brw_MOV(p, vec2(get_element_ud(dst, 3)), restrided_offset);
946
947   brw_pop_insn_state(p);
948}
949
950static void
951generate_vec4_urb_read(struct brw_codegen *p,
952                       vec4_instruction *inst,
953                       struct brw_reg dst,
954                       struct brw_reg header)
955{
956   const struct gen_device_info *devinfo = p->devinfo;
957
958   assert(header.file == BRW_GENERAL_REGISTER_FILE);
959   assert(header.type == BRW_REGISTER_TYPE_UD);
960
961   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
962   brw_set_dest(p, send, dst);
963   brw_set_src0(p, send, header);
964
965   brw_set_desc(p, send, brw_message_desc(devinfo, 1, 1, true));
966
967   brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
968   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
969   brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
970   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);
971
972   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
973}
974
975static void
976generate_tcs_release_input(struct brw_codegen *p,
977                           struct brw_reg header,
978                           struct brw_reg vertex,
979                           struct brw_reg is_unpaired)
980{
981   const struct gen_device_info *devinfo = p->devinfo;
982
983   assert(vertex.file == BRW_IMMEDIATE_VALUE);
984   assert(vertex.type == BRW_REGISTER_TYPE_UD);
985
986   /* m0.0-0.1: URB handles */
987   struct brw_reg urb_handles =
988      retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
989             BRW_REGISTER_TYPE_UD);
990
991   brw_push_insn_state(p);
992   brw_set_default_access_mode(p, BRW_ALIGN_1);
993   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
994   brw_MOV(p, header, brw_imm_ud(0));
995   brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
996   brw_pop_insn_state(p);
997
998   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
999   brw_set_dest(p, send, brw_null_reg());
1000   brw_set_src0(p, send, header);
1001   brw_set_desc(p, send, brw_message_desc(devinfo, 1, 0, true));
1002
1003   brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
1004   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
1005   brw_inst_set_urb_complete(devinfo, send, 1);
1006   brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
1007                                    BRW_URB_SWIZZLE_NONE :
1008                                    BRW_URB_SWIZZLE_INTERLEAVE);
1009}
1010
1011static void
1012generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
1013{
1014   struct brw_reg header = brw_message_reg(inst->base_mrf);
1015
1016   brw_push_insn_state(p);
1017   brw_set_default_access_mode(p, BRW_ALIGN_1);
1018   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1019   brw_MOV(p, header, brw_imm_ud(0));
1020   brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8));
1021   brw_MOV(p, get_element_ud(header, 0),
1022           retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
1023   brw_MOV(p, brw_message_reg(inst->base_mrf + 1), brw_imm_ud(0u));
1024   brw_pop_insn_state(p);
1025
1026   brw_urb_WRITE(p,
1027                 brw_null_reg(), /* dest */
1028                 inst->base_mrf, /* starting mrf reg nr */
1029                 header,
1030                 BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD |
1031                 BRW_URB_WRITE_USE_CHANNEL_MASKS,
1032                 inst->mlen,
1033                 0,              /* response len */
1034                 0,              /* urb destination offset */
1035                 0);
1036}
1037
1038static void
1039generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
1040{
1041   brw_push_insn_state(p);
1042   brw_set_default_access_mode(p, BRW_ALIGN_1);
1043   brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D));
1044   brw_pop_insn_state(p);
1045}
1046
1047static void
1048generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
1049{
1050   brw_push_insn_state(p);
1051   brw_set_default_access_mode(p, BRW_ALIGN_1);
1052   brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
1053   brw_pop_insn_state(p);
1054}
1055
static void
generate_tcs_create_barrier_header(struct brw_codegen *p,
                                   struct brw_vue_prog_data *prog_data,
                                   struct brw_reg dst)
{
   /* Build the m0 header for a TCS barrier message: the Barrier ID copied
    * out of r0.2 into bits 27:24, plus the barrier count and an enable bit.
    */
   const struct gen_device_info *devinfo = p->devinfo;
   /* Gen7 (IVB/BYT) stores the Barrier ID in a different bit range than
    * Gen7.5+, so both the extraction mask and the shift differ below. */
   const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
   struct brw_reg m0_2 = get_element_ud(dst, 2);
   unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Zero the message header */
   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));

   /* Copy "Barrier ID" from r0.2, bits 16:13 (Gen7.5+) or 15:12 (Gen7) */
   brw_AND(p, m0_2,
           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13)));

   /* Shift it up to bits 27:24 (12 for Gen7's 15:12, 11 for Gen7.5+'s 16:13). */
   brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(ivb ? 12 : 11));

   /* Set the Barrier Count and the enable bit */
   brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));

   brw_pop_insn_state(p);
}
1086
1087static void
1088generate_oword_dual_block_offsets(struct brw_codegen *p,
1089                                  struct brw_reg m1,
1090                                  struct brw_reg index)
1091{
1092   int second_vertex_offset;
1093
1094   if (p->devinfo->gen >= 6)
1095      second_vertex_offset = 1;
1096   else
1097      second_vertex_offset = 16;
1098
1099   m1 = retype(m1, BRW_REGISTER_TYPE_D);
1100
1101   /* Set up M1 (message payload).  Only the block offsets in M1.0 and
1102    * M1.4 are used, and the rest are ignored.
1103    */
1104   struct brw_reg m1_0 = suboffset(vec1(m1), 0);
1105   struct brw_reg m1_4 = suboffset(vec1(m1), 4);
1106   struct brw_reg index_0 = suboffset(vec1(index), 0);
1107   struct brw_reg index_4 = suboffset(vec1(index), 4);
1108
1109   brw_push_insn_state(p);
1110   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1111   brw_set_default_access_mode(p, BRW_ALIGN_1);
1112
1113   brw_MOV(p, m1_0, index_0);
1114
1115   if (index.file == BRW_IMMEDIATE_VALUE) {
1116      index_4.ud += second_vertex_offset;
1117      brw_MOV(p, m1_4, index_4);
1118   } else {
1119      brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
1120   }
1121
1122   brw_pop_insn_state(p);
1123}
1124
1125static void
1126generate_unpack_flags(struct brw_codegen *p,
1127                      struct brw_reg dst)
1128{
1129   brw_push_insn_state(p);
1130   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1131   brw_set_default_access_mode(p, BRW_ALIGN_1);
1132
1133   struct brw_reg flags = brw_flag_reg(0, 0);
1134   struct brw_reg dst_0 = suboffset(vec1(dst), 0);
1135   struct brw_reg dst_4 = suboffset(vec1(dst), 4);
1136
1137   brw_AND(p, dst_0, flags, brw_imm_ud(0x0f));
1138   brw_AND(p, dst_4, flags, brw_imm_ud(0xf0));
1139   brw_SHR(p, dst_4, dst_4, brw_imm_ud(4));
1140
1141   brw_pop_insn_state(p);
1142}
1143
static void
generate_scratch_read(struct brw_codegen *p,
                      vec4_instruction *inst,
                      struct brw_reg dst,
                      struct brw_reg index)
{
   /* Fill: read a register back from the scratch buffer with an OWord
    * dual block read.  `index` supplies the block offsets for the two
    * slots (see generate_oword_dual_block_offsets).
    */
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   /* m1.0/m1.4: block offsets for the two slots being read. */
   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
				     index);

   uint32_t msg_type;

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* The data port unit (SFID) serving scratch reads varies by gen. */
   const unsigned target_cache =
      devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
      devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
      BRW_SFID_DATAPORT_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, send, target_cache);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   /* NOTE(review): on pre-gen6 the cond-modifier field of a send appears
    * to carry the message register number — confirm against brw_inst. */
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
   brw_set_desc(p, send,
                brw_message_desc(devinfo, 2, 1, true) |
                brw_dp_read_desc(devinfo,
                                 brw_scratch_surface_idx(p),
                                 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                                 msg_type, BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
}
1188
static void
generate_scratch_write(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg dst,
                       struct brw_reg src,
                       struct brw_reg index)
{
   /* Spill: write `src` to the scratch buffer with an OWord dual block
    * write.  `index` supplies the block offsets for the two slots (see
    * generate_oword_dual_block_offsets).
    */
   const struct gen_device_info *devinfo = p->devinfo;
   /* The data port unit (SFID) serving scratch writes varies by gen. */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   struct brw_reg header = brw_vec8_grf(0, 0);
   bool write_commit;

   /* If the instruction is predicated, we'll predicate the send, not
    * the header setup.
    */
   brw_set_default_predicate_control(p, false);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   /* m1.0/m1.4: block offsets for the two slots being written. */
   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
				     index);

   /* m2: the data payload to be written. */
   brw_MOV(p,
	   retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
	   retype(src, BRW_REGISTER_TYPE_D));

   uint32_t msg_type;

   if (devinfo->gen >= 7)
      msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
   else if (devinfo->gen == 6)
      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   else
      msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;

   /* Restore the instruction's own predicate for the send itself. */
   brw_set_default_predicate_control(p, inst->predicate);

   /* Pre-gen6, we have to specify write commits to ensure ordering
    * between reads and writes within a thread.  Afterwards, that's
    * guaranteed and write commits only matter for inter-thread
    * synchronization.
    */
   if (devinfo->gen >= 6) {
      write_commit = false;
   } else {
      /* The visitor set up our destination register to be g0.  This
       * means that when the next read comes along, we will end up
       * reading from g0 and causing a block on the write commit.  For
       * write-after-read, we are relying on the value of the previous
       * read being used (and thus blocking on completion) before our
       * write is executed.  This means we have to be careful in
       * instruction scheduling to not violate this assumption.
       */
      write_commit = true;
   }

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(p->devinfo, send, target_cache);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   /* NOTE(review): on pre-gen6 the cond-modifier field of a send appears
    * to carry the message register number — confirm against brw_inst. */
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_desc(p, send,
                brw_message_desc(devinfo, 3, write_commit, true) |
                brw_dp_write_desc(devinfo,
                                  brw_scratch_surface_idx(p),
                                  BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                                  msg_type,
                                  false, /* not a render target write */
                                  write_commit));
}
1266
static void
generate_pull_constant_load(struct brw_codegen *p,
                            struct brw_vue_prog_data *prog_data,
                            vec4_instruction *inst,
                            struct brw_reg dst,
                            struct brw_reg index,
                            struct brw_reg offset)
{
   /* Load constants from the buffer at surface `index` (must be an
    * immediate) using an OWord dual block read at `offset`.
    */
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
       BRW_SFID_DATAPORT_READ);
   assert(index.file == BRW_IMMEDIATE_VALUE &&
	  index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   if (devinfo->gen >= 6) {
      /* Gen6+ wants the offset in 16-byte (OWord) units, hence the
       * divide-by-16 via >> 4 in both the immediate and register cases.
       */
      if (offset.file == BRW_IMMEDIATE_VALUE) {
         brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
                           BRW_REGISTER_TYPE_D),
                 brw_imm_d(offset.ud >> 4));
      } else {
         brw_SHR(p, retype(brw_message_reg(inst->base_mrf + 1),
                           BRW_REGISTER_TYPE_D),
                 offset, brw_imm_d(4));
      }
   } else {
      /* Pre-gen6 the offset is passed through unmodified. */
      brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
                        BRW_REGISTER_TYPE_D),
              offset);
   }

   uint32_t msg_type;

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, send, target_cache);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   /* NOTE(review): on pre-gen6 the cond-modifier field of a send appears
    * to carry the message register number — confirm against brw_inst. */
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_desc(p, send,
                brw_message_desc(devinfo, 2, 1, true) |
                brw_dp_read_desc(devinfo, surf_index,
                                 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                                 msg_type,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
}
1328
1329static void
1330generate_get_buffer_size(struct brw_codegen *p,
1331                         struct brw_vue_prog_data *prog_data,
1332                         vec4_instruction *inst,
1333                         struct brw_reg dst,
1334                         struct brw_reg src,
1335                         struct brw_reg surf_index)
1336{
1337   assert(p->devinfo->gen >= 7);
1338   assert(surf_index.type == BRW_REGISTER_TYPE_UD &&
1339          surf_index.file == BRW_IMMEDIATE_VALUE);
1340
1341   brw_SAMPLE(p,
1342              dst,
1343              inst->base_mrf,
1344              src,
1345              surf_index.ud,
1346              0,
1347              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
1348              1, /* response length */
1349              inst->mlen,
1350              inst->header_size > 0,
1351              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
1352              BRW_SAMPLER_RETURN_FORMAT_SINT32);
1353}
1354
static void
generate_pull_constant_load_gen7(struct brw_codegen *p,
                                 struct brw_vue_prog_data *prog_data,
                                 vec4_instruction *inst,
                                 struct brw_reg dst,
                                 struct brw_reg surf_index,
                                 struct brw_reg offset)
{
   /* Gen7+ pull-constant load via the sampler's LD message.  The surface
    * index may be an immediate (descriptor baked at compile time) or a
    * register (descriptor assembled at runtime through a0.0).
    */
   const struct gen_device_info *devinfo = p->devinfo;
   assert(surf_index.type == BRW_REGISTER_TYPE_UD);

   if (surf_index.file == BRW_IMMEDIATE_VALUE) {
      /* Surface known at compile time: emit a plain send with a static
       * descriptor. */
      brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
      brw_set_dest(p, insn, dst);
      brw_set_src0(p, insn, offset);
      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
                   brw_sampler_desc(devinfo, surf_index.ud,
                                    0, /* LD message ignores sampler unit */
                                    GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                                    BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0));
   } else {
      /* Dynamic surface index: build the descriptor in a0.0 and use an
       * indirect send. */
      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, offset, addr,
         brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
         brw_sampler_desc(devinfo,
                          0 /* surface */,
                          0 /* sampler */,
                          GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                          BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                          0),
         false /* EOT */);
   }
}
1408
1409static void
1410generate_set_simd4x2_header_gen9(struct brw_codegen *p,
1411                                 vec4_instruction *,
1412                                 struct brw_reg dst)
1413{
1414   brw_push_insn_state(p);
1415   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1416
1417   brw_set_default_exec_size(p, BRW_EXECUTE_8);
1418   brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
1419
1420   brw_set_default_access_mode(p, BRW_ALIGN_1);
1421   brw_MOV(p, get_element_ud(dst, 2),
1422           brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
1423
1424   brw_pop_insn_state(p);
1425}
1426
static void
generate_mov_indirect(struct brw_codegen *p,
                      vec4_instruction *,
                      struct brw_reg dst, struct brw_reg reg,
                      struct brw_reg indirect)
{
   /* Emit dst = reg[indirect], i.e. a move whose source is addressed by a
    * runtime offset.  An immediate `indirect` is folded into a plain MOV;
    * otherwise the address register a0 and a VxH indirect region are used.
    */
   assert(indirect.type == BRW_REGISTER_TYPE_UD);
   assert(p->devinfo->gen >= 6);

   /* NOTE(review): subnr appears to be counted in half-register units
    * here — confirm against brw_reg's align16 encoding. */
   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);

   /* This instruction acts in align1 mode */
   assert(dst.writemask == WRITEMASK_XYZW);

   if (indirect.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect.ud;

      /* Translate the byte offset back into nr/subnr, and express the
       * remaining within-half-register part as a swizzle rotation.
       */
      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2;
      unsigned shift = (imm_byte_offset / 4) % 4;
      reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);

      brw_MOV(p, dst, reg);
   } else {
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      struct brw_reg addr = vec8(brw_address_reg(0));

      /* We need to move the indirect value into the address register.  In
       * order to make things make some sense, we want to respect at least the
       * X component of the swizzle.  In order to do that, we need to convert
       * the subnr (probably 0) to an align1 subnr and add in the swizzle.
       */
      assert(brw_is_single_value_swizzle(indirect.swizzle));
      indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0));

      /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of
       * the indirect and splat it out to all four channels of the given half
       * of a0.
       */
      indirect.subnr *= 2;
      indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
      brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));

      /* Now we need to incorporate the swizzle from the source register */
      if (reg.swizzle != BRW_SWIZZLE_XXXX) {
         /* Pack per-channel byte offsets (swizzle component * 4, via the
          * << 2 baked into each field's shift) into a :uv immediate and
          * replicate it to both halves of a0.
          */
         uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 |
                            BRW_GET_SWZ(reg.swizzle, 1) << 6 |
                            BRW_GET_SWZ(reg.swizzle, 2) << 10 |
                            BRW_GET_SWZ(reg.swizzle, 3) << 14;
         uv_swiz |= uv_swiz << 16;

         brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz));
      }

      /* Finally, read through a0 with a VxH indirect region. */
      brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type));

      brw_pop_insn_state(p);
   }
}
1489
1490static void
1491generate_code(struct brw_codegen *p,
1492              const struct brw_compiler *compiler,
1493              void *log_data,
1494              const nir_shader *nir,
1495              struct brw_vue_prog_data *prog_data,
1496              const struct cfg_t *cfg)
1497{
1498   const struct gen_device_info *devinfo = p->devinfo;
1499   const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->info.stage);
1500   bool debug_flag = INTEL_DEBUG &
1501      intel_debug_flag_for_shader_stage(nir->info.stage);
1502   struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
1503   int spill_count = 0, fill_count = 0;
1504   int loop_count = 0;
1505
1506   foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
1507      struct brw_reg src[3], dst;
1508
1509      if (unlikely(debug_flag))
1510         disasm_annotate(disasm_info, inst, p->next_insn_offset);
1511
1512      for (unsigned int i = 0; i < 3; i++) {
1513         src[i] = inst->src[i].as_brw_reg();
1514      }
1515      dst = inst->dst.as_brw_reg();
1516
1517      brw_set_default_predicate_control(p, inst->predicate);
1518      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1519      brw_set_default_flag_reg(p, inst->flag_subreg / 2, inst->flag_subreg % 2);
1520      brw_set_default_saturate(p, inst->saturate);
1521      brw_set_default_mask_control(p, inst->force_writemask_all);
1522      brw_set_default_acc_write_control(p, inst->writes_accumulator);
1523
1524      assert(inst->group % inst->exec_size == 0);
1525      assert(inst->group % 4 == 0);
1526
1527      /* There are some instructions where the destination is 64-bit
1528       * but we retype it to a smaller type. In that case, we cannot
1529       * double the exec_size.
1530       */
1531      const bool is_df = (get_exec_type_size(inst) == 8 ||
1532                          inst->dst.type == BRW_REGISTER_TYPE_DF) &&
1533                         inst->opcode != VEC4_OPCODE_PICK_LOW_32BIT &&
1534                         inst->opcode != VEC4_OPCODE_PICK_HIGH_32BIT &&
1535                         inst->opcode != VEC4_OPCODE_SET_LOW_32BIT &&
1536                         inst->opcode != VEC4_OPCODE_SET_HIGH_32BIT;
1537
1538      unsigned exec_size = inst->exec_size;
1539      if (devinfo->gen == 7 && !devinfo->is_haswell && is_df)
1540         exec_size *= 2;
1541
1542      brw_set_default_exec_size(p, cvt(exec_size) - 1);
1543
1544      if (!inst->force_writemask_all)
1545         brw_set_default_group(p, inst->group);
1546
1547      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
1548      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
1549
1550      unsigned pre_emit_nr_insn = p->nr_insn;
1551
1552      switch (inst->opcode) {
1553      case VEC4_OPCODE_UNPACK_UNIFORM:
1554      case BRW_OPCODE_MOV:
1555         brw_MOV(p, dst, src[0]);
1556         break;
1557      case BRW_OPCODE_ADD:
1558         brw_ADD(p, dst, src[0], src[1]);
1559         break;
1560      case BRW_OPCODE_MUL:
1561         brw_MUL(p, dst, src[0], src[1]);
1562         break;
1563      case BRW_OPCODE_MACH:
1564         brw_MACH(p, dst, src[0], src[1]);
1565         break;
1566
1567      case BRW_OPCODE_MAD:
1568         assert(devinfo->gen >= 6);
1569         brw_MAD(p, dst, src[0], src[1], src[2]);
1570         break;
1571
1572      case BRW_OPCODE_FRC:
1573         brw_FRC(p, dst, src[0]);
1574         break;
1575      case BRW_OPCODE_RNDD:
1576         brw_RNDD(p, dst, src[0]);
1577         break;
1578      case BRW_OPCODE_RNDE:
1579         brw_RNDE(p, dst, src[0]);
1580         break;
1581      case BRW_OPCODE_RNDZ:
1582         brw_RNDZ(p, dst, src[0]);
1583         break;
1584
1585      case BRW_OPCODE_AND:
1586         brw_AND(p, dst, src[0], src[1]);
1587         break;
1588      case BRW_OPCODE_OR:
1589         brw_OR(p, dst, src[0], src[1]);
1590         break;
1591      case BRW_OPCODE_XOR:
1592         brw_XOR(p, dst, src[0], src[1]);
1593         break;
1594      case BRW_OPCODE_NOT:
1595         brw_NOT(p, dst, src[0]);
1596         break;
1597      case BRW_OPCODE_ASR:
1598         brw_ASR(p, dst, src[0], src[1]);
1599         break;
1600      case BRW_OPCODE_SHR:
1601         brw_SHR(p, dst, src[0], src[1]);
1602         break;
1603      case BRW_OPCODE_SHL:
1604         brw_SHL(p, dst, src[0], src[1]);
1605         break;
1606
1607      case BRW_OPCODE_CMP:
1608         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1609         break;
1610      case BRW_OPCODE_SEL:
1611         brw_SEL(p, dst, src[0], src[1]);
1612         break;
1613
1614      case BRW_OPCODE_DPH:
1615         brw_DPH(p, dst, src[0], src[1]);
1616         break;
1617
1618      case BRW_OPCODE_DP4:
1619         brw_DP4(p, dst, src[0], src[1]);
1620         break;
1621
1622      case BRW_OPCODE_DP3:
1623         brw_DP3(p, dst, src[0], src[1]);
1624         break;
1625
1626      case BRW_OPCODE_DP2:
1627         brw_DP2(p, dst, src[0], src[1]);
1628         break;
1629
1630      case BRW_OPCODE_F32TO16:
1631         assert(devinfo->gen >= 7);
1632         brw_F32TO16(p, dst, src[0]);
1633         break;
1634
1635      case BRW_OPCODE_F16TO32:
1636         assert(devinfo->gen >= 7);
1637         brw_F16TO32(p, dst, src[0]);
1638         break;
1639
1640      case BRW_OPCODE_LRP:
1641         assert(devinfo->gen >= 6);
1642         brw_LRP(p, dst, src[0], src[1], src[2]);
1643         break;
1644
1645      case BRW_OPCODE_BFREV:
1646         assert(devinfo->gen >= 7);
1647         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1648                   retype(src[0], BRW_REGISTER_TYPE_UD));
1649         break;
1650      case BRW_OPCODE_FBH:
1651         assert(devinfo->gen >= 7);
1652         brw_FBH(p, retype(dst, src[0].type), src[0]);
1653         break;
1654      case BRW_OPCODE_FBL:
1655         assert(devinfo->gen >= 7);
1656         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
1657                 retype(src[0], BRW_REGISTER_TYPE_UD));
1658         break;
1659      case BRW_OPCODE_LZD:
1660         brw_LZD(p, dst, src[0]);
1661         break;
1662      case BRW_OPCODE_CBIT:
1663         assert(devinfo->gen >= 7);
1664         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
1665                  retype(src[0], BRW_REGISTER_TYPE_UD));
1666         break;
1667      case BRW_OPCODE_ADDC:
1668         assert(devinfo->gen >= 7);
1669         brw_ADDC(p, dst, src[0], src[1]);
1670         break;
1671      case BRW_OPCODE_SUBB:
1672         assert(devinfo->gen >= 7);
1673         brw_SUBB(p, dst, src[0], src[1]);
1674         break;
1675      case BRW_OPCODE_MAC:
1676         brw_MAC(p, dst, src[0], src[1]);
1677         break;
1678
1679      case BRW_OPCODE_BFE:
1680         assert(devinfo->gen >= 7);
1681         brw_BFE(p, dst, src[0], src[1], src[2]);
1682         break;
1683
1684      case BRW_OPCODE_BFI1:
1685         assert(devinfo->gen >= 7);
1686         brw_BFI1(p, dst, src[0], src[1]);
1687         break;
1688      case BRW_OPCODE_BFI2:
1689         assert(devinfo->gen >= 7);
1690         brw_BFI2(p, dst, src[0], src[1], src[2]);
1691         break;
1692
1693      case BRW_OPCODE_IF:
1694         if (!inst->src[0].is_null()) {
1695            /* The instruction has an embedded compare (only allowed on gen6) */
1696            assert(devinfo->gen == 6);
1697            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1698         } else {
1699            brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8);
1700            brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate);
1701         }
1702         break;
1703
1704      case BRW_OPCODE_ELSE:
1705         brw_ELSE(p);
1706         break;
1707      case BRW_OPCODE_ENDIF:
1708         brw_ENDIF(p);
1709         break;
1710
1711      case BRW_OPCODE_DO:
1712         brw_DO(p, BRW_EXECUTE_8);
1713         break;
1714
1715      case BRW_OPCODE_BREAK:
1716         brw_BREAK(p);
1717         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1718         break;
1719      case BRW_OPCODE_CONTINUE:
1720         brw_CONT(p);
1721         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1722         break;
1723
1724      case BRW_OPCODE_WHILE:
1725         brw_WHILE(p);
1726         loop_count++;
1727         break;
1728
1729      case SHADER_OPCODE_RCP:
1730      case SHADER_OPCODE_RSQ:
1731      case SHADER_OPCODE_SQRT:
1732      case SHADER_OPCODE_EXP2:
1733      case SHADER_OPCODE_LOG2:
1734      case SHADER_OPCODE_SIN:
1735      case SHADER_OPCODE_COS:
1736         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1737         if (devinfo->gen >= 7) {
1738            gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
1739                      brw_null_reg());
1740         } else if (devinfo->gen == 6) {
1741            generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
1742         } else {
1743            generate_math1_gen4(p, inst, dst, src[0]);
1744         }
1745         break;
1746
1747      case SHADER_OPCODE_POW:
1748      case SHADER_OPCODE_INT_QUOTIENT:
1749      case SHADER_OPCODE_INT_REMAINDER:
1750         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1751         if (devinfo->gen >= 7) {
1752            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
1753         } else if (devinfo->gen == 6) {
1754            generate_math_gen6(p, inst, dst, src[0], src[1]);
1755         } else {
1756            generate_math2_gen4(p, inst, dst, src[0], src[1]);
1757         }
1758         break;
1759
1760      case SHADER_OPCODE_TEX:
1761      case SHADER_OPCODE_TXD:
1762      case SHADER_OPCODE_TXF:
1763      case SHADER_OPCODE_TXF_CMS:
1764      case SHADER_OPCODE_TXF_CMS_W:
1765      case SHADER_OPCODE_TXF_MCS:
1766      case SHADER_OPCODE_TXL:
1767      case SHADER_OPCODE_TXS:
1768      case SHADER_OPCODE_TG4:
1769      case SHADER_OPCODE_TG4_OFFSET:
1770      case SHADER_OPCODE_SAMPLEINFO:
1771         generate_tex(p, prog_data, nir->info.stage,
1772                      inst, dst, src[0], src[1], src[2]);
1773         break;
1774
1775      case SHADER_OPCODE_GET_BUFFER_SIZE:
1776         generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
1777         break;
1778
1779      case VS_OPCODE_URB_WRITE:
1780         generate_vs_urb_write(p, inst);
1781         break;
1782
1783      case SHADER_OPCODE_GEN4_SCRATCH_READ:
1784         generate_scratch_read(p, inst, dst, src[0]);
1785         fill_count++;
1786         break;
1787
1788      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1789         generate_scratch_write(p, inst, dst, src[0], src[1]);
1790         spill_count++;
1791         break;
1792
1793      case VS_OPCODE_PULL_CONSTANT_LOAD:
1794         generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
1795         break;
1796
1797      case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
1798         generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
1799         break;
1800
1801      case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
1802         generate_set_simd4x2_header_gen9(p, inst, dst);
1803         break;
1804
1805      case GS_OPCODE_URB_WRITE:
1806         generate_gs_urb_write(p, inst);
1807         break;
1808
1809      case GS_OPCODE_URB_WRITE_ALLOCATE:
1810         generate_gs_urb_write_allocate(p, inst);
1811         break;
1812
1813      case GS_OPCODE_SVB_WRITE:
1814         generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
1815         break;
1816
1817      case GS_OPCODE_SVB_SET_DST_INDEX:
1818         generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
1819         break;
1820
1821      case GS_OPCODE_THREAD_END:
1822         generate_gs_thread_end(p, inst);
1823         break;
1824
1825      case GS_OPCODE_SET_WRITE_OFFSET:
1826         generate_gs_set_write_offset(p, dst, src[0], src[1]);
1827         break;
1828
1829      case GS_OPCODE_SET_VERTEX_COUNT:
1830         generate_gs_set_vertex_count(p, dst, src[0]);
1831         break;
1832
1833      case GS_OPCODE_FF_SYNC:
1834         generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
1835         break;
1836
1837      case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
1838         generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
1839         break;
1840
1841      case GS_OPCODE_SET_PRIMITIVE_ID:
1842         generate_gs_set_primitive_id(p, dst);
1843         break;
1844
1845      case GS_OPCODE_SET_DWORD_2:
1846         generate_gs_set_dword_2(p, dst, src[0]);
1847         break;
1848
1849      case GS_OPCODE_PREPARE_CHANNEL_MASKS:
1850         generate_gs_prepare_channel_masks(p, dst);
1851         break;
1852
1853      case GS_OPCODE_SET_CHANNEL_MASKS:
1854         generate_gs_set_channel_masks(p, dst, src[0]);
1855         break;
1856
1857      case GS_OPCODE_GET_INSTANCE_ID:
1858         generate_gs_get_instance_id(p, dst);
1859         break;
1860
1861      case SHADER_OPCODE_SHADER_TIME_ADD:
1862         brw_shader_time_add(p, src[0],
1863                             prog_data->base.binding_table.shader_time_start);
1864         break;
1865
1866      case VEC4_OPCODE_UNTYPED_ATOMIC:
1867         assert(src[2].file == BRW_IMMEDIATE_VALUE);
1868         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
1869                            !inst->dst.is_null(), inst->header_size);
1870         break;
1871
1872      case VEC4_OPCODE_UNTYPED_SURFACE_READ:
1873         assert(!inst->header_size);
1874         assert(src[2].file == BRW_IMMEDIATE_VALUE);
1875         brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
1876                                  src[2].ud);
1877         break;
1878
1879      case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
1880         assert(src[2].file == BRW_IMMEDIATE_VALUE);
1881         brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
1882                                   src[2].ud, inst->header_size);
1883         break;
1884
1885      case SHADER_OPCODE_MEMORY_FENCE:
1886         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false);
1887         break;
1888
1889      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
1890         const struct brw_reg mask =
1891            brw_stage_has_packed_dispatch(devinfo, nir->info.stage,
1892                                          &prog_data->base) ? brw_imm_ud(~0u) :
1893            brw_dmask_reg();
1894         brw_find_live_channel(p, dst, mask);
1895         break;
1896      }
1897
1898      case SHADER_OPCODE_BROADCAST:
1899         assert(inst->force_writemask_all);
1900         brw_broadcast(p, dst, src[0], src[1]);
1901         break;
1902
1903      case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
1904         generate_unpack_flags(p, dst);
1905         break;
1906
1907      case VEC4_OPCODE_MOV_BYTES: {
1908         /* Moves the low byte from each channel, using an Align1 access mode
1909          * and a <4,1,0> source region.
1910          */
1911         assert(src[0].type == BRW_REGISTER_TYPE_UB ||
1912                src[0].type == BRW_REGISTER_TYPE_B);
1913
1914         brw_set_default_access_mode(p, BRW_ALIGN_1);
1915         src[0].vstride = BRW_VERTICAL_STRIDE_4;
1916         src[0].width = BRW_WIDTH_1;
1917         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
1918         brw_MOV(p, dst, src[0]);
1919         brw_set_default_access_mode(p, BRW_ALIGN_16);
1920         break;
1921      }
1922
1923      case VEC4_OPCODE_DOUBLE_TO_F32:
1924      case VEC4_OPCODE_DOUBLE_TO_D32:
1925      case VEC4_OPCODE_DOUBLE_TO_U32: {
1926         assert(type_sz(src[0].type) == 8);
1927         assert(type_sz(dst.type) == 8);
1928
1929         brw_reg_type dst_type;
1930
1931         switch (inst->opcode) {
1932         case VEC4_OPCODE_DOUBLE_TO_F32:
1933            dst_type = BRW_REGISTER_TYPE_F;
1934            break;
1935         case VEC4_OPCODE_DOUBLE_TO_D32:
1936            dst_type = BRW_REGISTER_TYPE_D;
1937            break;
1938         case VEC4_OPCODE_DOUBLE_TO_U32:
1939            dst_type = BRW_REGISTER_TYPE_UD;
1940            break;
1941         default:
1942            unreachable("Not supported conversion");
1943         }
1944         dst = retype(dst, dst_type);
1945
1946         brw_set_default_access_mode(p, BRW_ALIGN_1);
1947
         /* When converting from DF->F, we set the destination's stride to 2
          * as an alignment requirement. But on IVB/BYT, each DF implicitly
          * writes two floats, the first one being the converted value, so we
          * don't need to set stride 2 explicitly; stride 1 is enough.
          */
1953         struct brw_reg spread_dst;
1954         if (devinfo->gen == 7 && !devinfo->is_haswell)
1955            spread_dst = stride(dst, 8, 4, 1);
1956         else
1957            spread_dst = stride(dst, 8, 4, 2);
1958
1959         brw_MOV(p, spread_dst, src[0]);
1960
1961         brw_set_default_access_mode(p, BRW_ALIGN_16);
1962         break;
1963      }
1964
1965      case VEC4_OPCODE_TO_DOUBLE: {
1966         assert(type_sz(src[0].type) == 4);
1967         assert(type_sz(dst.type) == 8);
1968
1969         brw_set_default_access_mode(p, BRW_ALIGN_1);
1970
1971         brw_MOV(p, dst, src[0]);
1972
1973         brw_set_default_access_mode(p, BRW_ALIGN_16);
1974         break;
1975      }
1976
1977      case VEC4_OPCODE_PICK_LOW_32BIT:
1978      case VEC4_OPCODE_PICK_HIGH_32BIT: {
1979         /* Stores the low/high 32-bit of each 64-bit element in src[0] into
1980          * dst using ALIGN1 mode and a <8,4,2>:UD region on the source.
1981          */
1982         assert(type_sz(src[0].type) == 8);
1983         assert(type_sz(dst.type) == 4);
1984
1985         brw_set_default_access_mode(p, BRW_ALIGN_1);
1986
1987         dst = retype(dst, BRW_REGISTER_TYPE_UD);
1988         dst.hstride = BRW_HORIZONTAL_STRIDE_1;
1989
1990         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
1991         if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
1992            src[0] = suboffset(src[0], 1);
1993         src[0] = spread(src[0], 2);
1994         brw_MOV(p, dst, src[0]);
1995
1996         brw_set_default_access_mode(p, BRW_ALIGN_16);
1997         break;
1998      }
1999
2000      case VEC4_OPCODE_SET_LOW_32BIT:
2001      case VEC4_OPCODE_SET_HIGH_32BIT: {
2002         /* Reads consecutive 32-bit elements from src[0] and writes
2003          * them to the low/high 32-bit of each 64-bit element in dst.
2004          */
2005         assert(type_sz(src[0].type) == 4);
2006         assert(type_sz(dst.type) == 8);
2007
2008         brw_set_default_access_mode(p, BRW_ALIGN_1);
2009
2010         dst = retype(dst, BRW_REGISTER_TYPE_UD);
2011         if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT)
2012            dst = suboffset(dst, 1);
2013         dst.hstride = BRW_HORIZONTAL_STRIDE_2;
2014
2015         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
2016         brw_MOV(p, dst, src[0]);
2017
2018         brw_set_default_access_mode(p, BRW_ALIGN_16);
2019         break;
2020      }
2021
2022      case VEC4_OPCODE_PACK_BYTES: {
2023         /* Is effectively:
2024          *
2025          *   mov(8) dst<16,4,1>:UB src<4,1,0>:UB
2026          *
2027          * but destinations' only regioning is horizontal stride, so instead we
2028          * have to use two instructions:
2029          *
2030          *   mov(4) dst<1>:UB     src<4,1,0>:UB
2031          *   mov(4) dst.16<1>:UB  src.16<4,1,0>:UB
2032          *
2033          * where they pack the four bytes from the low and high four DW.
2034          */
2035         assert(_mesa_is_pow_two(dst.writemask) &&
2036                dst.writemask != 0);
2037         unsigned offset = __builtin_ctz(dst.writemask);
2038
2039         dst.type = BRW_REGISTER_TYPE_UB;
2040
2041         brw_set_default_access_mode(p, BRW_ALIGN_1);
2042
2043         src[0].type = BRW_REGISTER_TYPE_UB;
2044         src[0].vstride = BRW_VERTICAL_STRIDE_4;
2045         src[0].width = BRW_WIDTH_1;
2046         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
2047         dst.subnr = offset * 4;
2048         struct brw_inst *insn = brw_MOV(p, dst, src[0]);
2049         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
2050         brw_inst_set_no_dd_clear(p->devinfo, insn, true);
2051         brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);
2052
2053         src[0].subnr = 16;
2054         dst.subnr = 16 + offset * 4;
2055         insn = brw_MOV(p, dst, src[0]);
2056         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
2057         brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
2058         brw_inst_set_no_dd_check(p->devinfo, insn, true);
2059
2060         brw_set_default_access_mode(p, BRW_ALIGN_16);
2061         break;
2062      }
2063
2064      case TCS_OPCODE_URB_WRITE:
2065         generate_tcs_urb_write(p, inst, src[0]);
2066         break;
2067
2068      case VEC4_OPCODE_URB_READ:
2069         generate_vec4_urb_read(p, inst, dst, src[0]);
2070         break;
2071
2072      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
2073         generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
2074         break;
2075
2076      case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
2077         generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
2078         break;
2079
2080      case TCS_OPCODE_GET_INSTANCE_ID:
2081         generate_tcs_get_instance_id(p, dst);
2082         break;
2083
2084      case TCS_OPCODE_GET_PRIMITIVE_ID:
2085         generate_tcs_get_primitive_id(p, dst);
2086         break;
2087
2088      case TCS_OPCODE_CREATE_BARRIER_HEADER:
2089         generate_tcs_create_barrier_header(p, prog_data, dst);
2090         break;
2091
2092      case TES_OPCODE_CREATE_INPUT_READ_HEADER:
2093         generate_tes_create_input_read_header(p, dst);
2094         break;
2095
2096      case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
2097         generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
2098         break;
2099
2100      case TES_OPCODE_GET_PRIMITIVE_ID:
2101         generate_tes_get_primitive_id(p, dst);
2102         break;
2103
2104      case TCS_OPCODE_SRC0_010_IS_ZERO:
2105         /* If src_reg had stride like fs_reg, we wouldn't need this. */
2106         brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
2107         break;
2108
2109      case TCS_OPCODE_RELEASE_INPUT:
2110         generate_tcs_release_input(p, dst, src[0], src[1]);
2111         break;
2112
2113      case TCS_OPCODE_THREAD_END:
2114         generate_tcs_thread_end(p, inst);
2115         break;
2116
2117      case SHADER_OPCODE_BARRIER:
2118         brw_barrier(p, src[0]);
2119         brw_WAIT(p);
2120         break;
2121
2122      case SHADER_OPCODE_MOV_INDIRECT:
2123         generate_mov_indirect(p, inst, dst, src[0], src[1]);
2124         break;
2125
2126      case BRW_OPCODE_DIM:
2127         assert(devinfo->is_haswell);
2128         assert(src[0].type == BRW_REGISTER_TYPE_DF);
2129         assert(dst.type == BRW_REGISTER_TYPE_DF);
2130         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
2131         break;
2132
2133      default:
2134         unreachable("Unsupported opcode");
2135      }
2136
2137      if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
2138         /* Handled dependency hints in the generator. */
2139
2140         assert(!inst->conditional_mod);
2141      } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2142         assert(p->nr_insn == pre_emit_nr_insn + 1 ||
2143                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2144                 "emitting more than 1 instruction");
2145
2146         brw_inst *last = &p->store[pre_emit_nr_insn];
2147
2148         if (inst->conditional_mod)
2149            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
2150         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
2151         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
2152      }
2153   }
2154
2155   brw_set_uip_jip(p, 0);
2156
2157   /* end of program sentinel */
2158   disasm_new_inst_group(disasm_info, p->next_insn_offset);
2159
2160#ifndef NDEBUG
2161   bool validated =
2162#else
2163   if (unlikely(debug_flag))
2164#endif
2165      brw_validate_instructions(devinfo, p->store,
2166                                0, p->next_insn_offset,
2167                                disasm_info);
2168
2169   int before_size = p->next_insn_offset;
2170   brw_compact_instructions(p, 0, disasm_info);
2171   int after_size = p->next_insn_offset;
2172
2173   if (unlikely(debug_flag)) {
2174      fprintf(stderr, "Native code for %s %s shader %s:\n",
2175              nir->info.label ? nir->info.label : "unnamed",
2176              _mesa_shader_stage_to_string(nir->info.stage), nir->info.name);
2177
2178      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
2179                      "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
2180              stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
2181              spill_count, fill_count, before_size, after_size,
2182              100.0f * (before_size - after_size) / before_size);
2183
2184      dump_assembly(p->store, disasm_info);
2185   }
2186   ralloc_free(disasm_info);
2187   assert(validated);
2188
2189   compiler->shader_debug_log(log_data,
2190                              "%s vec4 shader: %d inst, %d loops, %u cycles, "
2191                              "%d:%d spills:fills, compacted %d to %d bytes.",
2192                              stage_abbrev, before_size / 16,
2193                              loop_count, cfg->cycle_count, spill_count,
2194                              fill_count, before_size, after_size);
2195
2196}
2197
2198extern "C" const unsigned *
2199brw_vec4_generate_assembly(const struct brw_compiler *compiler,
2200                           void *log_data,
2201                           void *mem_ctx,
2202                           const nir_shader *nir,
2203                           struct brw_vue_prog_data *prog_data,
2204                           const struct cfg_t *cfg)
2205{
2206   struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
2207   brw_init_codegen(compiler->devinfo, p, mem_ctx);
2208   brw_set_default_access_mode(p, BRW_ALIGN_16);
2209
2210   generate_code(p, compiler, log_data, nir, prog_data, cfg);
2211
2212   return brw_get_program(p, &prog_data->base.program_size);
2213}
2214