/* brw_eu_emit.c revision b8e80941 */
1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
32
33#include "brw_eu_defines.h"
34#include "brw_eu.h"
35
36#include "util/ralloc.h"
37
38/**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case.  This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45void
46gen6_resolve_implied_move(struct brw_codegen *p,
47			  struct brw_reg *src,
48			  unsigned msg_reg_nr)
49{
50   const struct gen_device_info *devinfo = p->devinfo;
51   if (devinfo->gen < 6)
52      return;
53
54   if (src->file == BRW_MESSAGE_REGISTER_FILE)
55      return;
56
57   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58      brw_push_insn_state(p);
59      brw_set_default_exec_size(p, BRW_EXECUTE_8);
60      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
61      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
62      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
63	      retype(*src, BRW_REGISTER_TYPE_UD));
64      brw_pop_insn_state(p);
65   }
66   *src = brw_message_reg(msg_reg_nr);
67}
68
69static void
70gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
71{
72   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
73    * "The send with EOT should use register space R112-R127 for <src>. This is
74    *  to enable loading of a new thread into the same slot while the message
75    *  with EOT for current thread is pending dispatch."
76    *
77    * Since we're pretending to have 16 MRFs anyway, we may as well use the
78    * registers required for messages with EOT.
79    */
80   const struct gen_device_info *devinfo = p->devinfo;
81   if (devinfo->gen >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
82      reg->file = BRW_GENERAL_REGISTER_FILE;
83      reg->nr += GEN7_MRF_HACK_START;
84   }
85}
86
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Handles the compact SENDS/SENDSC destination encoding as well as the
 * regular encoding (direct and register-indirect addressing, Align1 and
 * Align16 access modes).  May also shrink the instruction's exec size to
 * match a narrow destination when p->automatic_exec_sizes is set.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the file's limits. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where if the destination is Byte,
    * the instruction needs to have a stride of 2 (except for packed byte
    * MOV). This seems to be required even if the destination is the NULL
    * register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gen7_convert_mrf_to_grf(p, &dest);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Split-send uses a compact destination encoding: direct addressing
       * only, 16-byte-aligned subregister, no modifiers, unit stride.
       */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            /* Destinations cannot be scalar; promote stride 0 to 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            /* Align16 subregisters are encoded in 16-byte units. */
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             *    Although Dst.HorzStride is a don't care for Align16, HW needs
             *    this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            /* Destinations cannot be scalar; promote stride 0 to 1. */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->gen >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
190
/**
 * Encode \p reg as the first source operand (src0) of \p inst.
 *
 * Handles immediates, the compact SENDS/SENDSC encoding, and the regular
 * encoding (direct/indirect addressing, Align1 regions and Align16
 * swizzles).  For sub-8-byte immediates, src1's file/type fields are also
 * written to mirror src0's hardware type.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the file's limits. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (devinfo->gen >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Split-send src0 uses a compact encoding: GRF only, direct
       * addressing, 16-byte-aligned subregister, no modifiers.
       */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             reg.vstride == reg.width + 1);
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* 64-bit immediates occupy the bits normally used by src1. */
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         if (type_sz(reg.type) < 8) {
            /* Mirror src0's hardware type into the src1 fields.
             * NOTE(review): presumably a hardware requirement for
             * non-present src1 operands — confirm against the PRM.
             */
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
                brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               /* Align16 subregisters are encoded in 16-byte units. */
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A SIMD1 access of a <0;1,0> scalar region. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
311
312
/**
 * Encode \p reg as the second source operand (src1) of \p inst.
 *
 * Handles the compact SENDS/SENDSC encoding, 32-bit immediates, and the
 * regular direct encoding (Align1 regions and Align16 swizzles).  src1
 * cannot be the accumulator, an MRF, or indirectly addressed.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Split-send src1 uses a compact encoding: direct addressing,
       * subregister 0, unit stride, no modifiers.
       */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             reg.vstride == reg.width + 1);
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *    operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gen7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            /* Align16 subregisters are encoded in 16-byte units. */
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A SIMD1 access of a <0;1,0> scalar region. */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
414
/**
 * Specify the descriptor and extended descriptor immediate for a SEND(C)
 * message instruction.
 *
 * \param desc     message descriptor immediate, placed in src1 as a UD.
 * \param ex_desc  extended descriptor; only encoded on Gen9+, where the
 *                 instruction has dedicated ex_desc bits.
 */
void
brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
                unsigned desc, unsigned ex_desc)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* This encoding applies only to plain SEND/SENDC (not split sends). */
   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
   /* The descriptor occupies src1 as an immediate. */
   brw_inst_set_src1_file_type(devinfo, inst,
                               BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
   brw_inst_set_send_desc(devinfo, inst, desc);
   if (devinfo->gen >= 9)
      brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
}
432
433static void brw_set_math_message( struct brw_codegen *p,
434				  brw_inst *inst,
435				  unsigned function,
436				  unsigned integer_type,
437				  bool low_precision,
438				  unsigned dataType )
439{
440   const struct gen_device_info *devinfo = p->devinfo;
441   unsigned msg_length;
442   unsigned response_length;
443
444   /* Infer message length from the function */
445   switch (function) {
446   case BRW_MATH_FUNCTION_POW:
447   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
448   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
449   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
450      msg_length = 2;
451      break;
452   default:
453      msg_length = 1;
454      break;
455   }
456
457   /* Infer response length from the function */
458   switch (function) {
459   case BRW_MATH_FUNCTION_SINCOS:
460   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
461      response_length = 2;
462      break;
463   default:
464      response_length = 1;
465      break;
466   }
467
468   brw_set_desc(p, inst, brw_message_desc(
469                   devinfo, msg_length, response_length, false));
470
471   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
472   brw_inst_set_math_msg_function(devinfo, inst, function);
473   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
474   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
475   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
476   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
477   brw_inst_set_saturate(devinfo, inst, 0);
478}
479
480
/* Fill out the descriptor for an FF_SYNC URB message (single payload
 * register, header present).  Used by pre-Gen7 geometry stages.
 */
static void brw_set_ff_sync_message(struct brw_codegen *p,
				    brw_inst *insn,
				    bool allocate,
				    unsigned response_length,
				    bool end_of_thread)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* mlen = 1, header present. */
   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, end_of_thread);
   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
   brw_inst_set_urb_allocate(devinfo, insn, allocate);
   /* The following fields are not used by FF_SYNC: */
   brw_inst_set_urb_global_offset(devinfo, insn, 0);
   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
   brw_inst_set_urb_used(devinfo, insn, 0);
   brw_inst_set_urb_complete(devinfo, insn, 0);
}
502
/* Fill out the descriptor for a URB write message.  Several descriptor
 * fields moved or disappeared across generations, hence the gen checks.
 */
static void brw_set_urb_message( struct brw_codegen *p,
				 brw_inst *insn,
                                 enum brw_urb_write_flags flags,
				 unsigned msg_length,
				 unsigned response_length,
				 unsigned offset,
				 unsigned swizzle_control )
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Reject flag/swizzle combinations the target generation can't encode. */
   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));

   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, msg_length, response_length, true));

   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));

   if (flags & BRW_URB_WRITE_OWORD) {
      assert(msg_length == 2); /* header + one OWORD of data */
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
   } else {
      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
   }

   brw_inst_set_urb_global_offset(devinfo, insn, offset);
   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);

   /* The "complete" bit exists only before Gen8. */
   if (devinfo->gen < 8) {
      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
   }

   /* allocate/used exist before Gen7; per-slot offset from Gen7 on. */
   if (devinfo->gen < 7) {
      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
   } else {
      brw_inst_set_urb_per_slot_offset(devinfo, insn,
         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
   }
}
545
/* Fill out the descriptor for a Gen7+ data-port scratch block read/write.
 *
 * \param write                 true for a write, false for a read.
 * \param dword                 true for DWord-scattered access, false for OWord block.
 * \param invalidate_after_read invalidate the cache line after reading.
 * \param num_regs              registers transferred (1/2/4, plus 8 on Gen8+).
 * \param addr_offset           HWord-granular scratch offset.
 */
static void
gen7_set_dp_scratch_message(struct brw_codegen *p,
                            brw_inst *inst,
                            bool write,
                            bool dword,
                            bool invalidate_after_read,
                            unsigned num_regs,
                            unsigned addr_offset,
                            unsigned mlen,
                            unsigned rlen,
                            bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
          (devinfo->gen >= 8 && num_regs == 8));
   /* The block-size field encoding changed on Gen8: log2 of the register
    * count, versus count-minus-one on Gen7.
    */
   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
                                num_regs - 1);

   brw_set_desc(p, inst, brw_message_desc(
                   devinfo, mlen, rlen, header_present));

   brw_inst_set_sfid(devinfo, inst, GEN7_SFID_DATAPORT_DATA_CACHE);
   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
   brw_inst_set_scratch_read_write(devinfo, inst, write);
   brw_inst_set_scratch_type(devinfo, inst, dword);
   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
}
575
/* Copy the generator's default instruction state (exec size, predication,
 * flag register, etc.) into a freshly allocated instruction.
 */
static void
brw_inst_set_state(const struct gen_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* flag_subreg packs both the flag register number (on Gen7+) and the
    * subregister; 3-src Align16 instructions encode them in different bits.
    */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->gen >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->gen >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
604
605#define next_insn brw_next_insn
606brw_inst *
607brw_next_insn(struct brw_codegen *p, unsigned opcode)
608{
609   const struct gen_device_info *devinfo = p->devinfo;
610   brw_inst *insn;
611
612   if (p->nr_insn + 1 > p->store_size) {
613      p->store_size <<= 1;
614      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
615   }
616
617   p->next_insn_offset += 16;
618   insn = &p->store[p->nr_insn++];
619
620   memset(insn, 0, sizeof(*insn));
621   brw_inst_set_opcode(devinfo, insn, opcode);
622
623   /* Apply the default instruction state */
624   brw_inst_set_state(devinfo, insn, p->current);
625
626   return insn;
627}
628
629static brw_inst *
630brw_alu1(struct brw_codegen *p, unsigned opcode,
631         struct brw_reg dest, struct brw_reg src)
632{
633   brw_inst *insn = next_insn(p, opcode);
634   brw_set_dest(p, insn, dest);
635   brw_set_src0(p, insn, src);
636   return insn;
637}
638
639static brw_inst *
640brw_alu2(struct brw_codegen *p, unsigned opcode,
641         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
642{
643   /* 64-bit immediates are only supported on 1-src instructions */
644   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
645   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
646
647   brw_inst *insn = next_insn(p, opcode);
648   brw_set_dest(p, insn, dest);
649   brw_set_src0(p, insn, src0);
650   brw_set_src1(p, insn, src1);
651   return insn;
652}
653
654static int
655get_3src_subreg_nr(struct brw_reg reg)
656{
657   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
658    * use 32-bit units (components 0..7).  Since they only support F/D/UD
659    * types, this doesn't lose any flexibility, but uses fewer bits.
660    */
661   return reg.subnr / 4;
662}
663
664static enum gen10_align1_3src_vertical_stride
665to_3src_align1_vstride(enum brw_vertical_stride vstride)
666{
667   switch (vstride) {
668   case BRW_VERTICAL_STRIDE_0:
669      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
670   case BRW_VERTICAL_STRIDE_2:
671      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
672   case BRW_VERTICAL_STRIDE_4:
673      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
674   case BRW_VERTICAL_STRIDE_8:
675   case BRW_VERTICAL_STRIDE_16:
676      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
677   default:
678      unreachable("invalid vstride");
679   }
680}
681
682
683static enum gen10_align1_3src_src_horizontal_stride
684to_3src_align1_hstride(enum brw_horizontal_stride hstride)
685{
686   switch (hstride) {
687   case BRW_HORIZONTAL_STRIDE_0:
688      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
689   case BRW_HORIZONTAL_STRIDE_1:
690      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
691   case BRW_HORIZONTAL_STRIDE_2:
692      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
693   case BRW_HORIZONTAL_STRIDE_4:
694      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
695   default:
696      unreachable("invalid hstride");
697   }
698}
699
700static brw_inst *
701brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
702         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
703{
704   const struct gen_device_info *devinfo = p->devinfo;
705   brw_inst *inst = next_insn(p, opcode);
706
707   gen7_convert_mrf_to_grf(p, &dest);
708
709   assert(dest.nr < 128);
710   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
711   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
712   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
713   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
714   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
715   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
716   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
717
718   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
719      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
720             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
721
722      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
723         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
724                                           BRW_ALIGN1_3SRC_ACCUMULATOR);
725         brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
726      } else {
727         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
728                                           BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
729         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
730      }
731      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);
732
733      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);
734
735      if (brw_reg_type_is_floating_point(dest.type)) {
736         brw_inst_set_3src_a1_exec_type(devinfo, inst,
737                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
738      } else {
739         brw_inst_set_3src_a1_exec_type(devinfo, inst,
740                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
741      }
742
743      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
744      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
745      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
746      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);
747
748      brw_inst_set_3src_a1_src0_vstride(devinfo, inst,
749                                        to_3src_align1_vstride(src0.vstride));
750      brw_inst_set_3src_a1_src1_vstride(devinfo, inst,
751                                        to_3src_align1_vstride(src1.vstride));
752      /* no vstride on src2 */
753
754      brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
755                                        to_3src_align1_hstride(src0.hstride));
756      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
757                                        to_3src_align1_hstride(src1.hstride));
758      brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
759                                        to_3src_align1_hstride(src2.hstride));
760
761      brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
762      if (src0.type == BRW_REGISTER_TYPE_NF) {
763         brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
764      } else {
765         brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
766      }
767      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
768      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
769
770      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
771      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
772         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
773      } else {
774         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
775      }
776      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
777      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
778
779      brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
780      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
781      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
782      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
783
784      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
785             src0.file == BRW_IMMEDIATE_VALUE ||
786             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
787              src0.type == BRW_REGISTER_TYPE_NF));
788      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
789             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
790      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
791             src2.file == BRW_IMMEDIATE_VALUE);
792
793      brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
794                                         src0.file == BRW_GENERAL_REGISTER_FILE ?
795                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
796                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
797      brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
798                                         src1.file == BRW_GENERAL_REGISTER_FILE ?
799                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
800                                         BRW_ALIGN1_3SRC_ACCUMULATOR);
801      brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
802                                         src2.file == BRW_GENERAL_REGISTER_FILE ?
803                                         BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
804                                         BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
805   } else {
806      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
807             dest.file == BRW_MESSAGE_REGISTER_FILE);
808      assert(dest.type == BRW_REGISTER_TYPE_F  ||
809             dest.type == BRW_REGISTER_TYPE_DF ||
810             dest.type == BRW_REGISTER_TYPE_D  ||
811             dest.type == BRW_REGISTER_TYPE_UD ||
812             (dest.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 8));
813      if (devinfo->gen == 6) {
814         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
815                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
816      }
817      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
818      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 16);
819      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);
820
821      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
822      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
823      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
824      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
825      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
826      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
827      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
828                                          src0.vstride == BRW_VERTICAL_STRIDE_0);
829
830      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
831      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
832      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
833      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
834      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
835      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
836      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
837                                          src1.vstride == BRW_VERTICAL_STRIDE_0);
838
839      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
840      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
841      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
842      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
843      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
844      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
845      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
846                                          src2.vstride == BRW_VERTICAL_STRIDE_0);
847
848      if (devinfo->gen >= 7) {
849         /* Set both the source and destination types based on dest.type,
850          * ignoring the source register types.  The MAD and LRP emitters ensure
851          * that all four types are float.  The BFE and BFI2 emitters, however,
852          * may send us mixed D and UD types and want us to ignore that and use
853          * the destination type.
854          */
855         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
856         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);
857
858         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
859          *
860          *    "Three source instructions can use operands with mixed-mode
861          *     precision. When SrcType field is set to :f or :hf it defines
862          *     precision for source 0 only, and fields Src1Type and Src2Type
863          *     define precision for other source operands:
864          *
865          *     0b = :f. Single precision Float (32-bit).
866          *     1b = :hf. Half precision Float (16-bit)."
867          */
868         if (src1.type == BRW_REGISTER_TYPE_HF)
869            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);
870
871         if (src2.type == BRW_REGISTER_TYPE_HF)
872            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
873      }
874   }
875
876   return inst;
877}
878
879
880/***********************************************************************
881 * Convenience routines.
882 */
/* Define brw_<OP>: a thin wrapper that emits a one-source ALU instruction. */
#define ALU1(OP)                                        \
brw_inst *brw_##OP(struct brw_codegen *p,               \
                   struct brw_reg dest,                 \
                   struct brw_reg src0)                 \
{                                                       \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);     \
}
890
/* Define brw_<OP>: a thin wrapper that emits a two-source ALU instruction. */
#define ALU2(OP)                                                \
brw_inst *brw_##OP(struct brw_codegen *p,                       \
                   struct brw_reg dest,                         \
                   struct brw_reg src0,                         \
                   struct brw_reg src1)                         \
{                                                               \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);       \
}
899
/* Define brw_<OP>: emit a three-source ALU instruction.  In align16 mode a
 * scalar source (vstride == 0) is forced to an XXXX swizzle so that its
 * single channel is replicated across all components before encoding.
 */
#define ALU3(OP)                                                        \
brw_inst *brw_##OP(struct brw_codegen *p,                               \
                   struct brw_reg dest,                                 \
                   struct brw_reg src0,                                 \
                   struct brw_reg src1,                                 \
                   struct brw_reg src2)                                 \
{                                                                       \
   if (p->current->access_mode == BRW_ALIGN_16) {                       \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src0.swizzle = BRW_SWIZZLE_XXXX;                               \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src1.swizzle = BRW_SWIZZLE_XXXX;                               \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src2.swizzle = BRW_SWIZZLE_XXXX;                               \
   }                                                                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);         \
}
917
/* Define brw_<OP>: like ALU3, but restricted to floating-point operands.
 * All three sources must match the destination type, which must be either
 * F (single) or DF (double).
 */
#define ALU3F(OP)                                                       \
brw_inst *brw_##OP(struct brw_codegen *p,                               \
                   struct brw_reg dest,                                 \
                   struct brw_reg src0,                                 \
                   struct brw_reg src1,                                 \
                   struct brw_reg src2)                                 \
{                                                                       \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                           \
          dest.type == BRW_REGISTER_TYPE_DF);                           \
   if (dest.type == BRW_REGISTER_TYPE_F) {                              \
      assert(src0.type == BRW_REGISTER_TYPE_F);                         \
      assert(src1.type == BRW_REGISTER_TYPE_F);                         \
      assert(src2.type == BRW_REGISTER_TYPE_F);                         \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                        \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                        \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                        \
   }                                                                    \
                                                                        \
   if (p->current->access_mode == BRW_ALIGN_16) {                       \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src0.swizzle = BRW_SWIZZLE_XXXX;                               \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src1.swizzle = BRW_SWIZZLE_XXXX;                               \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                        \
         src2.swizzle = BRW_SWIZZLE_XXXX;                               \
   }                                                                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);         \
}
947
948/* Rounding operations (other than RNDD) require two instructions - the first
949 * stores a rounded value (possibly the wrong way) in the dest register, but
950 * also sets a per-channel "increment bit" in the flag register.  A predicated
951 * add of 1.0 fixes dest to contain the desired result.
952 *
953 * Sandybridge and later appear to round correctly without an ADD.
954 */
#define ROUND(OP)                                                       \
void brw_##OP(struct brw_codegen *p,                                    \
              struct brw_reg dest,                                      \
              struct brw_reg src)                                       \
{                                                                       \
   const struct gen_device_info *devinfo = p->devinfo;                  \
   brw_inst *rnd = next_insn(p, BRW_OPCODE_##OP);                       \
   brw_set_dest(p, rnd, dest);                                          \
   brw_set_src0(p, rnd, src);                                           \
                                                                        \
   if (devinfo->gen < 6) {                                              \
      /* turn on round-increments */                                    \
      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);      \
      brw_inst *add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));          \
      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);    \
   }                                                                    \
}
973
974
/* Logic, shift, and select operations. */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU3(CSEL)
/* Arithmetic and multiply-accumulate operations. */
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
/* Dot products and three-source arithmetic. */
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(MAD)
ALU3F(LRP)
/* Bit-manipulation operations. */
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)

/* Two-instruction rounding sequences on pre-gen6 (see ROUND above). */
ROUND(RNDZ)
ROUND(RNDE)
1008
1009brw_inst *
1010brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1011{
1012   const struct gen_device_info *devinfo = p->devinfo;
1013
1014   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1015    * To avoid the problems that causes, we use an <X,2,0> source region to
1016    * read each element twice.
1017    */
1018   if (devinfo->gen == 7 && !devinfo->is_haswell &&
1019       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1020       dest.type == BRW_REGISTER_TYPE_DF &&
1021       (src0.type == BRW_REGISTER_TYPE_F ||
1022        src0.type == BRW_REGISTER_TYPE_D ||
1023        src0.type == BRW_REGISTER_TYPE_UD) &&
1024       !has_scalar_region(src0)) {
1025      assert(src0.vstride == src0.width + src0.hstride);
1026      src0.vstride = src0.hstride;
1027      src0.width = BRW_WIDTH_2;
1028      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1029   }
1030
1031   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1032}
1033
1034brw_inst *
1035brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1036        struct brw_reg src0, struct brw_reg src1)
1037{
1038   /* 6.2.2: add */
1039   if (src0.type == BRW_REGISTER_TYPE_F ||
1040       (src0.file == BRW_IMMEDIATE_VALUE &&
1041	src0.type == BRW_REGISTER_TYPE_VF)) {
1042      assert(src1.type != BRW_REGISTER_TYPE_UD);
1043      assert(src1.type != BRW_REGISTER_TYPE_D);
1044   }
1045
1046   if (src1.type == BRW_REGISTER_TYPE_F ||
1047       (src1.file == BRW_IMMEDIATE_VALUE &&
1048	src1.type == BRW_REGISTER_TYPE_VF)) {
1049      assert(src0.type != BRW_REGISTER_TYPE_UD);
1050      assert(src0.type != BRW_REGISTER_TYPE_D);
1051   }
1052
1053   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1054}
1055
1056brw_inst *
1057brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1058        struct brw_reg src0, struct brw_reg src1)
1059{
1060   assert(dest.type == src0.type);
1061   assert(src0.type == src1.type);
1062   switch (src0.type) {
1063   case BRW_REGISTER_TYPE_B:
1064   case BRW_REGISTER_TYPE_UB:
1065   case BRW_REGISTER_TYPE_W:
1066   case BRW_REGISTER_TYPE_UW:
1067   case BRW_REGISTER_TYPE_D:
1068   case BRW_REGISTER_TYPE_UD:
1069      break;
1070   default:
1071      unreachable("Bad type for brw_AVG");
1072   }
1073
1074   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1075}
1076
1077brw_inst *
1078brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1079        struct brw_reg src0, struct brw_reg src1)
1080{
1081   /* 6.32.38: mul */
1082   if (src0.type == BRW_REGISTER_TYPE_D ||
1083       src0.type == BRW_REGISTER_TYPE_UD ||
1084       src1.type == BRW_REGISTER_TYPE_D ||
1085       src1.type == BRW_REGISTER_TYPE_UD) {
1086      assert(dest.type != BRW_REGISTER_TYPE_F);
1087   }
1088
1089   if (src0.type == BRW_REGISTER_TYPE_F ||
1090       (src0.file == BRW_IMMEDIATE_VALUE &&
1091	src0.type == BRW_REGISTER_TYPE_VF)) {
1092      assert(src1.type != BRW_REGISTER_TYPE_UD);
1093      assert(src1.type != BRW_REGISTER_TYPE_D);
1094   }
1095
1096   if (src1.type == BRW_REGISTER_TYPE_F ||
1097       (src1.file == BRW_IMMEDIATE_VALUE &&
1098	src1.type == BRW_REGISTER_TYPE_VF)) {
1099      assert(src0.type != BRW_REGISTER_TYPE_UD);
1100      assert(src0.type != BRW_REGISTER_TYPE_D);
1101   }
1102
1103   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1104	  src0.nr != BRW_ARF_ACCUMULATOR);
1105   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1106	  src1.nr != BRW_ARF_ACCUMULATOR);
1107
1108   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1109}
1110
1111brw_inst *
1112brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1113         struct brw_reg src0, struct brw_reg src1)
1114{
1115   src0.vstride = BRW_VERTICAL_STRIDE_0;
1116   src0.width = BRW_WIDTH_1;
1117   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1118   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1119}
1120
1121brw_inst *
1122brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1123        struct brw_reg src0, struct brw_reg src1)
1124{
1125   src0.vstride = BRW_VERTICAL_STRIDE_0;
1126   src0.width = BRW_WIDTH_1;
1127   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1128   src1.vstride = BRW_VERTICAL_STRIDE_8;
1129   src1.width = BRW_WIDTH_8;
1130   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1131   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1132}
1133
/**
 * Emit a 32-bit-float to 16-bit-float conversion of \p src into \p dst.
 *
 * On Gen8+ this is a converting MOV to an HF destination; on Gen7 it is the
 * dedicated F32TO16 instruction.  When \p dst is UD and the hardware does
 * not zero the high 16 bits itself, a second MOV writes zeros into the odd
 * W channels of the destination.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gen8 implementation in terms of a
    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->gen >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* View the UD destination as every-other-W so the conversion lands in
       * the low halves and the zero fill can target the high halves.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->gen >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->gen == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* The two instructions write disjoint halves of the same registers, so
       * suppress dependency bookkeeping between them.
       */
      brw_inst_set_no_dd_clear(devinfo, inst, true);
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1180
/**
 * Emit a 16-bit-float to 32-bit-float conversion of \p src into \p dst.
 *
 * On Gen8+ this is a converting MOV from an HF source; on Gen7 it is the
 * dedicated F16TO32 instruction.
 */
brw_inst *
brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct gen_device_info *devinfo = p->devinfo;
   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;

   if (align16) {
      assert(src.type == BRW_REGISTER_TYPE_UD);
   } else {
      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
       *
       *   Because this instruction does not have a 16-bit floating-point
       *   type, the source data type must be Word (W). The destination type
       *   must be F (Float).
       */
      if (src.type == BRW_REGISTER_TYPE_UD)
         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

      assert(src.type == BRW_REGISTER_TYPE_W ||
             src.type == BRW_REGISTER_TYPE_UW ||
             src.type == BRW_REGISTER_TYPE_HF);
   }

   if (devinfo->gen >= 8) {
      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
   } else {
      assert(devinfo->gen == 7);
      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
   }
}
1211
1212
1213void brw_NOP(struct brw_codegen *p)
1214{
1215   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1216   memset(insn, 0, sizeof(*insn));
1217   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1218}
1219
1220
1221
1222
1223
1224/***********************************************************************
1225 * Comparisons, if/else/endif
1226 */
1227
1228brw_inst *
1229brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1230         unsigned predicate_control)
1231{
1232   const struct gen_device_info *devinfo = p->devinfo;
1233   struct brw_reg ip = brw_ip_reg();
1234   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1235
1236   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1237   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1238   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1239   brw_inst_set_pred_control(devinfo, inst, predicate_control);
1240
1241   return inst;
1242}
1243
1244static void
1245push_if_stack(struct brw_codegen *p, brw_inst *inst)
1246{
1247   p->if_stack[p->if_stack_depth] = inst - p->store;
1248
1249   p->if_stack_depth++;
1250   if (p->if_stack_array_size <= p->if_stack_depth) {
1251      p->if_stack_array_size *= 2;
1252      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1253			     p->if_stack_array_size);
1254   }
1255}
1256
1257static brw_inst *
1258pop_if_stack(struct brw_codegen *p)
1259{
1260   p->if_stack_depth--;
1261   return &p->store[p->if_stack[p->if_stack_depth]];
1262}
1263
1264static void
1265push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1266{
1267   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1268      p->loop_stack_array_size *= 2;
1269      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1270			       p->loop_stack_array_size);
1271      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1272				     p->loop_stack_array_size);
1273   }
1274
1275   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1276   p->loop_stack_depth++;
1277   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1278}
1279
1280static brw_inst *
1281get_inner_do_insn(struct brw_codegen *p)
1282{
1283   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1284}
1285
1286/* EU takes the value from the flag register and pushes it onto some
1287 * sort of a stack (presumably merging with any flag value already on
1288 * the stack).  Within an if block, the flags at the top of the stack
1289 * control execution on each channel of the unit, eg. on each of the
1290 * 16 pixel values in our wm programs.
1291 *
1292 * When the matching 'else' instruction is reached (presumably by
1293 * countdown of the instruction count patched in by our ELSE/ENDIF
1294 * functions), the relevant flags are inverted.
1295 *
1296 * When the matching 'endif' instruction is reached, the flags are
1297 * popped off.  If the stack is now empty, normal execution resumes.
1298 */
/**
 * Emit an IF instruction with the given execution size and push it on the
 * if-stack so brw_ELSE/brw_ENDIF can patch its jump targets later.
 *
 * The operand encoding differs per generation; the jump fields are left
 * zero here and filled in by patch_IF_ELSE().
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->gen < 6) {
      /* Pre-gen6: IF operates on IP with an immediate jump offset. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP fields encode the jump targets. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: single immediate source; JIP/UIP encode the targets. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control causes a thread switch unless in SPF mode. */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1342
1343/* This function is only used for gen6-style IF instructions with an
1344 * embedded comparison (conditional modifier).  It is not used on gen7.
1345 */
/**
 * Emit a gen6 IF with an embedded comparison of \p src0 and \p src1 under
 * \p conditional, and push it on the if-stack for later patching.
 */
brw_inst *
gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
	struct brw_reg src0, struct brw_reg src1)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* The jump count (destination immediate) is patched in by brw_ENDIF. */
   brw_set_dest(p, insn, brw_imm_w(0));
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   brw_inst_set_gen6_jump_count(devinfo, insn, 0);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   /* The comparison replaces predication; none may be set by default. */
   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
   brw_inst_set_cond_modifier(devinfo, insn, conditional);

   push_if_stack(p, insn);
   return insn;
}
1368
1369/**
1370 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1371 */
/* Rewrite a matched IF/ELSE pair as predicated ADDs to the instruction
 * pointer, for single-program-flow mode.  Jump distances are in bytes
 * (16 bytes per instruction).
 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* IF jumps just past the ELSE; the (unpredicated) ELSE jumps to the
       * end of the block.  Offsets are byte counts (insn delta * 16).
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1409
1410/**
1411 * Patch IF and ELSE instructions with appropriate jump targets.
1412 */
/* Fill in the jump targets of a matched IF / (optional ELSE) / ENDIF
 * triple.  Jump distances are scaled by brw_jump_scale(devinfo), and the
 * encoding differs per generation: gen4/5 jump counts, gen6 jump count in
 * the destination immediate, gen7+ JIP/UIP fields.
 */
static void
patch_IF_ELSE(struct brw_codegen *p,
              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (devinfo->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);

   unsigned br = brw_jump_scale(devinfo);

   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (devinfo->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (endif_inst - if_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
      } else {
         /* Gen7+: both JIP and UIP point to the ENDIF. */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
      }
   } else {
      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));

      /* Patch IF -> ELSE */
      if (devinfo->gen < 6) {
         brw_inst_set_gen4_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst));
         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
      } else if (devinfo->gen == 6) {
         brw_inst_set_gen6_jump_count(devinfo, if_inst,
                                      br * (else_inst - if_inst + 1));
      }

      /* Patch ELSE -> ENDIF */
      if (devinfo->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
         brw_inst_set_gen4_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst + 1));
         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
      } else if (devinfo->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         brw_inst_set_gen6_jump_count(devinfo, else_inst,
                                      br * (endif_inst - else_inst));
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
         if (devinfo->gen >= 8) {
            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
             * should point to ENDIF.
             */
            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
         }
      }
   }
}
1500
/**
 * Emit an ELSE instruction and push it on the if-stack; its jump fields are
 * left zero and patched later by brw_ENDIF via patch_IF_ELSE().
 */
void
brw_ELSE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (devinfo->gen < 6) {
      /* Pre-gen6: ELSE operates on IP with an immediate jump offset. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      /* Gen6: the jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      /* Gen7: JIP/UIP fields encode the jump targets. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* Gen8+: single immediate source; JIP/UIP encode the targets. */
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control causes a thread switch unless in SPF mode. */
   if (!p->single_program_flow && devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
}
1538
/**
 * Emit an ENDIF instruction and patch the jump fields of the matching IF
 * (and optional ELSE) popped from the if-stack.
 *
 * On Gen4/Gen5 in single-program-flow mode no ENDIF is emitted at all:
 * the IF/ELSE pair is rewritten into plain ADDs on IP instead.
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (devinfo->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encoding for ENDIF. */
   if (devinfo->gen < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->gen == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   /* Pre-gen6 flow control requires a thread switch. */
   if (devinfo->gen < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->gen < 6) {
      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
   } else if (devinfo->gen == 6) {
      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   /* Now that the ENDIF's location is known, fix up the IF/ELSE jumps. */
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1619
/**
 * Emit a BREAK instruction.
 *
 * The jump target is left as zero here; pre-gen6 it is filled in later by
 * brw_patch_break_cont() when the enclosing WHILE is emitted (gen6+ targets
 * are handled elsewhere — see the brw_patch_break_cont comment below).
 */
brw_inst *
brw_BREAK(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (devinfo->gen >= 8) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else if (devinfo->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      /* Gen4-5: BREAK operates on IP, and the pop count records how many
       * nested IFs (within the current loop) must be popped on the way out.
       */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   return insn;
}
1646
/**
 * Emit a CONTINUE instruction.
 *
 * As with BREAK, the jump target starts at zero and is patched for
 * pre-gen6 by brw_patch_break_cont() when the WHILE is emitted.
 */
brw_inst *
brw_CONT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   }

   if (devinfo->gen < 6) {
      /* Pop any IFs nested inside the current loop when continuing. */
      brw_inst_set_gen4_pop_count(devinfo, insn,
                                  p->if_depth_in_loop[p->loop_stack_depth]);
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1670
/**
 * Emit a HALT instruction (gen6+ only).
 *
 * UIP/JIP are emitted as zero and are expected to be updated later by the
 * caller (see the src1 comment below).
 */
brw_inst *
gen6_HALT(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   if (devinfo->gen >= 8) {
      brw_set_src0(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
   return insn;
}
1690
1691/* DO/WHILE loop:
1692 *
1693 * The DO/WHILE is just an unterminated loop -- break or continue are
1694 * used for control within the loop.  We have a few ways they can be
1695 * done.
1696 *
1697 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1698 * jip and no DO instruction.
1699 *
1700 * For non-uniform control flow pre-gen6, there's a DO instruction to
1701 * push the mask, and a WHILE to jump back, and BREAK to get out and
1702 * pop the mask.
1703 *
1704 * For gen6, there's no more mask stack, so no need for DO.  WHILE
1705 * just points back to the first instruction of the loop.
1706 */
/**
 * Open a DO/WHILE loop (see the block comment above for the per-generation
 * strategy).  Returns a pointer to the loop's first instruction slot so the
 * matching brw_WHILE() can compute its backward jump.
 */
brw_inst *
brw_DO(struct brw_codegen *p, unsigned execute_size)
{
   const struct gen_device_info *devinfo = p->devinfo;

   if (devinfo->gen >= 6 || p->single_program_flow) {
      /* No DO instruction is emitted; just remember where the loop body
       * starts so brw_WHILE() can jump back to it.
       */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
      brw_inst_set_exec_size(devinfo, insn, execute_size);
      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);

      return insn;
   }
}
1733
1734/**
1735 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1736 * instruction here.
1737 *
1738 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1739 * nesting, since it can always just point to the end of the block/current loop.
1740 */
1741static void
1742brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1743{
1744   const struct gen_device_info *devinfo = p->devinfo;
1745   brw_inst *do_inst = get_inner_do_insn(p);
1746   brw_inst *inst;
1747   unsigned br = brw_jump_scale(devinfo);
1748
1749   assert(devinfo->gen < 6);
1750
1751   for (inst = while_inst - 1; inst != do_inst; inst--) {
1752      /* If the jump count is != 0, that means that this instruction has already
1753       * been patched because it's part of a loop inside of the one we're
1754       * patching.
1755       */
1756      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1757          brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1758         brw_inst_set_gen4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1759      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1760                 brw_inst_gen4_jump_count(devinfo, inst) == 0) {
1761         brw_inst_set_gen4_jump_count(devinfo, inst, br * (while_inst - inst));
1762      }
1763   }
1764}
1765
/**
 * Emit the WHILE closing the loop opened by brw_DO().
 *
 * Gen6+ encodes a backward jump to the remembered DO location.  Pre-gen6
 * either emits a real WHILE (patching any BREAK/CONT inside the body via
 * brw_patch_break_cont()) or, in single-program-flow mode, a plain ADD on
 * IP.  Pops one level off the loop stack.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      if (devinfo->gen >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->gen == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
	 /* SPF: the loop jump is just an ADD on IP (16 bytes/instruction). */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

         /* Backward jump to just past the DO instruction. */
         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gen4_pop_count(devinfo, insn, 0);

	 brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1827
1828/* FORWARD JUMPS:
1829 */
1830void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1831{
1832   const struct gen_device_info *devinfo = p->devinfo;
1833   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1834   unsigned jmpi = 1;
1835
1836   if (devinfo->gen >= 5)
1837      jmpi = 2;
1838
1839   assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1840   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1841
1842   brw_inst_set_gen4_jump_count(devinfo, jmp_insn,
1843                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
1844}
1845
1846/* To integrate with the above, it makes sense that the comparison
1847 * instruction should populate the flag register.  It might be simpler
1848 * just to use the flag reg for most WM tasks?
1849 */
1850void brw_CMP(struct brw_codegen *p,
1851	     struct brw_reg dest,
1852	     unsigned conditional,
1853	     struct brw_reg src0,
1854	     struct brw_reg src1)
1855{
1856   const struct gen_device_info *devinfo = p->devinfo;
1857   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1858
1859   brw_inst_set_cond_modifier(devinfo, insn, conditional);
1860   brw_set_dest(p, insn, dest);
1861   brw_set_src0(p, insn, src0);
1862   brw_set_src1(p, insn, src1);
1863
1864   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1865    * page says:
1866    *    "Any CMP instruction with a null destination must use a {switch}."
1867    *
1868    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
1869    * mentioned on their work-arounds pages.
1870    */
1871   if (devinfo->gen == 7) {
1872      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1873          dest.nr == BRW_ARF_NULL) {
1874         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1875      }
1876   }
1877}
1878
1879/***********************************************************************
1880 * Helpers for the various SEND message types:
1881 */
1882
1883/** Extended math function, float[8].
1884 */
1885void gen4_math(struct brw_codegen *p,
1886	       struct brw_reg dest,
1887	       unsigned function,
1888	       unsigned msg_reg_nr,
1889	       struct brw_reg src,
1890	       unsigned precision )
1891{
1892   const struct gen_device_info *devinfo = p->devinfo;
1893   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
1894   unsigned data_type;
1895   if (has_scalar_region(src)) {
1896      data_type = BRW_MATH_DATA_SCALAR;
1897   } else {
1898      data_type = BRW_MATH_DATA_VECTOR;
1899   }
1900
1901   assert(devinfo->gen < 6);
1902
1903   /* Example code doesn't set predicate_control for send
1904    * instructions.
1905    */
1906   brw_inst_set_pred_control(devinfo, insn, 0);
1907   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
1908
1909   brw_set_dest(p, insn, dest);
1910   brw_set_src0(p, insn, src);
1911   brw_set_math_message(p,
1912                        insn,
1913                        function,
1914                        src.type == BRW_REGISTER_TYPE_D,
1915                        precision,
1916                        data_type);
1917}
1918
1919void gen6_math(struct brw_codegen *p,
1920	       struct brw_reg dest,
1921	       unsigned function,
1922	       struct brw_reg src0,
1923	       struct brw_reg src1)
1924{
1925   const struct gen_device_info *devinfo = p->devinfo;
1926   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);
1927
1928   assert(devinfo->gen >= 6);
1929
1930   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
1931          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
1932
1933   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1934   if (devinfo->gen == 6) {
1935      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1936      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1937   }
1938
1939   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1940       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1941       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1942      assert(src0.type != BRW_REGISTER_TYPE_F);
1943      assert(src1.type != BRW_REGISTER_TYPE_F);
1944      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
1945             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
1946   } else {
1947      assert(src0.type == BRW_REGISTER_TYPE_F ||
1948             (src0.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
1949      assert(src1.type == BRW_REGISTER_TYPE_F ||
1950             (src1.type == BRW_REGISTER_TYPE_HF && devinfo->gen >= 9));
1951   }
1952
1953   /* Source modifiers are ignored for extended math instructions on Gen6. */
1954   if (devinfo->gen == 6) {
1955      assert(!src0.negate);
1956      assert(!src0.abs);
1957      assert(!src1.negate);
1958      assert(!src1.abs);
1959   }
1960
1961   brw_inst_set_math_function(devinfo, insn, function);
1962
1963   brw_set_dest(p, insn, dest);
1964   brw_set_src0(p, insn, src0);
1965   brw_set_src1(p, insn, src1);
1966}
1967
1968/**
1969 * Return the right surface index to access the thread scratch space using
1970 * stateless dataport messages.
1971 */
1972unsigned
1973brw_scratch_surface_idx(const struct brw_codegen *p)
1974{
1975   /* The scratch space is thread-local so IA coherency is unnecessary. */
1976   if (p->devinfo->gen >= 8)
1977      return GEN8_BTI_STATELESS_NON_COHERENT;
1978   else
1979      return BRW_BTI_STATELESS;
1980}
1981
1982/**
1983 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1984 * using a constant offset per channel.
1985 *
1986 * The offset must be aligned to oword size (16 bytes).  Used for
1987 * register spilling.
1988 */
void brw_oword_block_write_scratch(struct brw_codegen *p,
				   struct brw_reg mrf,
				   int num_regs,
				   unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   uint32_t msg_type;

   /* Gen6+ takes the scratch offset in oword (16-byte) units. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Message length: one header register plus the payload registers. */
   const unsigned mlen = 1 + num_regs;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   /* Emit the actual scratch-write SEND. */
   {
      struct brw_reg dest;
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      brw_inst_set_compression(devinfo, insn, false);

      if (brw_inst_exec_size(devinfo, insn) >= 16)
	 src_header = vec16(src_header);

      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
      if (devinfo->gen < 6)
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (devinfo->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (devinfo->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                     msg_type, 0, /* not a render target */
                                     send_commit_msg));
   }
}
2087
2088
2089/**
2090 * Read a block of owords (half a GRF each) from the scratch buffer
2091 * using a constant index per channel.
2092 *
2093 * Offset must be aligned to oword size (16 bytes).  Used for register
2094 * spilling.
2095 */
void
brw_oword_block_read_scratch(struct brw_codegen *p,
			     struct brw_reg dest,
			     struct brw_reg mrf,
			     int num_regs,
			     unsigned offset)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* Gen6+ takes the scratch offset in oword (16-byte) units. */
   if (devinfo->gen >= 6)
      offset /= 16;

   if (p->devinfo->gen >= 7) {
      /* On gen 7 and above, we no longer have message registers and we can
       * send from any register we want.  By using the destination register
       * for the message, we guarantee that the implied message write won't
       * accidentally overwrite anything.  This has been a problem because
       * the MRF registers and source for the final FB write are both fixed
       * and may overlap.
       */
      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
   } else {
      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   }
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   const unsigned rlen = num_regs;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_READ);

   /* Build the message header: a copy of g0 with the scratch offset in
    * element 2.
    */
   {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   /* Emit the actual scratch-read SEND. */
   {
      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

      brw_inst_set_sfid(devinfo, insn, target_cache);
      assert(brw_inst_pred_control(devinfo, insn) == 0);
      brw_inst_set_compression(devinfo, insn, false);

      brw_set_dest(p, insn, dest);	/* UW? */
      if (devinfo->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
      }

      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, 1, rlen, true) |
                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   }
}
2166
/**
 * Read num_regs full registers back from the gen7+ scratch space into dest.
 *
 * Only a g0 header is sent (mlen 1); the scratch base comes from g0.5 and
 * the HWord offset is encoded in the message descriptor.
 */
void
gen7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gen7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2200
2201/**
2202 * Read float[4] vectors from the data port constant cache.
2203 * Location (in buffer) should be a multiple of 16.
2204 * Used for fetching shader constants.
2205 */
void brw_oword_block_read(struct brw_codegen *p,
			  struct brw_reg dest,
			  struct brw_reg mrf,
			  uint32_t offset,
			  uint32_t bind_table_index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
       BRW_SFID_DATAPORT_READ);
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);

   /* On newer hardware, offset is in units of owords. */
   if (devinfo->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* Header setup must not be predicated, compressed, or masked. */
   brw_push_insn_state(p);
   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   brw_MOV(p,
	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
			       mrf.nr,
			       2), BRW_REGISTER_TYPE_UD),
	   brw_imm_ud(offset));
   brw_pop_insn_state(p);

   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);

   brw_inst_set_sfid(devinfo, insn, target_cache);

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (devinfo->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
   }

   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
                brw_dp_read_desc(devinfo, bind_table_index,
                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));

   brw_pop_insn_state(p);
}
2266
/**
 * Emit a render-target write message (SENDC on gen6+, SEND before that).
 *
 * \param payload              color payload (gen6+) or MRF base (pre-gen6)
 * \param implied_header       header source for the pre-gen6 implied move
 * \param last_render_target   set for the last RT write of the shader
 * \param eot                  sets the End-Of-Thread bit on the message
 */
brw_inst *
brw_fb_WRITE(struct brw_codegen *p,
             struct brw_reg payload,
             struct brw_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   unsigned msg_type;
   struct brw_reg dest, src0;

   /* The write has no real destination; use a null reg sized to match the
    * current execution width.
    */
   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      /* Pre-gen6: payload lives in MRFs; the header comes via implied move. */
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_dp_write_desc(devinfo, binding_table_index, msg_control,
                                  msg_type, last_render_target,
                                  0 /* send_commit_msg */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}
2325
/**
 * Emit a gen9+ render-target read message (SENDC to the render cache).
 *
 * The message subtype encodes the execution width (0 for SIMD16, 1
 * otherwise), and the RT slot group is derived from the default quarter
 * group.
 */
brw_inst *
gen9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(devinfo->gen >= 9);
   const unsigned msg_subtype =
      brw_get_default_exec_size(p) == BRW_EXECUTE_16 ? 0 : 1;
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GEN6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_dp_read_desc(devinfo, binding_table_index,
                       per_sample << 5 | msg_subtype,
                       GEN9_DATAPORT_RC_RENDER_TARGET_READ,
                       BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}
2355
2356/**
2357 * Texture sample instruction.
2358 * Note: the msg_type plus msg_length values determine exactly what kind
2359 * of sampling operation is performed.  See volume 4, page 161 of docs.
2360 */
void brw_SAMPLE(struct brw_codegen *p,
		struct brw_reg dest,
		unsigned msg_reg_nr,
		struct brw_reg src0,
		unsigned binding_table_index,
		unsigned sampler,
		unsigned msg_type,
		unsigned response_length,
		unsigned msg_length,
		unsigned header_present,
		unsigned simd_mode,
		unsigned return_format)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   /* A msg_reg_nr of -1 means the caller needs no pre-gen6 implied move. */
   if (msg_reg_nr != -1)
      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */

   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
    *
    *    "Instruction compression is not allowed for this instruction (that
    *     is, send). The hardware behavior is undefined if this instruction is
    *     set as compressed. However, compress control can be set to "SecHalf"
    *     to affect the EMask generation."
    *
    * No similar wording is found in later PRMs, but there are examples
    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
    */
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_sampler_desc(devinfo, binding_table_index, sampler,
                                 msg_type, simd_mode, return_format));
}
2409
2410/* Adjust the message header's sampler state pointer to
2411 * select the correct group of 16 samplers.
2412 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct gen_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         assert(devinfo->is_haswell || devinfo->gen >= 8);
         /* Add the byte offset of the selected group of 16 sampler states
          * to the inherited state pointer in header dword 3 (from g0.3).
          */
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->gen < 8 && !devinfo->is_haswell) {
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* (index & 0xf0) << 4 == (index / 16) * 16 * sampler_state_size,
       * i.e. the same byte offset computed in the immediate case above.
       */
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
   }
}
2455
2456/* All these variables are pretty confusing - we might be better off
2457 * using bitmasks and macros for this, in the old style.  Or perhaps
2458 * just having the caller instantiate the fields in dword3 itself.
2459 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header:
       * OR 0xff00 into dword 5 of the header (scalar, WE_all) so all
       * channel-mask bits are set when the caller didn't provide masks.
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->gen));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-Gen6, the payload lives in the MRF; record its base register. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2507
/* Emit a SEND with a message descriptor that may come from a register.
 * When \p desc is an immediate the descriptor is encoded directly into the
 * instruction; otherwise it is ORed with \p desc_imm into address register
 * a0.0, which the SEND then reads as its descriptor (src1).
 */
void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* The OR below must be scalar, unpredicated and unmasked so it writes
       * a0.0 exactly once regardless of the current execution state.
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2554
/* Emit a SENDS (split-payload send) instruction.  The regular descriptor
 * may be immediate or loaded into a0.0, and the extended descriptor may be
 * immediate or loaded into a0.2; the instruction's register-select bits are
 * set accordingly for each.
 */
void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      /* Fold the caller's extra bits straight into the immediate. */
      desc.ud |= desc_imm;
   } else {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      ex_desc.ud |= ex_desc_imm;
   } else {
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register.  If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      brw_OR(p, addr, ex_desc, brw_imm_ud(ex_desc_imm | sfid | eot << 5));

      brw_pop_insn_state(p);
      ex_desc = addr;
   }

   send = next_insn(p, BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      /* The register form of the descriptor must be a0.0 exactly. */
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_send_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      /* The register form of the extended descriptor must be a dword-aligned
       * subregister of the address register.
       */
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2651
/* Emit a surface-access SEND whose binding table index may come from a
 * register.  A non-immediate surface index is masked to 8 bits and moved to
 * a0.0, then brw_send_indirect_message() ORs it into the descriptor.
 */
static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
   }

   brw_send_indirect_message(p, sfid, dst, payload, surface, desc_imm, false);
}
2684
2685static bool
2686while_jumps_before_offset(const struct gen_device_info *devinfo,
2687                          brw_inst *insn, int while_offset, int start_offset)
2688{
2689   int scale = 16 / brw_jump_scale(devinfo);
2690   int jip = devinfo->gen == 6 ? brw_inst_gen6_jump_count(devinfo, insn)
2691                               : brw_inst_jip(devinfo, insn);
2692   assert(jip < 0);
2693   return while_offset + jip * scale <= start_offset;
2694}
2695
2696
/* Scan forward from \p start_offset and return the byte offset of the
 * instruction that ends the innermost enclosing control-flow block (the
 * matching ENDIF/ELSE/HALT, or the WHILE of the enclosing loop), or 0 if
 * none is found before the end of the program.
 */
static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct gen_device_info *devinfo = p->devinfo;

   /* Tracks how many nested IF blocks we have entered during the scan. */
   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop.  Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         /* fallthrough */
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
      }
   }

   return 0;
}
2736
2737/* There is no DO instruction on gen6, so to find the end of the loop
2738 * we have to see if the loop is jumping back before our start
2739 * instruction.
2740 */
2741static int
2742brw_find_loop_end(struct brw_codegen *p, int start_offset)
2743{
2744   const struct gen_device_info *devinfo = p->devinfo;
2745   int offset;
2746   void *store = p->store;
2747
2748   assert(devinfo->gen >= 6);
2749
2750   /* Always start after the instruction (such as a WHILE) we're trying to fix
2751    * up.
2752    */
2753   for (offset = next_offset(devinfo, store, start_offset);
2754        offset < p->next_insn_offset;
2755        offset = next_offset(devinfo, store, offset)) {
2756      brw_inst *insn = store + offset;
2757
2758      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2759	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2760	    return offset;
2761      }
2762   }
2763   assert(!"not reached");
2764   return start_offset;
2765}
2766
2767/* After program generation, go back and update the UIP and JIP of
2768 * BREAK, CONT, and HALT instructions to their correct locations.
2769 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   /* Jump offsets are stored in units of (16 / br) bytes. */
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-Gen6 control flow doesn't use UIP/JIP. */
   if (devinfo->gen < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
         brw_inst_set_uip(devinfo, insn,
	    (brw_find_loop_end(p, offset) - offset +
             (devinfo->gen == 6 ? 16 : 0)) / scale);
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         /* JIP targets the end of the current block, UIP the loop's WHILE. */
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;

      case BRW_OPCODE_ENDIF: {
         /* With no further block end, fall through to the next instruction. */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->gen >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
	 break;
      }

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
	 } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;
      }
   }
}
2839
/* Emit an FF_SYNC urb message (used by pre-Gen6-style fixed-function
 * thread synchronization before URB writes).
 */
void brw_ff_sync(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   unsigned response_length,
		   bool eot)
{
   const struct gen_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-Gen6, the payload lives in the MRF; record its base register. */
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
			   insn,
			   allocate,
			   response_length,
			   eot);
}
2867
2868/**
2869 * Emit the SEND instruction necessary to generate stream output data on Gen6
2870 * (for transform feedback).
2871 *
2872 * If send_commit_msg is true, this is the last piece of stream output data
2873 * from this thread, so send the data as a committed write.  According to the
2874 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2875 *
2876 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2877 *   writes are complete by sending the final write as a committed write."
2878 */
void
brw_svb_write(struct brw_codegen *p,
              struct brw_reg dest,
              unsigned msg_reg_nr,
              struct brw_reg src0,
              unsigned binding_table_index,
              bool   send_commit_msg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* The streamed-vertex-buffer write goes through a different shared
    * function depending on generation.
    */
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   /* A committed write requests a one-register writeback (response length
    * of 1) so the thread can tell when the write has landed.
    */
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, 1, send_commit_msg, true) |
                brw_dp_write_desc(devinfo, binding_table_index,
                                  0, /* msg_control: ignored */
                                  GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                                  0, /* last_render_target: ignored */
                                  send_commit_msg)); /* send_commit_msg */
}
2908
/* Number of registers needed per channel of surface message payload for
 * the given execution size (0 meaning the SIMD4x2 layout).
 */
static unsigned
brw_surface_payload_size(struct brw_codegen *p,
                         unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   /* SIMD4x2 packs everything into a single register. */
   if (exec_size == 0)
      return 1;

   /* One GRF per channel up to SIMD8, two per channel beyond that. */
   const unsigned regs_per_channel = (exec_size <= 8) ? 1 : 2;
   return regs_per_channel * num_channels;
}
2921
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   /* Atomics return at most one channel, so the bool doubles as the
    * channel count for the payload-size computation.
    */
   const unsigned response_length =
      brw_surface_payload_size(p, response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}
2958
void
brw_untyped_surface_read(struct brw_codegen *p,
                         struct brw_reg dst,
                         struct brw_reg payload,
                         struct brw_reg surface,
                         unsigned msg_length,
                         unsigned num_channels)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* Align16 always uses the SIMD4x2 layout (exec_size of 0). */
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
   const unsigned response_length =
      brw_surface_payload_size(p, num_channels, exec_size);
   /* No header; final 'false' selects the read (not write) message. */
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, false) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);

   brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
}
2981
void
brw_untyped_surface_write(struct brw_codegen *p,
                          struct brw_reg payload,
                          struct brw_reg surface,
                          unsigned msg_length,
                          unsigned num_channels,
                          bool header_present)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GEN7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->gen >= 8 || devinfo->is_haswell;
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   /* Writes have no response (length 0); final 'true' selects write. */
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, 0, header_present) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
                                     payload, surface, desc);
}
3008
/* Fill in the descriptor fields of \p insn so it becomes a memory-fence
 * message targeted at the given data-port shared function.
 */
static void
brw_set_memory_fence_message(struct brw_codegen *p,
                             struct brw_inst *insn,
                             enum brw_message_target sfid,
                             bool commit_enable)
{
   const struct gen_device_info *devinfo = p->devinfo;

   /* One payload register; one writeback register iff commit is enabled. */
   brw_set_desc(p, insn, brw_message_desc(
                   devinfo, 1, (commit_enable ? 1 : 0), true));

   brw_inst_set_sfid(devinfo, insn, sfid);

   switch (sfid) {
   case GEN6_SFID_DATAPORT_RENDER_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_RC_MEMORY_FENCE);
      break;
   case GEN7_SFID_DATAPORT_DATA_CACHE:
      brw_inst_set_dp_msg_type(devinfo, insn, GEN7_DATAPORT_DC_MEMORY_FENCE);
      break;
   default:
      unreachable("Not reached");
   }

   /* The commit-enable bit lives at bit 5 of the message control field. */
   if (commit_enable)
      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
}
3036
/* Emit a data-cache memory fence (and, on IVB, a paired render-cache fence),
 * optionally followed by a read of the writeback to stall the thread until
 * the fence completes.
 */
void
brw_memory_fence(struct brw_codegen *p,
                 struct brw_reg dst,
                 struct brw_reg src,
                 enum opcode send_op,
                 bool stall)
{
   const struct gen_device_info *devinfo = p->devinfo;
   /* A commit (writeback) is required when stalling, on Gen10+
    * (HSD ES # 1404612949), and on IVB where the second fence's response
    * is read back below.
    */
   const bool commit_enable = stall ||
      devinfo->gen >= 10 || /* HSD ES # 1404612949 */
      (devinfo->gen == 7 && !devinfo->is_haswell);
   struct brw_inst *insn;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, BRW_EXECUTE_1);
   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);

   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
    * message doesn't write anything back.
    */
   insn = next_insn(p, send_op);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, src);
   brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
                                commit_enable);

   if (devinfo->gen == 7 && !devinfo->is_haswell) {
      /* IVB does typed surface access through the render cache, so we need to
       * flush it too.  Use a different register so both flushes can be
       * pipelined by the hardware.
       */
      insn = next_insn(p, send_op);
      brw_set_dest(p, insn, offset(dst, 1));
      brw_set_src0(p, insn, src);
      brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
                                   commit_enable);

      /* Now write the response of the second message into the response of the
       * first to trigger a pipeline stall -- This way future render and data
       * cache messages will be properly ordered with respect to past data and
       * render cache messages.
       */
      brw_MOV(p, dst, offset(dst, 1));
   }

   /* Reading the fence's writeback into the null register forces the thread
    * to wait for the fence to complete.
    */
   if (stall)
      brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW), dst);

   brw_pop_insn_state(p);
}
3089
3090void
3091brw_pixel_interpolator_query(struct brw_codegen *p,
3092                             struct brw_reg dest,
3093                             struct brw_reg mrf,
3094                             bool noperspective,
3095                             unsigned mode,
3096                             struct brw_reg data,
3097                             unsigned msg_length,
3098                             unsigned response_length)
3099{
3100   const struct gen_device_info *devinfo = p->devinfo;
3101   const uint16_t exec_size = brw_get_default_exec_size(p);
3102   const unsigned slot_group = brw_get_default_group(p) / 16;
3103   const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3104   const unsigned desc =
3105      brw_message_desc(devinfo, msg_length, response_length, false) |
3106      brw_pixel_interp_desc(devinfo, mode, noperspective, simd_mode,
3107                            slot_group);
3108
3109   /* brw_send_indirect_message will automatically use a direct send message
3110    * if data is actually immediate.
3111    */
3112   brw_send_indirect_message(p,
3113                             GEN7_SFID_PIXEL_INTERPOLATOR,
3114                             dest,
3115                             mrf,
3116                             vec1(data),
3117                             desc,
3118                             false);
3119}
3120
3121void
3122brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
3123                      struct brw_reg mask)
3124{
3125   const struct gen_device_info *devinfo = p->devinfo;
3126   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
3127   const unsigned qtr_control = brw_get_default_group(p) / 8;
3128   brw_inst *inst;
3129
3130   assert(devinfo->gen >= 7);
3131   assert(mask.type == BRW_REGISTER_TYPE_UD);
3132
3133   brw_push_insn_state(p);
3134
3135   /* The flag register is only used on Gen7 in align1 mode, so avoid setting
3136    * unnecessary bits in the instruction words, get the information we need
3137    * and reset the default flag register. This allows more instructions to be
3138    * compacted.
3139    */
3140   const unsigned flag_subreg = p->current->flag_subreg;
3141   brw_set_default_flag_reg(p, 0, 0);
3142
3143   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
3144      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3145
3146      if (devinfo->gen >= 8) {
3147         /* Getting the first active channel index is easy on Gen8: Just find
3148          * the first bit set in the execution mask.  The register exists on
3149          * HSW already but it reads back as all ones when the current
3150          * instruction has execution masking disabled, so it's kind of
3151          * useless.
3152          */
3153         struct brw_reg exec_mask =
3154            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
3155
3156         brw_set_default_exec_size(p, BRW_EXECUTE_1);
3157         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
3158            /* Unfortunately, ce0 does not take into account the thread
3159             * dispatch mask, which may be a problem in cases where it's not
3160             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
3161             * some n).  Combine ce0 with the given dispatch (or vector) mask
3162             * to mask off those channels which were never dispatched by the
3163             * hardware.
3164             */
3165            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
3166            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
3167            exec_mask = vec1(dst);
3168         }
3169
3170         /* Quarter control has the effect of magically shifting the value of
3171          * ce0 so you'll get the first active channel relative to the
3172          * specified quarter control as result.
3173          */
3174         inst = brw_FBL(p, vec1(dst), exec_mask);
3175      } else {
3176         const struct brw_reg flag = brw_flag_subreg(flag_subreg);
3177
3178         brw_set_default_exec_size(p, BRW_EXECUTE_1);
3179         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
3180
3181         /* Run enough instructions returning zero with execution masking and
3182          * a conditional modifier enabled in order to get the full execution
3183          * mask in f1.0.  We could use a single 32-wide move here if it
3184          * weren't because of the hardware bug that causes channel enables to
3185          * be applied incorrectly to the second half of 32-wide instructions
3186          * on Gen7.
3187          */
3188         const unsigned lower_size = MIN2(16, exec_size);
3189         for (unsigned i = 0; i < exec_size / lower_size; i++) {
3190            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
3191                           brw_imm_uw(0));
3192            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3193            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
3194            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
3195            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
3196            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
3197            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
3198         }
3199
3200         /* Find the first bit set in the exec_size-wide portion of the flag
3201          * register that was updated by the last sequence of MOV
3202          * instructions.
3203          */
3204         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
3205         brw_set_default_exec_size(p, BRW_EXECUTE_1);
3206         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
3207      }
3208   } else {
3209      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3210
3211      if (devinfo->gen >= 8 &&
3212          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
3213         /* In SIMD4x2 mode the first active channel index is just the
3214          * negation of the first bit of the mask register.  Note that ce0
3215          * doesn't take into account the dispatch mask, so the Gen7 path
3216          * should be used instead unless you have the guarantee that the
3217          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
3218          * for some n).
3219          */
3220         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
3221                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
3222                        brw_imm_ud(1));
3223
3224      } else {
3225         /* Overwrite the destination without and with execution masking to
3226          * find out which of the channels is active.
3227          */
3228         brw_push_insn_state(p);
3229         brw_set_default_exec_size(p, BRW_EXECUTE_4);
3230         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3231                 brw_imm_ud(1));
3232
3233         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
3234                        brw_imm_ud(0));
3235         brw_pop_insn_state(p);
3236         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
3237      }
3238   }
3239
3240   brw_pop_insn_state(p);
3241}
3242
/**
 * Broadcast the scalar value of channel \p idx of \p src into every channel
 * of \p dst.  \p src must be a direct GRF region; \p idx is either an
 * immediate or a register containing the channel index.  The default
 * access mode (align1 vs. align16/SIMD4x2) selects the emission strategy.
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   /* Indirect addressing below only works on direct GRF regions without
    * source modifiers, and the MOVs assume matching types.
    */
   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      brw_MOV(p, dst,
              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
               stride(suboffset(src, 4 * i), 0, 4, 1)));
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         /* Compute the byte offset of the selected channel into a0.0 and
          * fetch it with a single indirect-addressed MOV.
          */
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around this issue, we do two integer MOVs
             * instead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                       retype(brw_vec1_indirect(addr.subnr, offset),
                              BRW_REGISTER_TYPE_D));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                       retype(brw_vec1_indirect(addr.subnr, offset + 4),
                              BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of flag register f1 (explicitly selected below so the
          * default flag register is left untouched),
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3362
3363/**
3364 * This instruction is generated as a single-channel align1 instruction by
3365 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3366 *
3367 * We can't use the typed atomic op in the FS because that has the execution
3368 * mask ANDed with the pixel mask, but we just want to write the one dword for
3369 * all the pixels.
3370 *
3371 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3372 * one u32.  So we use the same untyped atomic write message as the pixel
3373 * shader.
3374 *
3375 * The untyped atomic operation requires a BUFFER surface type with RAW
3376 * format, and is only accessible through the legacy DATA_CACHE dataport
3377 * messages.
3378 */
3379void brw_shader_time_add(struct brw_codegen *p,
3380                         struct brw_reg payload,
3381                         uint32_t surf_index)
3382{
3383   const struct gen_device_info *devinfo = p->devinfo;
3384   const unsigned sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
3385                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3386                          GEN7_SFID_DATAPORT_DATA_CACHE);
3387   assert(devinfo->gen >= 7);
3388
3389   brw_push_insn_state(p);
3390   brw_set_default_access_mode(p, BRW_ALIGN_1);
3391   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3392   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3393   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3394
3395   /* We use brw_vec1_reg and unmasked because we want to increment the given
3396    * offset only once.
3397    */
3398   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3399                                      BRW_ARF_NULL, 0));
3400   brw_set_src0(p, send, brw_vec1_reg(payload.file,
3401                                      payload.nr, 0));
3402   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
3403                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
3404                                                     false)));
3405
3406   brw_inst_set_sfid(devinfo, send, sfid);
3407   brw_inst_set_binding_table_index(devinfo, send, surf_index);
3408
3409   brw_pop_insn_state(p);
3410}
3411
3412
3413/**
3414 * Emit the SEND message for a barrier
3415 */
3416void
3417brw_barrier(struct brw_codegen *p, struct brw_reg src)
3418{
3419   const struct gen_device_info *devinfo = p->devinfo;
3420   struct brw_inst *inst;
3421
3422   assert(devinfo->gen >= 7);
3423
3424   brw_push_insn_state(p);
3425   brw_set_default_access_mode(p, BRW_ALIGN_1);
3426   inst = next_insn(p, BRW_OPCODE_SEND);
3427   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3428   brw_set_src0(p, inst, src);
3429   brw_set_src1(p, inst, brw_null_reg());
3430   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3431
3432   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3433   brw_inst_set_gateway_notify(devinfo, inst, 1);
3434   brw_inst_set_gateway_subfuncid(devinfo, inst,
3435                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3436
3437   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3438   brw_pop_insn_state(p);
3439}
3440
3441
3442/**
3443 * Emit the wait instruction for a barrier
3444 */
3445void
3446brw_WAIT(struct brw_codegen *p)
3447{
3448   const struct gen_device_info *devinfo = p->devinfo;
3449   struct brw_inst *insn;
3450
3451   struct brw_reg src = brw_notification_reg();
3452
3453   insn = next_insn(p, BRW_OPCODE_WAIT);
3454   brw_set_dest(p, insn, src);
3455   brw_set_src0(p, insn, src);
3456   brw_set_src1(p, insn, brw_null_reg());
3457
3458   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3459   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3460}
3461
3462/**
3463 * Changes the floating point rounding mode updating the control register
3464 * field defined at cr0.0[5-6] bits. This function supports the changes to
3465 * RTNE (00), RU (01), RD (10) and RTZ (11) rounding using bitwise operations.
3466 * Only RTNE and RTZ rounding are enabled at nir.
3467 */
3468void
3469brw_rounding_mode(struct brw_codegen *p,
3470                  enum brw_rnd_mode mode)
3471{
3472   const unsigned bits = mode << BRW_CR0_RND_MODE_SHIFT;
3473
3474   if (bits != BRW_CR0_RND_MODE_MASK) {
3475      brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3476                               brw_imm_ud(~BRW_CR0_RND_MODE_MASK));
3477      brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3478
3479      /* From the Skylake PRM, Volume 7, page 760:
3480       *  "Implementation Restriction on Register Access: When the control
3481       *   register is used as an explicit source and/or destination, hardware
3482       *   does not ensure execution pipeline coherency. Software must set the
3483       *   thread control field to ‘switch’ for an instruction that uses
3484       *   control register as an explicit operand."
3485       */
3486      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3487    }
3488
3489   if (bits) {
3490      brw_inst *inst = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3491                              brw_imm_ud(bits));
3492      brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3493      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3494   }
3495}
3496