1/*
2 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
3 Intel funded Tungsten Graphics to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28  * Authors:
29  *   Keith Whitwell <keithw@vmware.com>
30  */
31
32
33#include "brw_eu_defines.h"
34#include "brw_eu.h"
35
36#include "util/ralloc.h"
37
38/**
39 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
40 * registers, implicitly moving the operand to a message register.
41 *
42 * On Sandybridge, this is no longer the case.  This function performs the
43 * explicit move; it should be called before emitting a SEND instruction.
44 */
45void
46gfx6_resolve_implied_move(struct brw_codegen *p,
47			  struct brw_reg *src,
48			  unsigned msg_reg_nr)
49{
50   const struct intel_device_info *devinfo = p->devinfo;
51   if (devinfo->ver < 6)
52      return;
53
54   if (src->file == BRW_MESSAGE_REGISTER_FILE)
55      return;
56
57   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
58      assert(devinfo->ver < 12);
59      brw_push_insn_state(p);
60      brw_set_default_exec_size(p, BRW_EXECUTE_8);
61      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
62      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
63      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
64	      retype(*src, BRW_REGISTER_TYPE_UD));
65      brw_pop_insn_state(p);
66   }
67   *src = brw_message_reg(msg_reg_nr);
68}
69
70static void
71gfx7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
72{
73   /* From the Ivybridge PRM, Volume 4 Part 3, page 218 ("send"):
74    * "The send with EOT should use register space R112-R127 for <src>. This is
75    *  to enable loading of a new thread into the same slot while the message
76    *  with EOT for current thread is pending dispatch."
77    *
78    * Since we're pretending to have 16 MRFs anyway, we may as well use the
79    * registers required for messages with EOT.
80    */
81   const struct intel_device_info *devinfo = p->devinfo;
82   if (devinfo->ver >= 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
83      reg->file = BRW_GENERAL_REGISTER_FILE;
84      reg->nr += GFX7_MRF_HACK_START;
85   }
86}
87
/**
 * Encode \p dest as the destination operand of \p inst.
 *
 * Split-send instructions (SEND/SENDC on Gfx12+, SENDS/SENDSC before Gfx12)
 * use a restricted destination encoding; every other instruction uses the
 * regular direct/indirect, align1/align16 destination fields.  When
 * p->automatic_exec_sizes is set, the instruction's exec size may be shrunk
 * to match a small destination width.
 */
void
brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the register file's size. */
   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (dest.file == BRW_GENERAL_REGISTER_FILE)
      assert(dest.nr < 128);

   /* The hardware has a restriction where a destination of size Byte with
    * a stride of 1 is only allowed for a packed byte MOV. For any other
    * instruction, the stride must be at least 2, even when the destination
    * is the NULL register.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == BRW_ARF_NULL &&
       type_sz(dest.type) == 1 &&
       dest.hstride == BRW_HORIZONTAL_STRIDE_1) {
      dest.hstride = BRW_HORIZONTAL_STRIDE_2;
   }

   gfx7_convert_mrf_to_grf(p, &dest);

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gfx12+ SEND(C): only a direct GRF/ARF register number can be
       * encoded -- no subregister, no source modifiers, and only a trivial
       * (packed or scalar) region.
       */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr == 0);
      assert(brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1 ||
             (dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              dest.vstride == dest.width + 1));
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gfx12 split sends: like above, but the subregister may be a
       * multiple of 16 bytes and is encoded in da16 units.
       */
      assert(devinfo->ver < 12);
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(dest.address_mode == BRW_ADDRESS_DIRECT);
      assert(dest.subnr % 16 == 0);
      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1 &&
             dest.vstride == dest.width + 1);
      assert(!dest.negate && !dest.abs);
      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
      brw_inst_set_send_dst_reg_file(devinfo, inst, dest.file);
   } else {
      /* Regular destination encoding. */
      brw_inst_set_dst_file_type(devinfo, inst, dest.file, dest.type);
      brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);

      if (dest.address_mode == BRW_ADDRESS_DIRECT) {
         brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
            /* Stride 0 is not representable for a destination; encode it
             * as stride 1.
             */
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
            brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
            if (dest.file == BRW_GENERAL_REGISTER_FILE ||
                dest.file == BRW_MESSAGE_REGISTER_FILE) {
               assert(dest.writemask != 0);
            }
            /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
             *    Although Dst.HorzStride is a don't care for Align16, HW needs
             *    this to be programmed as "01".
             */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      } else {
         brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);

         /* These are different sizes in align1 vs align16:
          */
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
                                          dest.indirect_offset);
            if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
               dest.hstride = BRW_HORIZONTAL_STRIDE_1;
            brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
         } else {
            brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
                                           dest.indirect_offset);
            /* even ignored in da16, still need to set as '01' */
            brw_inst_set_dst_hstride(devinfo, inst, 1);
         }
      }
   }

   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
    * small registers, it can be useful for us to automatically reduce it to
    * match the register size.
    */
   if (p->automatic_exec_sizes) {
      /*
       * In platforms that support fp64 we can emit instructions with a width
       * of 4 that need two SIMD8 registers and an exec_size of 8 or 16. In
       * these cases we need to make sure that these instructions have their
       * exec sizes set properly when they are emitted and we can't rely on
       * this code to fix it.
       */
      bool fix_exec_size;
      if (devinfo->ver >= 6)
         fix_exec_size = dest.width < BRW_EXECUTE_4;
      else
         fix_exec_size = dest.width < BRW_EXECUTE_8;

      if (fix_exec_size)
         brw_inst_set_exec_size(devinfo, inst, dest.width);
   }
}
207
/**
 * Encode \p reg as the first source operand of \p inst.
 *
 * Send-family instructions only use src0 to identify the MRF/GRF the message
 * payload starts at, so modifiers and regions are disallowed there.
 * Immediates are stored in the instruction itself; all other sources get the
 * full direct/indirect, align1/align16 region encoding.
 */
void
brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* Sanity-check the register number against the register file's size. */
   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver));
   else if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   gfx7_convert_mrf_to_grf(p, &reg);

   if (devinfo->ver >= 6 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   if (devinfo->ver >= 12 &&
       (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
        brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
      /* Gfx12+ SEND(C): compact src0 encoding -- register file and number
       * only, with no subregister, modifiers, or non-trivial region.
       */
      assert(reg.file != BRW_IMMEDIATE_VALUE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src0_reg_file(devinfo, inst, reg.file);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);

   } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
              brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC) {
      /* Pre-Gfx12 split sends: GRF only, subregister in 16-byte units. */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr % 16 == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
   } else {
      /* Regular src0 encoding. */
      brw_inst_set_src0_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src0_abs(devinfo, inst, reg.abs);
      brw_inst_set_src0_negate(devinfo, inst, reg.negate);
      brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* Pick the immediate setter matching the value's width. */
         if (reg.type == BRW_REGISTER_TYPE_DF ||
             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
            brw_inst_set_imm_df(devinfo, inst, reg.df);
         else if (reg.type == BRW_REGISTER_TYPE_UQ ||
                  reg.type == BRW_REGISTER_TYPE_Q)
            brw_inst_set_imm_uq(devinfo, inst, reg.u64);
         else
            brw_inst_set_imm_ud(devinfo, inst, reg.ud);

         /* Pre-Gfx12, a sub-64-bit immediate also occupies the src1 slot;
          * mirror the type there.
          */
         if (devinfo->ver < 12 && type_sz(reg.type) < 8) {
            brw_inst_set_src1_reg_file(devinfo, inst,
                                       BRW_ARCHITECTURE_REGISTER_FILE);
            brw_inst_set_src1_reg_hw_type(devinfo, inst,
                                          brw_inst_src0_reg_hw_type(devinfo, inst));
         }
      } else {
         if (reg.address_mode == BRW_ADDRESS_DIRECT) {
            brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
                brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
            } else {
               brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
            }
         } else {
            brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);

            if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
               brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
            } else {
               brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
            }
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A width-1 region on a SIMD1 instruction is encoded as the
             * scalar region <0;1,0>.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src0_width(devinfo, inst, reg.width);
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src0_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src0_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src0_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src0_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
342
343
/**
 * Encode \p reg as the second source operand of \p inst.
 *
 * Split-send instructions store a dedicated src1 register-file/number pair;
 * all other instructions use the regular src1 encoding, which permits a
 * 32-bit immediate or a direct register with modifiers and a region, but
 * never an MRF or the accumulator.
 */
void
brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
{
   const struct intel_device_info *devinfo = p->devinfo;

   if (reg.file == BRW_GENERAL_REGISTER_FILE)
      assert(reg.nr < 128);

   if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDS ||
       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDSC ||
       (devinfo->ver >= 12 &&
        (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
         brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC))) {
      /* Split sends: only a direct GRF/ARF register number is encodable --
       * no subregister, modifiers, or non-trivial region.
       */
      assert(reg.file == BRW_GENERAL_REGISTER_FILE ||
             reg.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
      assert(reg.subnr == 0);
      assert(has_scalar_region(reg) ||
             (reg.hstride == BRW_HORIZONTAL_STRIDE_1 &&
              reg.vstride == reg.width + 1));
      assert(!reg.negate && !reg.abs);
      brw_inst_set_send_src1_reg_nr(devinfo, inst, reg.nr);
      brw_inst_set_send_src1_reg_file(devinfo, inst, reg.file);
   } else {
      /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
       *
       *    "Accumulator registers may be accessed explicitly as src0
       *    operands only."
       */
      assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
             reg.nr != BRW_ARF_ACCUMULATOR);

      gfx7_convert_mrf_to_grf(p, &reg);
      assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

      brw_inst_set_src1_file_type(devinfo, inst, reg.file, reg.type);
      brw_inst_set_src1_abs(devinfo, inst, reg.abs);
      brw_inst_set_src1_negate(devinfo, inst, reg.negate);

      /* Only src1 can be immediate in two-argument instructions.
       */
      assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);

      if (reg.file == BRW_IMMEDIATE_VALUE) {
         /* two-argument instructions can only use 32-bit immediates */
         assert(type_sz(reg.type) < 8);
         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
      } else {
         /* This is a hardware restriction, which may or may not be lifted
          * in the future:
          */
         assert (reg.address_mode == BRW_ADDRESS_DIRECT);
         /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

         brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
         } else {
            brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
         }

         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
            /* A width-1 region on a SIMD1 instruction is encoded as the
             * scalar region <0;1,0>.
             */
            if (reg.width == BRW_WIDTH_1 &&
                brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
               brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
               brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
            } else {
               brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
               brw_inst_set_src1_width(devinfo, inst, reg.width);
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         } else {
            brw_inst_set_src1_da16_swiz_x(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
            brw_inst_set_src1_da16_swiz_y(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
            brw_inst_set_src1_da16_swiz_z(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
            brw_inst_set_src1_da16_swiz_w(devinfo, inst,
               BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));

            if (reg.vstride == BRW_VERTICAL_STRIDE_8) {
               /* This is an oddity of the fact we're using the same
                * descriptions for registers in align_16 as align_1:
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else if (devinfo->verx10 == 70 &&
                       reg.type == BRW_REGISTER_TYPE_DF &&
                       reg.vstride == BRW_VERTICAL_STRIDE_2) {
               /* From SNB PRM:
                *
                * "For Align16 access mode, only encodings of 0000 and 0011
                *  are allowed. Other codes are reserved."
                *
                * Presumably the DevSNB behavior applies to IVB as well.
                */
               brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
            } else {
               brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
            }
         }
      }
   }
}
449
450/**
451 * Specify the descriptor and extended descriptor immediate for a SEND(C)
452 * message instruction.
453 */
454void
455brw_set_desc_ex(struct brw_codegen *p, brw_inst *inst,
456                unsigned desc, unsigned ex_desc)
457{
458   const struct intel_device_info *devinfo = p->devinfo;
459   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
460          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC);
461   if (devinfo->ver < 12)
462      brw_inst_set_src1_file_type(devinfo, inst,
463                                  BRW_IMMEDIATE_VALUE, BRW_REGISTER_TYPE_UD);
464   brw_inst_set_send_desc(devinfo, inst, desc);
465   if (devinfo->ver >= 9)
466      brw_inst_set_send_ex_desc(devinfo, inst, ex_desc);
467}
468
469static void brw_set_math_message( struct brw_codegen *p,
470				  brw_inst *inst,
471				  unsigned function,
472				  unsigned integer_type,
473				  bool low_precision,
474				  unsigned dataType )
475{
476   const struct intel_device_info *devinfo = p->devinfo;
477   unsigned msg_length;
478   unsigned response_length;
479
480   /* Infer message length from the function */
481   switch (function) {
482   case BRW_MATH_FUNCTION_POW:
483   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
484   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
485   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
486      msg_length = 2;
487      break;
488   default:
489      msg_length = 1;
490      break;
491   }
492
493   /* Infer response length from the function */
494   switch (function) {
495   case BRW_MATH_FUNCTION_SINCOS:
496   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
497      response_length = 2;
498      break;
499   default:
500      response_length = 1;
501      break;
502   }
503
504   brw_set_desc(p, inst, brw_message_desc(
505                   devinfo, msg_length, response_length, false));
506
507   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MATH);
508   brw_inst_set_math_msg_function(devinfo, inst, function);
509   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
510   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
511   brw_inst_set_math_msg_saturate(devinfo, inst, brw_inst_saturate(devinfo, inst));
512   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
513   brw_inst_set_saturate(devinfo, inst, 0);
514}
515
516
517static void brw_set_ff_sync_message(struct brw_codegen *p,
518				    brw_inst *insn,
519				    bool allocate,
520				    unsigned response_length,
521				    bool end_of_thread)
522{
523   const struct intel_device_info *devinfo = p->devinfo;
524
525   brw_set_desc(p, insn, brw_message_desc(
526                   devinfo, 1, response_length, true));
527
528   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
529   brw_inst_set_eot(devinfo, insn, end_of_thread);
530   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
531   brw_inst_set_urb_allocate(devinfo, insn, allocate);
532   /* The following fields are not used by FF_SYNC: */
533   brw_inst_set_urb_global_offset(devinfo, insn, 0);
534   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
535   brw_inst_set_urb_used(devinfo, insn, 0);
536   brw_inst_set_urb_complete(devinfo, insn, 0);
537}
538
539static void brw_set_urb_message( struct brw_codegen *p,
540				 brw_inst *insn,
541                                 enum brw_urb_write_flags flags,
542				 unsigned msg_length,
543				 unsigned response_length,
544				 unsigned offset,
545				 unsigned swizzle_control )
546{
547   const struct intel_device_info *devinfo = p->devinfo;
548
549   assert(devinfo->ver < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
550   assert(devinfo->ver < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
551   assert(devinfo->ver >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
552
553   brw_set_desc(p, insn, brw_message_desc(
554                   devinfo, msg_length, response_length, true));
555
556   brw_inst_set_sfid(devinfo, insn, BRW_SFID_URB);
557   brw_inst_set_eot(devinfo, insn, !!(flags & BRW_URB_WRITE_EOT));
558
559   if (flags & BRW_URB_WRITE_OWORD) {
560      assert(msg_length == 2); /* header + one OWORD of data */
561      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
562   } else {
563      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
564   }
565
566   brw_inst_set_urb_global_offset(devinfo, insn, offset);
567   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
568
569   if (devinfo->ver < 8) {
570      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
571   }
572
573   if (devinfo->ver < 7) {
574      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
575      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
576   } else {
577      brw_inst_set_urb_per_slot_offset(devinfo, insn,
578         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
579   }
580}
581
582static void
583gfx7_set_dp_scratch_message(struct brw_codegen *p,
584                            brw_inst *inst,
585                            bool write,
586                            bool dword,
587                            bool invalidate_after_read,
588                            unsigned num_regs,
589                            unsigned addr_offset,
590                            unsigned mlen,
591                            unsigned rlen,
592                            bool header_present)
593{
594   const struct intel_device_info *devinfo = p->devinfo;
595   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
596          (devinfo->ver >= 8 && num_regs == 8));
597   const unsigned block_size = (devinfo->ver >= 8 ? util_logbase2(num_regs) :
598                                num_regs - 1);
599
600   brw_set_desc(p, inst, brw_message_desc(
601                   devinfo, mlen, rlen, header_present));
602
603   brw_inst_set_sfid(devinfo, inst, GFX7_SFID_DATAPORT_DATA_CACHE);
604   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
605   brw_inst_set_scratch_read_write(devinfo, inst, write);
606   brw_inst_set_scratch_type(devinfo, inst, dword);
607   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
608   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
609   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
610}
611
/**
 * Copy the default instruction state in \p state (exec size, masking,
 * predication, flag register, etc.) into a freshly-allocated instruction,
 * honoring per-generation field availability.
 */
static void
brw_inst_set_state(const struct intel_device_info *devinfo,
                   brw_inst *insn,
                   const struct brw_insn_state *state)
{
   brw_inst_set_exec_size(devinfo, insn, state->exec_size);
   brw_inst_set_group(devinfo, insn, state->group);
   brw_inst_set_compression(devinfo, insn, state->compressed);
   brw_inst_set_access_mode(devinfo, insn, state->access_mode);
   brw_inst_set_mask_control(devinfo, insn, state->mask_control);
   /* Software scoreboard (SWSB) dependency info exists on Gfx12+ only. */
   if (devinfo->ver >= 12)
      brw_inst_set_swsb(devinfo, insn, tgl_swsb_encode(devinfo, state->swsb));
   brw_inst_set_saturate(devinfo, insn, state->saturate);
   brw_inst_set_pred_control(devinfo, insn, state->predicate);
   brw_inst_set_pred_inv(devinfo, insn, state->pred_inv);

   /* 3-source align16 instructions store the flag register in dedicated
    * fields; flag_subreg packs (reg * 2 + subreg).
    */
   if (is_3src(devinfo, brw_inst_opcode(devinfo, insn)) &&
       state->access_mode == BRW_ALIGN_16) {
      brw_inst_set_3src_a16_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_3src_a16_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   } else {
      brw_inst_set_flag_subreg_nr(devinfo, insn, state->flag_subreg % 2);
      if (devinfo->ver >= 7)
         brw_inst_set_flag_reg_nr(devinfo, insn, state->flag_subreg / 2);
   }

   if (devinfo->ver >= 6)
      brw_inst_set_acc_wr_control(devinfo, insn, state->acc_wr_control);
}
642
643static brw_inst *
644brw_append_insns(struct brw_codegen *p, unsigned nr_insn, unsigned align)
645{
646   assert(util_is_power_of_two_or_zero(sizeof(brw_inst)));
647   assert(util_is_power_of_two_or_zero(align));
648   const unsigned align_insn = MAX2(align / sizeof(brw_inst), 1);
649   const unsigned start_insn = ALIGN(p->nr_insn, align_insn);
650   const unsigned new_nr_insn = start_insn + nr_insn;
651
652   if (p->store_size < new_nr_insn) {
653      p->store_size = util_next_power_of_two(new_nr_insn * sizeof(brw_inst));
654      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);
655   }
656
657   /* Memset any padding due to alignment to 0.  We don't want to be hashing
658    * or caching a bunch of random bits we got from a memory allocation.
659    */
660   if (p->nr_insn < start_insn) {
661      memset(&p->store[p->nr_insn], 0,
662             (start_insn - p->nr_insn) * sizeof(brw_inst));
663   }
664
665   assert(p->next_insn_offset == p->nr_insn * sizeof(brw_inst));
666   p->nr_insn = new_nr_insn;
667   p->next_insn_offset = new_nr_insn * sizeof(brw_inst);
668
669   return &p->store[start_insn];
670}
671
/**
 * Pad the instruction stream with zeroed slots so the next instruction or
 * data blob starts at a multiple of \p align bytes.
 */
void
brw_realign(struct brw_codegen *p, unsigned align)
{
   brw_append_insns(p, 0, align);
}
677
678int
679brw_append_data(struct brw_codegen *p, void *data,
680                unsigned size, unsigned align)
681{
682   unsigned nr_insn = DIV_ROUND_UP(size, sizeof(brw_inst));
683   void *dst = brw_append_insns(p, nr_insn, align);
684   memcpy(dst, data, size);
685
686   /* If it's not a whole number of instructions, memset the end */
687   if (size < nr_insn * sizeof(brw_inst))
688      memset(dst + size, 0, nr_insn * sizeof(brw_inst) - size);
689
690   return dst - (void *)p->store;
691}
692
#define next_insn brw_next_insn
/**
 * Reserve one instruction slot at the end of the program, zero it, set its
 * opcode, and apply the codegen context's current default instruction state.
 */
brw_inst *
brw_next_insn(struct brw_codegen *p, unsigned opcode)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = brw_append_insns(p, 1, sizeof(brw_inst));

   memset(insn, 0, sizeof(*insn));
   brw_inst_set_opcode(devinfo, insn, opcode);

   /* Apply the default instruction state */
   brw_inst_set_state(devinfo, insn, p->current);

   return insn;
}
708
709void
710brw_add_reloc(struct brw_codegen *p, uint32_t id,
711              enum brw_shader_reloc_type type,
712              uint32_t offset, uint32_t delta)
713{
714   if (p->num_relocs + 1 > p->reloc_array_size) {
715      p->reloc_array_size = MAX2(16, p->reloc_array_size * 2);
716      p->relocs = reralloc(p->mem_ctx, p->relocs,
717                           struct brw_shader_reloc, p->reloc_array_size);
718   }
719
720   p->relocs[p->num_relocs++] = (struct brw_shader_reloc) {
721      .id = id,
722      .type = type,
723      .offset = offset,
724      .delta = delta,
725   };
726}
727
728static brw_inst *
729brw_alu1(struct brw_codegen *p, unsigned opcode,
730         struct brw_reg dest, struct brw_reg src)
731{
732   brw_inst *insn = next_insn(p, opcode);
733   brw_set_dest(p, insn, dest);
734   brw_set_src0(p, insn, src);
735   return insn;
736}
737
738static brw_inst *
739brw_alu2(struct brw_codegen *p, unsigned opcode,
740         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
741{
742   /* 64-bit immediates are only supported on 1-src instructions */
743   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
744   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
745
746   brw_inst *insn = next_insn(p, opcode);
747   brw_set_dest(p, insn, dest);
748   brw_set_src0(p, insn, src0);
749   brw_set_src1(p, insn, src1);
750   return insn;
751}
752
/**
 * Return \p reg's subregister number in the 32-bit units used by 3-source
 * instruction encodings.
 */
static int
get_3src_subreg_nr(struct brw_reg reg)
{
   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
    * use 32-bit units (components 0..7).  Since they only support F/D/UD
    * types, this doesn't lose any flexibility, but uses fewer bits.
    */
   return reg.subnr / 4;
}
762
763static enum gfx10_align1_3src_vertical_stride
764to_3src_align1_vstride(const struct intel_device_info *devinfo,
765                       enum brw_vertical_stride vstride)
766{
767   switch (vstride) {
768   case BRW_VERTICAL_STRIDE_0:
769      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_0;
770   case BRW_VERTICAL_STRIDE_1:
771      assert(devinfo->ver >= 12);
772      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_1;
773   case BRW_VERTICAL_STRIDE_2:
774      assert(devinfo->ver < 12);
775      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_2;
776   case BRW_VERTICAL_STRIDE_4:
777      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_4;
778   case BRW_VERTICAL_STRIDE_8:
779   case BRW_VERTICAL_STRIDE_16:
780      return BRW_ALIGN1_3SRC_VERTICAL_STRIDE_8;
781   default:
782      unreachable("invalid vstride");
783   }
784}
785
786
787static enum gfx10_align1_3src_src_horizontal_stride
788to_3src_align1_hstride(enum brw_horizontal_stride hstride)
789{
790   switch (hstride) {
791   case BRW_HORIZONTAL_STRIDE_0:
792      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_0;
793   case BRW_HORIZONTAL_STRIDE_1:
794      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_1;
795   case BRW_HORIZONTAL_STRIDE_2:
796      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_2;
797   case BRW_HORIZONTAL_STRIDE_4:
798      return BRW_ALIGN1_3SRC_SRC_HORIZONTAL_STRIDE_4;
799   default:
800      unreachable("invalid hstride");
801   }
802}
803
/* Emit a three-source ALU instruction (MAD, LRP, BFE, BFI2, ...).
 *
 * 3-src instructions use a distinct, more compact encoding than the 1/2-src
 * forms, and the encoding differs between Align1 (gfx10+ style) and Align16
 * access modes; the two halves of the big if/else below handle each mode.
 */
static brw_inst *
brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *inst = next_insn(p, opcode);

   gfx7_convert_mrf_to_grf(p, &dest);

   assert(dest.nr < 128);

   /* gfx10+ allows at most one immediate source (src0 or src2, not both). */
   if (devinfo->ver >= 10)
      assert(!(src0.file == BRW_IMMEDIATE_VALUE &&
               src2.file == BRW_IMMEDIATE_VALUE));

   /* 3-src instructions only support direct addressing. */
   assert(src0.file == BRW_IMMEDIATE_VALUE || src0.nr < 128);
   assert(src1.file != BRW_IMMEDIATE_VALUE && src1.nr < 128);
   assert(src2.file == BRW_IMMEDIATE_VALUE || src2.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);

   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
      /* --- Align1 encoding --- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_ARCHITECTURE_REGISTER_FILE);

      if (devinfo->ver >= 12) {
         /* gfx12+ encodes the register file directly. */
         brw_inst_set_3src_a1_dst_reg_file(devinfo, inst, dest.file);
         brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      } else {
         /* Pre-gfx12 uses a dedicated enum; ARF here can only mean the
          * accumulator.
          */
         if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_ACCUMULATOR);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_a1_dst_reg_file(devinfo, inst,
                                              BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE);
            brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
         }
      }
      /* Destination subregister is encoded in 8-byte units here. */
      brw_inst_set_3src_a1_dst_subreg_nr(devinfo, inst, dest.subnr / 8);

      brw_inst_set_3src_a1_dst_hstride(devinfo, inst, BRW_ALIGN1_3SRC_DST_HORIZONTAL_STRIDE_1);

      /* Execution type (float vs. integer pipeline) follows the dest type. */
      if (brw_reg_type_is_floating_point(dest.type)) {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_FLOAT);
      } else {
         brw_inst_set_3src_a1_exec_type(devinfo, inst,
                                        BRW_ALIGN1_3SRC_EXEC_TYPE_INT);
      }

      brw_inst_set_3src_a1_dst_type(devinfo, inst, dest.type);
      brw_inst_set_3src_a1_src0_type(devinfo, inst, src0.type);
      brw_inst_set_3src_a1_src1_type(devinfo, inst, src1.type);
      brw_inst_set_3src_a1_src2_type(devinfo, inst, src2.type);

      /* src0: immediate or full region description. */
      if (src0.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src0_imm(devinfo, inst, src0.ud);
      } else {
         brw_inst_set_3src_a1_src0_vstride(
            devinfo, inst, to_3src_align1_vstride(devinfo, src0.vstride));
         brw_inst_set_3src_a1_src0_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src0.hstride));
         brw_inst_set_3src_a1_src0_subreg_nr(devinfo, inst, src0.subnr);
         /* The NF type implies the accumulator register. */
         if (src0.type == BRW_REGISTER_TYPE_NF) {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
         } else {
            brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
         }
         brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
         brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      }
      /* src1 can never be an immediate (asserted above). */
      brw_inst_set_3src_a1_src1_vstride(
         devinfo, inst, to_3src_align1_vstride(devinfo, src1.vstride));
      brw_inst_set_3src_a1_src1_hstride(devinfo, inst,
                                        to_3src_align1_hstride(src1.hstride));

      brw_inst_set_3src_a1_src1_subreg_nr(devinfo, inst, src1.subnr);
      if (src1.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, BRW_ARF_ACCUMULATOR);
      } else {
         brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      }
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);

      /* src2: immediate or region; note there is no vstride field. */
      if (src2.file == BRW_IMMEDIATE_VALUE) {
         brw_inst_set_3src_a1_src2_imm(devinfo, inst, src2.ud);
      } else {
         brw_inst_set_3src_a1_src2_hstride(devinfo, inst,
                                           to_3src_align1_hstride(src2.hstride));
         /* no vstride on src2 */
         brw_inst_set_3src_a1_src2_subreg_nr(devinfo, inst, src2.subnr);
         brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
         brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
         brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      }

      assert(src0.file == BRW_GENERAL_REGISTER_FILE ||
             src0.file == BRW_IMMEDIATE_VALUE ||
             (src0.file == BRW_ARCHITECTURE_REGISTER_FILE &&
              src0.type == BRW_REGISTER_TYPE_NF));
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             src1.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(src2.file == BRW_GENERAL_REGISTER_FILE ||
             src2.file == BRW_IMMEDIATE_VALUE);

      /* Register-file fields again differ before/after gfx12. */
      if (devinfo->ver >= 12) {
         if (src0.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src0_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src0_reg_file(devinfo, inst, src0.file);
         }

         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst, src1.file);

         if (src2.file == BRW_IMMEDIATE_VALUE) {
            brw_inst_set_3src_a1_src2_is_imm(devinfo, inst, 1);
         } else {
            brw_inst_set_3src_a1_src2_reg_file(devinfo, inst, src2.file);
         }
      } else {
         brw_inst_set_3src_a1_src0_reg_file(devinfo, inst,
                                            src0.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
         brw_inst_set_3src_a1_src1_reg_file(devinfo, inst,
                                            src1.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_ACCUMULATOR);
         brw_inst_set_3src_a1_src2_reg_file(devinfo, inst,
                                            src2.file == BRW_GENERAL_REGISTER_FILE ?
                                            BRW_ALIGN1_3SRC_GENERAL_REGISTER_FILE :
                                            BRW_ALIGN1_3SRC_IMMEDIATE_VALUE);
      }

   } else {
      /* --- Align16 encoding --- */
      assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
             dest.file == BRW_MESSAGE_REGISTER_FILE);
      assert(dest.type == BRW_REGISTER_TYPE_F  ||
             dest.type == BRW_REGISTER_TYPE_DF ||
             dest.type == BRW_REGISTER_TYPE_D  ||
             dest.type == BRW_REGISTER_TYPE_UD ||
             (dest.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 8));
      if (devinfo->ver == 6) {
         brw_inst_set_3src_a16_dst_reg_file(devinfo, inst,
                                            dest.file == BRW_MESSAGE_REGISTER_FILE);
      }
      brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
      brw_inst_set_3src_a16_dst_subreg_nr(devinfo, inst, dest.subnr / 4);
      brw_inst_set_3src_a16_dst_writemask(devinfo, inst, dest.writemask);

      /* All three sources must be GRFs in Align16 mode; a vstride of 0 is
       * expressed through the rep_ctrl (replicate) bit.
       */
      assert(src0.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src0_swizzle(devinfo, inst, src0.swizzle);
      brw_inst_set_3src_a16_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
      brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
      brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
      brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
      brw_inst_set_3src_a16_src0_rep_ctrl(devinfo, inst,
                                          src0.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src1.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src1_swizzle(devinfo, inst, src1.swizzle);
      brw_inst_set_3src_a16_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
      brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
      brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
      brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
      brw_inst_set_3src_a16_src1_rep_ctrl(devinfo, inst,
                                          src1.vstride == BRW_VERTICAL_STRIDE_0);

      assert(src2.file == BRW_GENERAL_REGISTER_FILE);
      brw_inst_set_3src_a16_src2_swizzle(devinfo, inst, src2.swizzle);
      brw_inst_set_3src_a16_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
      brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
      brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
      brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
      brw_inst_set_3src_a16_src2_rep_ctrl(devinfo, inst,
                                          src2.vstride == BRW_VERTICAL_STRIDE_0);

      if (devinfo->ver >= 7) {
         /* Set both the source and destination types based on dest.type,
          * ignoring the source register types.  The MAD and LRP emitters ensure
          * that all four types are float.  The BFE and BFI2 emitters, however,
          * may send us mixed D and UD types and want us to ignore that and use
          * the destination type.
          */
         brw_inst_set_3src_a16_src_type(devinfo, inst, dest.type);
         brw_inst_set_3src_a16_dst_type(devinfo, inst, dest.type);

         /* From the Bspec, 3D Media GPGPU, Instruction fields, srcType:
          *
          *    "Three source instructions can use operands with mixed-mode
          *     precision. When SrcType field is set to :f or :hf it defines
          *     precision for source 0 only, and fields Src1Type and Src2Type
          *     define precision for other source operands:
          *
          *     0b = :f. Single precision Float (32-bit).
          *     1b = :hf. Half precision Float (16-bit)."
          */
         if (src1.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src1_type(devinfo, inst, 1);

         if (src2.type == BRW_REGISTER_TYPE_HF)
            brw_inst_set_3src_a16_src2_type(devinfo, inst, 1);
      }
   }

   return inst;
}
1015
1016
1017/***********************************************************************
1018 * Convenience routines.
1019 */
/* Define brw_<OP>() as a trivial wrapper around brw_alu1(). */
#define ALU1(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0)   			\
{							\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
}

/* Define brw_<OP>() as a trivial wrapper around brw_alu2(). */
#define ALU2(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1)   			\
{							\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
}

/* Define brw_<OP>() as a wrapper around brw_alu3().  In Align16 mode a
 * scalar source (vstride 0) is expressed as an XXXX swizzle, since the
 * Align16 encoding replicates via swizzle rather than stride.
 */
#define ALU3(OP)					\
brw_inst *brw_##OP(struct brw_codegen *p,		\
	      struct brw_reg dest,			\
	      struct brw_reg src0,			\
	      struct brw_reg src1,			\
	      struct brw_reg src2)   			\
{                                                       \
   if (p->current->access_mode == BRW_ALIGN_16) {       \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)        \
         src0.swizzle = BRW_SWIZZLE_XXXX;               \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)        \
         src1.swizzle = BRW_SWIZZLE_XXXX;               \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)        \
         src2.swizzle = BRW_SWIZZLE_XXXX;               \
   }                                                    \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
}

/* Like ALU3, but additionally asserts that all operands are F (or all DF),
 * for float-only 3-src ops such as LRP.
 */
#define ALU3F(OP)                                               \
brw_inst *brw_##OP(struct brw_codegen *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
          dest.type == BRW_REGISTER_TYPE_DF);                   \
   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
   }                                                            \
                                                                \
   if (p->current->access_mode == BRW_ALIGN_16) {               \
      if (src0.vstride == BRW_VERTICAL_STRIDE_0)                \
         src0.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src1.vstride == BRW_VERTICAL_STRIDE_0)                \
         src1.swizzle = BRW_SWIZZLE_XXXX;                       \
      if (src2.vstride == BRW_VERTICAL_STRIDE_0)                \
         src2.swizzle = BRW_SWIZZLE_XXXX;                       \
   }                                                            \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}
1084
/* Instantiate the simple 1-, 2- and 3-source emitters.  Opcodes needing
 * extra validation or workarounds (MOV, ADD, MUL, AVG, ...) are written
 * out by hand below instead.
 */
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU1(DIM)
ALU2(ASR)
ALU2(ROL)
ALU2(ROR)
ALU3(CSEL)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU3(DP4A)
ALU3(MAD)
ALU3F(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU2(ADDC)
ALU2(SUBB)
ALU3(ADD3)
1122
1123brw_inst *
1124brw_MOV(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0)
1125{
1126   const struct intel_device_info *devinfo = p->devinfo;
1127
1128   /* When converting F->DF on IVB/BYT, every odd source channel is ignored.
1129    * To avoid the problems that causes, we use an <X,2,0> source region to
1130    * read each element twice.
1131    */
1132   if (devinfo->verx10 == 70 &&
1133       brw_get_default_access_mode(p) == BRW_ALIGN_1 &&
1134       dest.type == BRW_REGISTER_TYPE_DF &&
1135       (src0.type == BRW_REGISTER_TYPE_F ||
1136        src0.type == BRW_REGISTER_TYPE_D ||
1137        src0.type == BRW_REGISTER_TYPE_UD) &&
1138       !has_scalar_region(src0)) {
1139      assert(src0.vstride == src0.width + src0.hstride);
1140      src0.vstride = src0.hstride;
1141      src0.width = BRW_WIDTH_2;
1142      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1143   }
1144
1145   return brw_alu1(p, BRW_OPCODE_MOV, dest, src0);
1146}
1147
1148brw_inst *
1149brw_ADD(struct brw_codegen *p, struct brw_reg dest,
1150        struct brw_reg src0, struct brw_reg src1)
1151{
1152   /* 6.2.2: add */
1153   if (src0.type == BRW_REGISTER_TYPE_F ||
1154       (src0.file == BRW_IMMEDIATE_VALUE &&
1155	src0.type == BRW_REGISTER_TYPE_VF)) {
1156      assert(src1.type != BRW_REGISTER_TYPE_UD);
1157      assert(src1.type != BRW_REGISTER_TYPE_D);
1158   }
1159
1160   if (src1.type == BRW_REGISTER_TYPE_F ||
1161       (src1.file == BRW_IMMEDIATE_VALUE &&
1162	src1.type == BRW_REGISTER_TYPE_VF)) {
1163      assert(src0.type != BRW_REGISTER_TYPE_UD);
1164      assert(src0.type != BRW_REGISTER_TYPE_D);
1165   }
1166
1167   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1168}
1169
1170brw_inst *
1171brw_AVG(struct brw_codegen *p, struct brw_reg dest,
1172        struct brw_reg src0, struct brw_reg src1)
1173{
1174   assert(dest.type == src0.type);
1175   assert(src0.type == src1.type);
1176   switch (src0.type) {
1177   case BRW_REGISTER_TYPE_B:
1178   case BRW_REGISTER_TYPE_UB:
1179   case BRW_REGISTER_TYPE_W:
1180   case BRW_REGISTER_TYPE_UW:
1181   case BRW_REGISTER_TYPE_D:
1182   case BRW_REGISTER_TYPE_UD:
1183      break;
1184   default:
1185      unreachable("Bad type for brw_AVG");
1186   }
1187
1188   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1189}
1190
1191brw_inst *
1192brw_MUL(struct brw_codegen *p, struct brw_reg dest,
1193        struct brw_reg src0, struct brw_reg src1)
1194{
1195   /* 6.32.38: mul */
1196   if (src0.type == BRW_REGISTER_TYPE_D ||
1197       src0.type == BRW_REGISTER_TYPE_UD ||
1198       src1.type == BRW_REGISTER_TYPE_D ||
1199       src1.type == BRW_REGISTER_TYPE_UD) {
1200      assert(dest.type != BRW_REGISTER_TYPE_F);
1201   }
1202
1203   if (src0.type == BRW_REGISTER_TYPE_F ||
1204       (src0.file == BRW_IMMEDIATE_VALUE &&
1205	src0.type == BRW_REGISTER_TYPE_VF)) {
1206      assert(src1.type != BRW_REGISTER_TYPE_UD);
1207      assert(src1.type != BRW_REGISTER_TYPE_D);
1208   }
1209
1210   if (src1.type == BRW_REGISTER_TYPE_F ||
1211       (src1.file == BRW_IMMEDIATE_VALUE &&
1212	src1.type == BRW_REGISTER_TYPE_VF)) {
1213      assert(src0.type != BRW_REGISTER_TYPE_UD);
1214      assert(src0.type != BRW_REGISTER_TYPE_D);
1215   }
1216
1217   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1218	  src0.nr != BRW_ARF_ACCUMULATOR);
1219   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1220	  src1.nr != BRW_ARF_ACCUMULATOR);
1221
1222   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1223}
1224
1225brw_inst *
1226brw_LINE(struct brw_codegen *p, struct brw_reg dest,
1227         struct brw_reg src0, struct brw_reg src1)
1228{
1229   src0.vstride = BRW_VERTICAL_STRIDE_0;
1230   src0.width = BRW_WIDTH_1;
1231   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1232   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
1233}
1234
1235brw_inst *
1236brw_PLN(struct brw_codegen *p, struct brw_reg dest,
1237        struct brw_reg src0, struct brw_reg src1)
1238{
1239   src0.vstride = BRW_VERTICAL_STRIDE_0;
1240   src0.width = BRW_WIDTH_1;
1241   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1242   src1.vstride = BRW_VERTICAL_STRIDE_8;
1243   src1.width = BRW_WIDTH_8;
1244   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
1245   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
1246}
1247
/* Emit a float-to-half conversion: F32TO16 on gfx7, a converting MOV to HF
 * on gfx8+.  Returns the last emitted instruction.
 */
brw_inst *
brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
   /* The F32TO16 instruction doesn't support 32-bit destination types in
    * Align1 mode, and neither does the Gfx8 implementation in terms of a
    * converting MOV.  Gfx7 does zero out the high 16 bits in Align16 mode as
    * an undocumented feature.
    */
   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
                                 (!align16 || devinfo->ver >= 8));
   brw_inst *inst;

   if (align16) {
      assert(dst.type == BRW_REGISTER_TYPE_UD);
   } else {
      assert(dst.type == BRW_REGISTER_TYPE_UD ||
             dst.type == BRW_REGISTER_TYPE_W ||
             dst.type == BRW_REGISTER_TYPE_UW ||
             dst.type == BRW_REGISTER_TYPE_HF);
   }

   /* Save default state so the access-mode/SWSB overrides below don't leak. */
   brw_push_insn_state(p);

   if (needs_zero_fill) {
      /* Re-describe the UD destination as every-other-W so the conversion
       * writes only the low halves; the high halves are zeroed below.
       */
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
   }

   if (devinfo->ver >= 8) {
      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
   } else {
      assert(devinfo->ver == 7);
      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
   }

   if (needs_zero_fill) {
      /* Pair no_dd_clear/no_dd_check across the two partial writes so the
       * dependency hardware treats them as one write (pre-gfx12 only).
       */
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_clear(devinfo, inst, true);
      brw_set_default_swsb(p, tgl_swsb_null());
      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));
      if (devinfo->ver < 12)
         brw_inst_set_no_dd_check(devinfo, inst, true);
   }

   brw_pop_insn_state(p);
   return inst;
}
1297
1298brw_inst *
1299brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
1300{
1301   const struct intel_device_info *devinfo = p->devinfo;
1302   bool align16 = brw_get_default_access_mode(p) == BRW_ALIGN_16;
1303
1304   if (align16) {
1305      assert(src.type == BRW_REGISTER_TYPE_UD);
1306   } else {
1307      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1308       *
1309       *   Because this instruction does not have a 16-bit floating-point
1310       *   type, the source data type must be Word (W). The destination type
1311       *   must be F (Float).
1312       */
1313      if (src.type == BRW_REGISTER_TYPE_UD)
1314         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1315
1316      assert(src.type == BRW_REGISTER_TYPE_W ||
1317             src.type == BRW_REGISTER_TYPE_UW ||
1318             src.type == BRW_REGISTER_TYPE_HF);
1319   }
1320
1321   if (devinfo->ver >= 8) {
1322      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
1323   } else {
1324      assert(devinfo->ver == 7);
1325      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
1326   }
1327}
1328
1329
1330void brw_NOP(struct brw_codegen *p)
1331{
1332   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
1333   memset(insn, 0, sizeof(*insn));
1334   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);
1335}
1336
1337void brw_SYNC(struct brw_codegen *p, enum tgl_sync_function func)
1338{
1339   brw_inst *insn = next_insn(p, BRW_OPCODE_SYNC);
1340   brw_inst_set_cond_modifier(p->devinfo, insn, func);
1341}
1342
1343/***********************************************************************
1344 * Comparisons, if/else/endif
1345 */
1346
1347brw_inst *
1348brw_JMPI(struct brw_codegen *p, struct brw_reg index,
1349         unsigned predicate_control)
1350{
1351   const struct intel_device_info *devinfo = p->devinfo;
1352   struct brw_reg ip = brw_ip_reg();
1353   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
1354
1355   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_1);
1356   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
1357   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
1358   brw_inst_set_pred_control(devinfo, inst, predicate_control);
1359
1360   return inst;
1361}
1362
1363static void
1364push_if_stack(struct brw_codegen *p, brw_inst *inst)
1365{
1366   p->if_stack[p->if_stack_depth] = inst - p->store;
1367
1368   p->if_stack_depth++;
1369   if (p->if_stack_array_size <= p->if_stack_depth) {
1370      p->if_stack_array_size *= 2;
1371      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1372			     p->if_stack_array_size);
1373   }
1374}
1375
1376static brw_inst *
1377pop_if_stack(struct brw_codegen *p)
1378{
1379   p->if_stack_depth--;
1380   return &p->store[p->if_stack[p->if_stack_depth]];
1381}
1382
1383static void
1384push_loop_stack(struct brw_codegen *p, brw_inst *inst)
1385{
1386   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {
1387      p->loop_stack_array_size *= 2;
1388      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1389			       p->loop_stack_array_size);
1390      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1391				     p->loop_stack_array_size);
1392   }
1393
1394   p->loop_stack[p->loop_stack_depth] = inst - p->store;
1395   p->loop_stack_depth++;
1396   p->if_depth_in_loop[p->loop_stack_depth] = 0;
1397}
1398
1399static brw_inst *
1400get_inner_do_insn(struct brw_codegen *p)
1401{
1402   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1403}
1404
1405/* EU takes the value from the flag register and pushes it onto some
1406 * sort of a stack (presumably merging with any flag value already on
1407 * the stack).  Within an if block, the flags at the top of the stack
1408 * control execution on each channel of the unit, eg. on each of the
1409 * 16 pixel values in our wm programs.
1410 *
1411 * When the matching 'else' instruction is reached (presumably by
1412 * countdown of the instruction count patched in by our ELSE/ENDIF
1413 * functions), the relevant flags are inverted.
1414 *
1415 * When the matching 'endif' instruction is reached, the flags are
1416 * popped off.  If the stack is now empty, normal execution resumes.
1417 */
/* Emit an IF instruction with the generation-appropriate operand layout.
 * Jump targets (jump count / JIP / UIP) are left as zero and patched later
 * by patch_IF_ELSE() once the matching ELSE/ENDIF positions are known.
 */
brw_inst *
brw_IF(struct brw_codegen *p, unsigned execute_size)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (devinfo->ver < 6) {
      /* Pre-gfx6: IF operates on IP with an immediate jump offset. */
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      /* gfx6: jump count lives in the destination immediate. */
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else if (devinfo->ver == 7) {
      /* gfx7: JIP/UIP fields, src1 holds an immediate placeholder. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_w(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   } else {
      /* gfx8+: JIP/UIP only; gfx12+ dropped the src0 immediate. */
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      if (devinfo->ver < 12)
         brw_set_src0(p, insn, brw_imm_d(0));
      brw_inst_set_jip(devinfo, insn, 0);
      brw_inst_set_uip(devinfo, insn, 0);
   }

   brw_inst_set_exec_size(devinfo, insn, execute_size);
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (!p->single_program_flow && devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1462
1463/* This function is only used for gfx6-style IF instructions with an
1464 * embedded comparison (conditional modifier).  It is not used on gfx7.
1465 */
1466brw_inst *
1467gfx6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
1468	struct brw_reg src0, struct brw_reg src1)
1469{
1470   const struct intel_device_info *devinfo = p->devinfo;
1471   brw_inst *insn;
1472
1473   insn = next_insn(p, BRW_OPCODE_IF);
1474
1475   brw_set_dest(p, insn, brw_imm_w(0));
1476   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1477   brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
1478   brw_set_src0(p, insn, src0);
1479   brw_set_src1(p, insn, src1);
1480
1481   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);
1482   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
1483   brw_inst_set_cond_modifier(devinfo, insn, conditional);
1484
1485   push_if_stack(p, insn);
1486   return insn;
1487}
1488
1489/**
1490 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1491 */
static void
convert_IF_ELSE_to_ADD(struct brw_codegen *p,
                       brw_inst *if_inst, brw_inst *else_inst)
{
   const struct intel_device_info *devinfo = p->devinfo;

   /* The next instruction (where the ENDIF would be, if it existed) */
   brw_inst *next_inst = &p->store[p->nr_insn];

   /* SPF mode only works with scalar (SIMD1) control flow. */
   assert(p->single_program_flow);
   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
   brw_inst_set_pred_inv(devinfo, if_inst, true);

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);

      /* Distances are scaled by 16 — presumably bytes per native
       * instruction; confirm against the encoding.  The IF jump skips past
       * the converted ELSE (+1) to the first instruction of the else block.
       */
      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);
      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);
   } else {
      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
   }
}
1529
1530/**
1531 * Patch IF and ELSE instructions with appropriate jump targets.
1532 */
1533static void
1534patch_IF_ELSE(struct brw_codegen *p,
1535              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
1536{
1537   const struct intel_device_info *devinfo = p->devinfo;
1538
1539   /* We shouldn't be patching IF and ELSE instructions in single program flow
1540    * mode when gen < 6, because in single program flow mode on those
1541    * platforms, we convert flow control instructions to conditional ADDs that
1542    * operate on IP (see brw_ENDIF).
1543    *
1544    * However, on Gfx6, writing to IP doesn't work in single program flow mode
1545    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
1546    * not be updated by non-flow control instructions.").  And on later
1547    * platforms, there is no significant benefit to converting control flow
1548    * instructions to conditional ADDs.  So we do patch IF and ELSE
1549    * instructions in single program flow mode on those platforms.
1550    */
1551   if (devinfo->ver < 6)
1552      assert(!p->single_program_flow);
1553
1554   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
1555   assert(endif_inst != NULL);
1556   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
1557
1558   unsigned br = brw_jump_scale(devinfo);
1559
1560   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
1561   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
1562
1563   if (else_inst == NULL) {
1564      /* Patch IF -> ENDIF */
1565      if (devinfo->ver < 6) {
1566	 /* Turn it into an IFF, which means no mask stack operations for
1567	  * all-false and jumping past the ENDIF.
1568	  */
1569         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
1570         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
1571                                      br * (endif_inst - if_inst + 1));
1572         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
1573      } else if (devinfo->ver == 6) {
1574	 /* As of gfx6, there is no IFF and IF must point to the ENDIF. */
1575         brw_inst_set_gfx6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
1576      } else {
1577         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1578         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
1579      }
1580   } else {
1581      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
1582
1583      /* Patch IF -> ELSE */
1584      if (devinfo->ver < 6) {
1585         brw_inst_set_gfx4_jump_count(devinfo, if_inst,
1586                                      br * (else_inst - if_inst));
1587         brw_inst_set_gfx4_pop_count(devinfo, if_inst, 0);
1588      } else if (devinfo->ver == 6) {
1589         brw_inst_set_gfx6_jump_count(devinfo, if_inst,
1590                                      br * (else_inst - if_inst + 1));
1591      }
1592
1593      /* Patch ELSE -> ENDIF */
1594      if (devinfo->ver < 6) {
1595	 /* BRW_OPCODE_ELSE pre-gfx6 should point just past the
1596	  * matching ENDIF.
1597	  */
1598         brw_inst_set_gfx4_jump_count(devinfo, else_inst,
1599                                      br * (endif_inst - else_inst + 1));
1600         brw_inst_set_gfx4_pop_count(devinfo, else_inst, 1);
1601      } else if (devinfo->ver == 6) {
1602	 /* BRW_OPCODE_ELSE on gfx6 should point to the matching ENDIF. */
1603         brw_inst_set_gfx6_jump_count(devinfo, else_inst,
1604                                      br * (endif_inst - else_inst));
1605      } else {
1606	 /* The IF instruction's JIP should point just past the ELSE */
1607         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
1608	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
1609         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
1610         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
1611         if (devinfo->ver >= 8) {
1612            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
1613             * should point to ENDIF.
1614             */
1615            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
1616         }
1617      }
1618   }
1619}
1620
1621void
1622brw_ELSE(struct brw_codegen *p)
1623{
1624   const struct intel_device_info *devinfo = p->devinfo;
1625   brw_inst *insn;
1626
1627   insn = next_insn(p, BRW_OPCODE_ELSE);
1628
1629   if (devinfo->ver < 6) {
1630      brw_set_dest(p, insn, brw_ip_reg());
1631      brw_set_src0(p, insn, brw_ip_reg());
1632      brw_set_src1(p, insn, brw_imm_d(0x0));
1633   } else if (devinfo->ver == 6) {
1634      brw_set_dest(p, insn, brw_imm_w(0));
1635      brw_inst_set_gfx6_jump_count(devinfo, insn, 0);
1636      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1637      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1638   } else if (devinfo->ver == 7) {
1639      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1640      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1641      brw_set_src1(p, insn, brw_imm_w(0));
1642      brw_inst_set_jip(devinfo, insn, 0);
1643      brw_inst_set_uip(devinfo, insn, 0);
1644   } else {
1645      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1646      if (devinfo->ver < 12)
1647         brw_set_src0(p, insn, brw_imm_d(0));
1648      brw_inst_set_jip(devinfo, insn, 0);
1649      brw_inst_set_uip(devinfo, insn, 0);
1650   }
1651
1652   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1653   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
1654   if (!p->single_program_flow && devinfo->ver < 6)
1655      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
1656
1657   push_if_stack(p, insn);
1658}
1659
/**
 * Close the innermost IF/ELSE construct: pop the pending IF (and optional
 * ELSE) off the if-stack, emit an ENDIF (unless pre-gfx6 single-program-flow
 * mode rewrites the whole construct into IP-relative ADDs instead), and
 * patch the jump targets via patch_IF_ELSE().
 */
void
brw_ENDIF(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = NULL;
   brw_inst *else_inst = NULL;
   brw_inst *if_inst = NULL;
   brw_inst *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gfx6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gfx6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gfx4 and
    * Gfx5.
    */
   if (devinfo->ver < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory(p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Per-generation operand encoding for ENDIF. */
   if (devinfo->ver < 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (devinfo->ver == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else if (devinfo->ver == 7) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_w(0));
   } else {
      brw_set_src0(p, insn, brw_imm_d(0));
   }

   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
   if (devinfo->ver < 6)
      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);

   /* Also pop item off the stack in the endif instruction: */
   if (devinfo->ver < 6) {
      brw_inst_set_gfx4_jump_count(devinfo, insn, 0);
      brw_inst_set_gfx4_pop_count(devinfo, insn, 1);
   } else if (devinfo->ver == 6) {
      /* NOTE(review): the constant 2 appears to encode "the next
       * instruction" in scaled jump units — confirm against
       * brw_jump_scale().
       */
      brw_inst_set_gfx6_jump_count(devinfo, insn, 2);
   } else {
      brw_inst_set_jip(devinfo, insn, 2);
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1740
1741brw_inst *
1742brw_BREAK(struct brw_codegen *p)
1743{
1744   const struct intel_device_info *devinfo = p->devinfo;
1745   brw_inst *insn;
1746
1747   insn = next_insn(p, BRW_OPCODE_BREAK);
1748   if (devinfo->ver >= 8) {
1749      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1750      brw_set_src0(p, insn, brw_imm_d(0x0));
1751   } else if (devinfo->ver >= 6) {
1752      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1753      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1754      brw_set_src1(p, insn, brw_imm_d(0x0));
1755   } else {
1756      brw_set_dest(p, insn, brw_ip_reg());
1757      brw_set_src0(p, insn, brw_ip_reg());
1758      brw_set_src1(p, insn, brw_imm_d(0x0));
1759      brw_inst_set_gfx4_pop_count(devinfo, insn,
1760                                  p->if_depth_in_loop[p->loop_stack_depth]);
1761   }
1762   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1763   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1764
1765   return insn;
1766}
1767
1768brw_inst *
1769brw_CONT(struct brw_codegen *p)
1770{
1771   const struct intel_device_info *devinfo = p->devinfo;
1772   brw_inst *insn;
1773
1774   insn = next_insn(p, BRW_OPCODE_CONTINUE);
1775   brw_set_dest(p, insn, brw_ip_reg());
1776   if (devinfo->ver >= 8) {
1777      brw_set_src0(p, insn, brw_imm_d(0x0));
1778   } else {
1779      brw_set_src0(p, insn, brw_ip_reg());
1780      brw_set_src1(p, insn, brw_imm_d(0x0));
1781   }
1782
1783   if (devinfo->ver < 6) {
1784      brw_inst_set_gfx4_pop_count(devinfo, insn,
1785                                  p->if_depth_in_loop[p->loop_stack_depth]);
1786   }
1787   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1788   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1789   return insn;
1790}
1791
1792brw_inst *
1793brw_HALT(struct brw_codegen *p)
1794{
1795   const struct intel_device_info *devinfo = p->devinfo;
1796   brw_inst *insn;
1797
1798   insn = next_insn(p, BRW_OPCODE_HALT);
1799   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1800   if (devinfo->ver < 6) {
1801      /* From the Gfx4 PRM:
1802       *
1803       *    "IP register must be put (for example, by the assembler) at <dst>
1804       *    and <src0> locations.
1805       */
1806      brw_set_dest(p, insn, brw_ip_reg());
1807      brw_set_src0(p, insn, brw_ip_reg());
1808      brw_set_src1(p, insn, brw_imm_d(0x0)); /* exitcode updated later. */
1809   } else if (devinfo->ver < 8) {
1810      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1811      brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1812   } else if (devinfo->ver < 12) {
1813      brw_set_src0(p, insn, brw_imm_d(0x0));
1814   }
1815
1816   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1817   brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));
1818   return insn;
1819}
1820
1821/* DO/WHILE loop:
1822 *
1823 * The DO/WHILE is just an unterminated loop -- break or continue are
1824 * used for control within the loop.  We have a few ways they can be
1825 * done.
1826 *
1827 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1828 * jip and no DO instruction.
1829 *
1830 * For non-uniform control flow pre-gfx6, there's a DO instruction to
1831 * push the mask, and a WHILE to jump back, and BREAK to get out and
1832 * pop the mask.
1833 *
1834 * For gfx6, there's no more mask stack, so no need for DO.  WHILE
1835 * just points back to the first instruction of the loop.
1836 */
1837brw_inst *
1838brw_DO(struct brw_codegen *p, unsigned execute_size)
1839{
1840   const struct intel_device_info *devinfo = p->devinfo;
1841
1842   if (devinfo->ver >= 6 || p->single_program_flow) {
1843      push_loop_stack(p, &p->store[p->nr_insn]);
1844      return &p->store[p->nr_insn];
1845   } else {
1846      brw_inst *insn = next_insn(p, BRW_OPCODE_DO);
1847
1848      push_loop_stack(p, insn);
1849
1850      /* Override the defaults for this instruction:
1851       */
1852      brw_set_dest(p, insn, brw_null_reg());
1853      brw_set_src0(p, insn, brw_null_reg());
1854      brw_set_src1(p, insn, brw_null_reg());
1855
1856      brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
1857      brw_inst_set_exec_size(devinfo, insn, execute_size);
1858      brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE);
1859
1860      return insn;
1861   }
1862}
1863
1864/**
1865 * For pre-gfx6, we patch BREAK/CONT instructions to point at the WHILE
1866 * instruction here.
1867 *
1868 * For gfx6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1869 * nesting, since it can always just point to the end of the block/current loop.
1870 */
1871static void
1872brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
1873{
1874   const struct intel_device_info *devinfo = p->devinfo;
1875   brw_inst *do_inst = get_inner_do_insn(p);
1876   brw_inst *inst;
1877   unsigned br = brw_jump_scale(devinfo);
1878
1879   assert(devinfo->ver < 6);
1880
1881   for (inst = while_inst - 1; inst != do_inst; inst--) {
1882      /* If the jump count is != 0, that means that this instruction has already
1883       * been patched because it's part of a loop inside of the one we're
1884       * patching.
1885       */
1886      if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_BREAK &&
1887          brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
1888         brw_inst_set_gfx4_jump_count(devinfo, inst, br*((while_inst - inst) + 1));
1889      } else if (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_CONTINUE &&
1890                 brw_inst_gfx4_jump_count(devinfo, inst) == 0) {
1891         brw_inst_set_gfx4_jump_count(devinfo, inst, br * (while_inst - inst));
1892      }
1893   }
1894}
1895
/**
 * Emit the WHILE that closes the innermost DO/WHILE loop, pointing its
 * backward jump at the loop top recorded by brw_DO().  On pre-gfx6
 * hardware this also patches pending BREAK/CONT instructions inside the
 * loop (gfx6+ handles those in brw_set_uip_jip() instead), and the loop
 * stack is popped on the way out.
 */
brw_inst *
brw_WHILE(struct brw_codegen *p)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn, *do_insn;
   unsigned br = brw_jump_scale(devinfo);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      /* Operand layout differs per generation; the backward jump (JIP, or
       * the gfx6 jump count) always points at the loop top.
       */
      if (devinfo->ver >= 8) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         if (devinfo->ver < 12)
            brw_set_src0(p, insn, brw_imm_d(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else if (devinfo->ver == 7) {
         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, brw_imm_w(0));
         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
      } else {
         brw_set_dest(p, insn, brw_imm_w(0));
         brw_inst_set_gfx6_jump_count(devinfo, insn, br * (do_insn - insn));
         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      brw_inst_set_exec_size(devinfo, insn, brw_get_default_exec_size(p));

   } else {
      if (p->single_program_flow) {
	 /* SPF: express the loop branch as an ADD on IP, in bytes
	  * (16 bytes per instruction).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

         brw_inst_set_exec_size(devinfo, insn, brw_inst_exec_size(devinfo, do_insn));
         brw_inst_set_gfx4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
         brw_inst_set_gfx4_pop_count(devinfo, insn, 0);

	 brw_patch_break_cont(p, insn);
      }
   }
   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);

   p->loop_stack_depth--;

   return insn;
}
1958
1959/* FORWARD JUMPS:
1960 */
1961void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
1962{
1963   const struct intel_device_info *devinfo = p->devinfo;
1964   brw_inst *jmp_insn = &p->store[jmp_insn_idx];
1965   unsigned jmpi = 1;
1966
1967   if (devinfo->ver >= 5)
1968      jmpi = 2;
1969
1970   assert(brw_inst_opcode(devinfo, jmp_insn) == BRW_OPCODE_JMPI);
1971   assert(brw_inst_src1_reg_file(devinfo, jmp_insn) == BRW_IMMEDIATE_VALUE);
1972
1973   brw_inst_set_gfx4_jump_count(devinfo, jmp_insn,
1974                                jmpi * (p->nr_insn - jmp_insn_idx - 1));
1975}
1976
1977/* To integrate with the above, it makes sense that the comparison
1978 * instruction should populate the flag register.  It might be simpler
1979 * just to use the flag reg for most WM tasks?
1980 */
1981void brw_CMP(struct brw_codegen *p,
1982	     struct brw_reg dest,
1983	     unsigned conditional,
1984	     struct brw_reg src0,
1985	     struct brw_reg src1)
1986{
1987   const struct intel_device_info *devinfo = p->devinfo;
1988   brw_inst *insn = next_insn(p, BRW_OPCODE_CMP);
1989
1990   brw_inst_set_cond_modifier(devinfo, insn, conditional);
1991   brw_set_dest(p, insn, dest);
1992   brw_set_src0(p, insn, src0);
1993   brw_set_src1(p, insn, src1);
1994
1995   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
1996    * page says:
1997    *    "Any CMP instruction with a null destination must use a {switch}."
1998    *
1999    * It also applies to other Gfx7 platforms (IVB, BYT) even though it isn't
2000    * mentioned on their work-arounds pages.
2001    */
2002   if (devinfo->ver == 7) {
2003      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2004          dest.nr == BRW_ARF_NULL) {
2005         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
2006      }
2007   }
2008}
2009
2010void brw_CMPN(struct brw_codegen *p,
2011              struct brw_reg dest,
2012              unsigned conditional,
2013              struct brw_reg src0,
2014              struct brw_reg src1)
2015{
2016   const struct intel_device_info *devinfo = p->devinfo;
2017   brw_inst *insn = next_insn(p, BRW_OPCODE_CMPN);
2018
2019   brw_inst_set_cond_modifier(devinfo, insn, conditional);
2020   brw_set_dest(p, insn, dest);
2021   brw_set_src0(p, insn, src0);
2022   brw_set_src1(p, insn, src1);
2023
2024   /* Page 166 of the Ivy Bridge PRM Volume 4 part 3 (Execution Unit ISA)
2025    * says:
2026    *
2027    *    If the destination is the null register, the {Switch} instruction
2028    *    option must be used.
2029    *
2030    * Page 77 of the Haswell PRM Volume 2b contains the same text.
2031    */
2032   if (devinfo->ver == 7) {
2033      if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
2034          dest.nr == BRW_ARF_NULL) {
2035         brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
2036      }
2037   }
2038}
2039
2040/***********************************************************************
2041 * Helpers for the various SEND message types:
2042 */
2043
2044/** Extended math function, float[8].
2045 */
2046void gfx4_math(struct brw_codegen *p,
2047	       struct brw_reg dest,
2048	       unsigned function,
2049	       unsigned msg_reg_nr,
2050	       struct brw_reg src,
2051	       unsigned precision )
2052{
2053   const struct intel_device_info *devinfo = p->devinfo;
2054   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2055   unsigned data_type;
2056   if (has_scalar_region(src)) {
2057      data_type = BRW_MATH_DATA_SCALAR;
2058   } else {
2059      data_type = BRW_MATH_DATA_VECTOR;
2060   }
2061
2062   assert(devinfo->ver < 6);
2063
2064   /* Example code doesn't set predicate_control for send
2065    * instructions.
2066    */
2067   brw_inst_set_pred_control(devinfo, insn, 0);
2068   brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2069
2070   brw_set_dest(p, insn, dest);
2071   brw_set_src0(p, insn, src);
2072   brw_set_math_message(p,
2073                        insn,
2074                        function,
2075                        src.type == BRW_REGISTER_TYPE_D,
2076                        precision,
2077                        data_type);
2078}
2079
/**
 * Extended math function on gfx6+, emitted as a native MATH instruction
 * (contrast gfx4_math above, which goes through a message send).
 */
void gfx6_math(struct brw_codegen *p,
	       struct brw_reg dest,
	       unsigned function,
	       struct brw_reg src0,
	       struct brw_reg src1)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(devinfo->ver >= 6);

   /* MATH may only write a GRF (or, from gfx7, an MRF). */
   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          (devinfo->ver >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (devinfo->ver == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      /* Integer division requires non-float operands, and src1 must be a
       * GRF (or an immediate from gfx8 on).
       */
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
             (devinfo->ver >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
      /* From BSpec 6647/47428 "[Instruction] Extended Math Function":
       *     INT DIV function does not support source modifiers.
       */
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   } else {
      /* All other math functions take float (or, from gfx9, half-float)
       * sources.
       */
      assert(src0.type == BRW_REGISTER_TYPE_F ||
             (src0.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
      assert(src1.type == BRW_REGISTER_TYPE_F ||
             (src1.type == BRW_REGISTER_TYPE_HF && devinfo->ver >= 9));
   }

   /* Source modifiers are ignored for extended math instructions on Gfx6. */
   if (devinfo->ver == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   brw_inst_set_math_function(devinfo, insn, function);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
2135
2136/**
2137 * Return the right surface index to access the thread scratch space using
2138 * stateless dataport messages.
2139 */
2140unsigned
2141brw_scratch_surface_idx(const struct brw_codegen *p)
2142{
2143   /* The scratch space is thread-local so IA coherency is unnecessary. */
2144   if (p->devinfo->ver >= 8)
2145      return GFX8_BTI_STATELESS_NON_COHERENT;
2146   else
2147      return BRW_BTI_STATELESS;
2148}
2149
2150/**
2151 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
2152 * using a constant offset per channel.
2153 *
2154 * The offset must be aligned to oword size (16 bytes).  Used for
2155 * register spilling.
2156 */
2157void brw_oword_block_write_scratch(struct brw_codegen *p,
2158				   struct brw_reg mrf,
2159				   int num_regs,
2160				   unsigned offset)
2161{
2162   const struct intel_device_info *devinfo = p->devinfo;
2163   const unsigned target_cache =
2164      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
2165       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2166       BRW_SFID_DATAPORT_WRITE);
2167   const struct tgl_swsb swsb = brw_get_default_swsb(p);
2168   uint32_t msg_type;
2169
2170   if (devinfo->ver >= 6)
2171      offset /= 16;
2172
2173   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2174
2175   const unsigned mlen = 1 + num_regs;
2176
2177   /* Set up the message header.  This is g0, with g0.2 filled with
2178    * the offset.  We don't want to leave our offset around in g0 or
2179    * it'll screw up texture samples, so set it up inside the message
2180    * reg.
2181    */
2182   {
2183      brw_push_insn_state(p);
2184      brw_set_default_exec_size(p, BRW_EXECUTE_8);
2185      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2186      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2187      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2188
2189      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2190
2191      /* set message header global offset field (reg 0, element 2) */
2192      brw_set_default_exec_size(p, BRW_EXECUTE_1);
2193      brw_set_default_swsb(p, tgl_swsb_null());
2194      brw_MOV(p,
2195	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2196				  mrf.nr,
2197				  2), BRW_REGISTER_TYPE_UD),
2198	      brw_imm_ud(offset));
2199
2200      brw_pop_insn_state(p);
2201      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2202   }
2203
2204   {
2205      struct brw_reg dest;
2206      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2207      int send_commit_msg;
2208      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
2209					 BRW_REGISTER_TYPE_UW);
2210
2211      brw_inst_set_sfid(devinfo, insn, target_cache);
2212      brw_inst_set_compression(devinfo, insn, false);
2213
2214      if (brw_inst_exec_size(devinfo, insn) >= 16)
2215	 src_header = vec16(src_header);
2216
2217      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
2218      if (devinfo->ver < 6)
2219         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2220
2221      /* Until gfx6, writes followed by reads from the same location
2222       * are not guaranteed to be ordered unless write_commit is set.
2223       * If set, then a no-op write is issued to the destination
2224       * register to set a dependency, and a read from the destination
2225       * can be used to ensure the ordering.
2226       *
2227       * For gfx6, only writes between different threads need ordering
2228       * protection.  Our use of DP writes is all about register
2229       * spilling within a thread.
2230       */
2231      if (devinfo->ver >= 6) {
2232	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2233	 send_commit_msg = 0;
2234      } else {
2235	 dest = src_header;
2236	 send_commit_msg = 1;
2237      }
2238
2239      brw_set_dest(p, insn, dest);
2240      if (devinfo->ver >= 6) {
2241	 brw_set_src0(p, insn, mrf);
2242      } else {
2243	 brw_set_src0(p, insn, brw_null_reg());
2244      }
2245
2246      if (devinfo->ver >= 6)
2247	 msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2248      else
2249	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
2250
2251      brw_set_desc(p, insn,
2252                   brw_message_desc(devinfo, mlen, send_commit_msg, true) |
2253                   brw_dp_write_desc(devinfo, brw_scratch_surface_idx(p),
2254                                     BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2255                                     msg_type, send_commit_msg));
2256   }
2257}
2258
2259
2260/**
2261 * Read a block of owords (half a GRF each) from the scratch buffer
2262 * using a constant index per channel.
2263 *
2264 * Offset must be aligned to oword size (16 bytes).  Used for register
2265 * spilling.
2266 */
2267void
2268brw_oword_block_read_scratch(struct brw_codegen *p,
2269			     struct brw_reg dest,
2270			     struct brw_reg mrf,
2271			     int num_regs,
2272			     unsigned offset)
2273{
2274   const struct intel_device_info *devinfo = p->devinfo;
2275   const struct tgl_swsb swsb = brw_get_default_swsb(p);
2276
2277   if (devinfo->ver >= 6)
2278      offset /= 16;
2279
2280   if (p->devinfo->ver >= 7) {
2281      /* On gen 7 and above, we no longer have message registers and we can
2282       * send from any register we want.  By using the destination register
2283       * for the message, we guarantee that the implied message write won't
2284       * accidentally overwrite anything.  This has been a problem because
2285       * the MRF registers and source for the final FB write are both fixed
2286       * and may overlap.
2287       */
2288      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
2289   } else {
2290      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2291   }
2292   dest = retype(dest, BRW_REGISTER_TYPE_UW);
2293
2294   const unsigned rlen = num_regs;
2295   const unsigned target_cache =
2296      (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
2297       devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
2298       BRW_SFID_DATAPORT_READ);
2299
2300   {
2301      brw_push_insn_state(p);
2302      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2303      brw_set_default_exec_size(p, BRW_EXECUTE_8);
2304      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2305      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2306
2307      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2308
2309      /* set message header global offset field (reg 0, element 2) */
2310      brw_set_default_exec_size(p, BRW_EXECUTE_1);
2311      brw_set_default_swsb(p, tgl_swsb_null());
2312      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
2313
2314      brw_pop_insn_state(p);
2315      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2316   }
2317
2318   {
2319      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2320
2321      brw_inst_set_sfid(devinfo, insn, target_cache);
2322      assert(brw_inst_pred_control(devinfo, insn) == 0);
2323      brw_inst_set_compression(devinfo, insn, false);
2324
2325      brw_set_dest(p, insn, dest);	/* UW? */
2326      if (devinfo->ver >= 6) {
2327	 brw_set_src0(p, insn, mrf);
2328      } else {
2329	 brw_set_src0(p, insn, brw_null_reg());
2330         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2331      }
2332
2333      brw_set_desc(p, insn,
2334                   brw_message_desc(devinfo, 1, rlen, true) |
2335                   brw_dp_read_desc(devinfo, brw_scratch_surface_idx(p),
2336                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
2337                                    BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2338                                    BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
2339   }
2340}
2341
/**
 * Read a block of full registers back from the scratch buffer on Gfx7+.
 *
 * \param dest      first register to receive the scratch data
 * \param num_regs  number of registers to read (also the response length)
 * \param offset    byte offset into the scratch buffer; must be a multiple
 *                  of REG_SIZE (converted to HWords below)
 */
void
gfx7_block_read_scratch(struct brw_codegen *p,
                        struct brw_reg dest,
                        int num_regs,
                        unsigned offset)
{
   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
   /* Scratch reads must not be predicated. */
   assert(brw_inst_pred_control(p->devinfo, insn) == BRW_PREDICATE_NONE);

   brw_set_dest(p, insn, retype(dest, BRW_REGISTER_TYPE_UW));

   /* The HW requires that the header is present; this is to get the g0.5
    * scratch offset.
    */
   brw_set_src0(p, insn, brw_vec8_grf(0, 0));

   /* According to the docs, offset is "A 12-bit HWord offset into the memory
    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
    * is 32 bytes, which happens to be the size of a register.
    */
   offset /= REG_SIZE;
   assert(offset < (1 << 12));

   gfx7_set_dp_scratch_message(p, insn,
                               false, /* scratch read */
                               false, /* OWords */
                               false, /* invalidate after read */
                               num_regs,
                               offset,
                               1,        /* mlen: just g0 */
                               num_regs, /* rlen */
                               true);    /* header present */
}
2375
2376/**
2377 * Read float[4] vectors from the data port constant cache.
2378 * Location (in buffer) should be a multiple of 16.
2379 * Used for fetching shader constants.
2380 */
2381void brw_oword_block_read(struct brw_codegen *p,
2382			  struct brw_reg dest,
2383			  struct brw_reg mrf,
2384			  uint32_t offset,
2385			  uint32_t bind_table_index)
2386{
2387   const struct intel_device_info *devinfo = p->devinfo;
2388   const unsigned target_cache =
2389      (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_CONSTANT_CACHE :
2390       BRW_SFID_DATAPORT_READ);
2391   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
2392   const struct tgl_swsb swsb = brw_get_default_swsb(p);
2393
2394   /* On newer hardware, offset is in units of owords. */
2395   if (devinfo->ver >= 6)
2396      offset /= 16;
2397
2398   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2399
2400   brw_push_insn_state(p);
2401   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
2402   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
2403   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
2404
2405   brw_push_insn_state(p);
2406   brw_set_default_exec_size(p, BRW_EXECUTE_8);
2407   brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));
2408   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2409
2410   /* set message header global offset field (reg 0, element 2) */
2411   brw_set_default_exec_size(p, BRW_EXECUTE_1);
2412   brw_set_default_swsb(p, tgl_swsb_null());
2413   brw_MOV(p,
2414	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2415			       mrf.nr,
2416			       2), BRW_REGISTER_TYPE_UD),
2417	   brw_imm_ud(offset));
2418   brw_pop_insn_state(p);
2419
2420   brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
2421
2422   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
2423
2424   brw_inst_set_sfid(devinfo, insn, target_cache);
2425
2426   /* cast dest to a uword[8] vector */
2427   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2428
2429   brw_set_dest(p, insn, dest);
2430   if (devinfo->ver >= 6) {
2431      brw_set_src0(p, insn, mrf);
2432   } else {
2433      brw_set_src0(p, insn, brw_null_reg());
2434      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
2435   }
2436
2437   brw_set_desc(p, insn,
2438                brw_message_desc(devinfo, 1, DIV_ROUND_UP(exec_size, 8), true) |
2439                brw_dp_read_desc(devinfo, bind_table_index,
2440                                 BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
2441                                 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2442                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
2443
2444   brw_pop_insn_state(p);
2445}
2446
/**
 * Emit a framebuffer (render target) write message.
 *
 * Gfx6+ uses SENDC instead of SEND and sends the color payload headerless
 * from GRFs; earlier gens take the payload from MRFs starting at
 * \p payload.nr with \p implied_header as src0.
 *
 * \param msg_control          render-target-write message control bits
 * \param last_render_target   set for the write to the last RT in MRT setups
 * \param eot                  terminate the thread after this message
 * \return the emitted instruction so the caller can tweak it further.
 */
brw_inst *
brw_fb_WRITE(struct brw_codegen *p,
             struct brw_reg payload,
             struct brw_reg implied_header,
             unsigned msg_control,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool eot,
             bool last_render_target,
             bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   brw_inst *insn;
   struct brw_reg dest, src0;

   /* The destination is null; size it to match the execution width. */
   if (brw_get_default_exec_size(p) >= BRW_EXECUTE_16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (devinfo->ver >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   brw_inst_set_sfid(devinfo, insn, target_cache);
   brw_inst_set_compression(devinfo, insn, false);

   if (devinfo->ver >= 6) {
      /* headerless version, just submit color payload */
      src0 = payload;
   } else {
      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
      brw_inst_set_base_mrf(devinfo, insn, payload.nr);
      src0 = implied_header;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_desc(p, insn,
                brw_message_desc(devinfo, msg_length, response_length,
                                 header_present) |
                brw_fb_write_desc(devinfo, binding_table_index, msg_control,
                                  last_render_target,
                                  false /* coarse_write */));
   brw_inst_set_eot(devinfo, insn, eot);

   return insn;
}
2500
/**
 * Emit a Gfx9+ render target read message (SENDC to the render cache).
 *
 * The message header is always present (header_present is hardwired to
 * true in the descriptor below), and the RT slot group is derived from the
 * current default quarter-control group.
 *
 * \param per_sample  read per-sample rather than per-pixel data
 * \return the emitted instruction.
 */
brw_inst *
gfx9_fb_READ(struct brw_codegen *p,
             struct brw_reg dst,
             struct brw_reg payload,
             unsigned binding_table_index,
             unsigned msg_length,
             unsigned response_length,
             bool per_sample)
{
   const struct intel_device_info *devinfo = p->devinfo;
   assert(devinfo->ver >= 9);
   brw_inst *insn = next_insn(p, BRW_OPCODE_SENDC);

   brw_inst_set_sfid(devinfo, insn, GFX6_SFID_DATAPORT_RENDER_CACHE);
   brw_set_dest(p, insn, dst);
   brw_set_src0(p, insn, payload);
   brw_set_desc(
      p, insn,
      brw_message_desc(devinfo, msg_length, response_length, true) |
      brw_fb_read_desc(devinfo, binding_table_index, 0 /* msg_control */,
                       1 << brw_get_default_exec_size(p), per_sample));
   /* Each RT slot group covers 16 channels. */
   brw_inst_set_rt_slot_group(devinfo, insn, brw_get_default_group(p) / 16);

   return insn;
}
2526
2527/**
2528 * Texture sample instruction.
2529 * Note: the msg_type plus msg_length values determine exactly what kind
2530 * of sampling operation is performed.  See volume 4, page 161 of docs.
2531 */
2532void brw_SAMPLE(struct brw_codegen *p,
2533		struct brw_reg dest,
2534		unsigned msg_reg_nr,
2535		struct brw_reg src0,
2536		unsigned binding_table_index,
2537		unsigned sampler,
2538		unsigned msg_type,
2539		unsigned response_length,
2540		unsigned msg_length,
2541		unsigned header_present,
2542		unsigned simd_mode,
2543		unsigned return_format)
2544{
2545   const struct intel_device_info *devinfo = p->devinfo;
2546   brw_inst *insn;
2547
2548   if (msg_reg_nr != -1)
2549      gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
2550
2551   insn = next_insn(p, BRW_OPCODE_SEND);
2552   brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
2553   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
2554
2555   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
2556    *
2557    *    "Instruction compression is not allowed for this instruction (that
2558    *     is, send). The hardware behavior is undefined if this instruction is
2559    *     set as compressed. However, compress control can be set to "SecHalf"
2560    *     to affect the EMask generation."
2561    *
2562    * No similar wording is found in later PRMs, but there are examples
2563    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
2564    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
2565    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
2566    */
2567   brw_inst_set_compression(devinfo, insn, false);
2568
2569   if (devinfo->ver < 6)
2570      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
2571
2572   brw_set_dest(p, insn, dest);
2573   brw_set_src0(p, insn, src0);
2574   brw_set_desc(p, insn,
2575                brw_message_desc(devinfo, msg_length, response_length,
2576                                 header_present) |
2577                brw_sampler_desc(devinfo, binding_table_index, sampler,
2578                                 msg_type, simd_mode, return_format));
2579}
2580
/* Adjust the message header's sampler state pointer to
 * select the correct group of 16 samplers.
 */
void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
                                      struct brw_reg header,
                                      struct brw_reg sampler_index)
{
   /* The "Sampler Index" field can only store values between 0 and 15.
    * However, we can add an offset to the "Sampler State Pointer"
    * field, effectively selecting a different set of 16 samplers.
    *
    * The "Sampler State Pointer" needs to be aligned to a 32-byte
    * offset, and each sampler state is only 16-bytes, so we can't
    * exclusively use the offset - we have to use both.
    */

   const struct intel_device_info *devinfo = p->devinfo;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      const int sampler_state_size = 16; /* 16 bytes */
      uint32_t sampler = sampler_index.ud;

      if (sampler >= 16) {
         /* Samplers >= 16 require HSW+; bump the state pointer (header
          * element 3) by whole groups of 16 sampler states.
          */
         assert(devinfo->verx10 >= 75);
         brw_ADD(p,
                 get_element_ud(header, 3),
                 get_element_ud(brw_vec8_grf(0, 0), 3),
                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
      }
   } else {
      /* Non-const sampler array indexing case */
      if (devinfo->verx10 <= 70) {
         /* NOTE(review): dynamic indexing past 16 samplers appears
          * unsupported before HSW, so nothing to adjust here — confirm.
          */
         return;
      }

      struct brw_reg temp = get_element_ud(header, 3);

      /* temp = ((sampler & 0xf0) << 4) selects the 16-sampler group, i.e.
       * (sampler / 16) * 16 * sampler_state_size, matching the immediate
       * case above.
       */
      brw_push_insn_state(p);
      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
      brw_set_default_swsb(p, tgl_swsb_regdist(1));
      brw_SHL(p, temp, temp, brw_imm_ud(4));
      brw_ADD(p,
              get_element_ud(header, 3),
              get_element_ud(brw_vec8_grf(0, 0), 3),
              temp);
      brw_pop_insn_state(p);
   }
}
2629
/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 *
 * Emits a SEND to the URB shared function.  On Gfx7+, unless the caller
 * provides its own channel masks, the masks in the message header (MRF
 * element 5) are enabled here before the SEND is emitted.
 */
void brw_urb_WRITE(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
                   enum brw_urb_write_flags flags,
		   unsigned msg_length,
		   unsigned response_length,
		   unsigned offset,
		   unsigned swizzle)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (devinfo->ver >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF(devinfo->ver));

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_urb_message(p,
		       insn,
		       flags,
		       msg_length,
		       response_length,
		       offset,
		       swizzle);
}
2681
/**
 * Emit a SEND with a possibly non-immediate message descriptor.
 *
 * If \p desc is an immediate, it is ORed with \p desc_imm and encoded
 * directly in the instruction.  Otherwise the combined descriptor is first
 * loaded into address register a0.0 with an OR, and the SEND sources its
 * descriptor from there — via src1 before Gfx12, or via the "descriptor is
 * a register" select bit on Gfx12+.
 */
void
brw_send_indirect_message(struct brw_codegen *p,
                          unsigned sfid,
                          struct brw_reg dst,
                          struct brw_reg payload,
                          struct brw_reg desc,
                          unsigned desc_imm,
                          bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
      brw_set_desc(p, send, desc.ud | desc_imm);
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* The a0.0 load must be a scalar, unmasked, unpredicated Align1
       * instruction regardless of the caller's defaults.
       */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
      send = next_insn(p, BRW_OPCODE_SEND);
      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));

      if (devinfo->ver >= 12)
         brw_inst_set_send_sel_reg32_desc(devinfo, send, true);
      else
         brw_set_src1(p, send, addr);
   }

   brw_set_dest(p, send, dst);
   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2735
/**
 * Emit a split-payload send (SENDS, or plain SEND on Gfx12+) with possibly
 * non-immediate descriptor and extended descriptor.
 *
 * Either descriptor may be an immediate or a register.  Register
 * descriptors are combined with their immediate parts into address
 * registers (a0.0 for desc, a0.2 for ex_desc) before the send is emitted.
 * Note that an *immediate* ex_desc can still be forced into a register on
 * pre-Gfx12 hardware when it uses bits 15:12, which those encodings lack.
 */
void
brw_send_indirect_split_message(struct brw_codegen *p,
                                unsigned sfid,
                                struct brw_reg dst,
                                struct brw_reg payload0,
                                struct brw_reg payload1,
                                struct brw_reg desc,
                                unsigned desc_imm,
                                struct brw_reg ex_desc,
                                unsigned ex_desc_imm,
                                bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   struct brw_inst *send;

   dst = retype(dst, BRW_REGISTER_TYPE_UW);

   assert(desc.type == BRW_REGISTER_TYPE_UD);

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      desc.ud |= desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* Scalar, unmasked, unpredicated Align1 state for the a0.0 load. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect descriptor to an address register using OR so the
       * caller can specify additional descriptor bits with the desc_imm
       * immediate.
       */
      brw_OR(p, addr, desc, brw_imm_ud(desc_imm));

      brw_pop_insn_state(p);
      desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE &&
       (devinfo->ver >= 12 ||
        ((ex_desc.ud | ex_desc_imm) & INTEL_MASK(15, 12)) == 0)) {
      ex_desc.ud |= ex_desc_imm;
   } else {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(2), BRW_REGISTER_TYPE_UD);

      /* Scalar, unmasked, unpredicated Align1 state for the a0.2 load. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Load the indirect extended descriptor to an address register using OR
       * so the caller can specify additional descriptor bits with the
       * desc_imm immediate.
       *
       * Even though the instruction dispatcher always pulls the SFID and EOT
       * fields from the instruction itself, actual external unit which
       * processes the message gets the SFID and EOT from the extended
       * descriptor which comes from the address register.  If we don't OR
       * those two bits in, the external unit may get confused and hang.
       */
      unsigned imm_part = ex_desc_imm | sfid | eot << 5;

      if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
         /* ex_desc bits 15:12 don't exist in the instruction encoding prior
          * to Gfx12, so we may have fallen back to an indirect extended
          * descriptor.
          */
         brw_MOV(p, addr, brw_imm_ud(ex_desc.ud | imm_part));
      } else {
         brw_OR(p, addr, ex_desc, brw_imm_ud(imm_part));
      }

      brw_pop_insn_state(p);
      ex_desc = addr;

      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   send = next_insn(p, devinfo->ver >= 12 ? BRW_OPCODE_SEND : BRW_OPCODE_SENDS);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, retype(payload0, BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, send, retype(payload1, BRW_REGISTER_TYPE_UD));

   if (desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 0);
      brw_inst_set_send_desc(devinfo, send, desc.ud);
   } else {
      /* Register descriptors must come from a0.0. */
      assert(desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(desc.nr == BRW_ARF_ADDRESS);
      assert(desc.subnr == 0);
      brw_inst_set_send_sel_reg32_desc(devinfo, send, 1);
   }

   if (ex_desc.file == BRW_IMMEDIATE_VALUE) {
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 0);
      brw_inst_set_sends_ex_desc(devinfo, send, ex_desc.ud);
   } else {
      /* Register ex-descriptors must be a DWord-aligned address subregister. */
      assert(ex_desc.file == BRW_ARCHITECTURE_REGISTER_FILE);
      assert(ex_desc.nr == BRW_ARF_ADDRESS);
      assert((ex_desc.subnr & 0x3) == 0);
      brw_inst_set_send_sel_reg32_ex_desc(devinfo, send, 1);
      brw_inst_set_send_ex_desc_ia_subreg_nr(devinfo, send, ex_desc.subnr >> 2);
   }

   brw_inst_set_sfid(devinfo, send, sfid);
   brw_inst_set_eot(devinfo, send, eot);
}
2852
/**
 * Emit a surface read/write/atomic SEND whose binding-table index may be a
 * register.
 *
 * A non-immediate \p surface is masked to its low 8 bits and loaded into
 * a0.0 first; the send then uses it as an indirect descriptor.  The
 * remaining descriptor bits come from \p desc_imm.
 */
static void
brw_send_indirect_surface_message(struct brw_codegen *p,
                                  unsigned sfid,
                                  struct brw_reg dst,
                                  struct brw_reg payload,
                                  struct brw_reg surface,
                                  unsigned desc_imm)
{
   if (surface.file != BRW_IMMEDIATE_VALUE) {
      const struct tgl_swsb swsb = brw_get_default_swsb(p);
      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);

      /* Scalar, unmasked, unpredicated Align1 state for the a0.0 load. */
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_1);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_swsb(p, tgl_swsb_src_dep(swsb));

      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
       * some surface array is accessed out of bounds.
       */
      brw_AND(p, addr,
              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
                        BRW_GET_SWZ(surface.swizzle, 0)),
              brw_imm_ud(0xff));

      brw_pop_insn_state(p);

      surface = addr;
      brw_set_default_swsb(p, tgl_swsb_dst_dep(swsb, 1));
   }

   brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc_imm, false);
}
2888
2889static bool
2890while_jumps_before_offset(const struct intel_device_info *devinfo,
2891                          brw_inst *insn, int while_offset, int start_offset)
2892{
2893   int scale = 16 / brw_jump_scale(devinfo);
2894   int jip = devinfo->ver == 6 ? brw_inst_gfx6_jump_count(devinfo, insn)
2895                               : brw_inst_jip(devinfo, insn);
2896   assert(jip < 0);
2897   return while_offset + jip * scale <= start_offset;
2898}
2899

/* Scan forward from start_offset for the end of the innermost control-flow
 * block containing it: the matching ENDIF, or an ELSE, HALT, or
 * enclosing-loop WHILE at the same nesting depth.  Returns the offset of
 * that instruction, or 0 if none is found before the end of the program.
 */
static int
brw_find_next_block_end(struct brw_codegen *p, int start_offset)
{
   int offset;
   void *store = p->store;
   const struct intel_device_info *devinfo = p->devinfo;

   /* Tracks IF/ENDIF nesting relative to start_offset. */
   int depth = 0;

   for (offset = next_offset(devinfo, store, start_offset);
        offset < p->next_insn_offset;
        offset = next_offset(devinfo, store, offset)) {
      brw_inst *insn = store + offset;

      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_IF:
         depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if (depth == 0)
            return offset;
         depth--;
         break;
      case BRW_OPCODE_WHILE:
         /* If the while doesn't jump before our instruction, it's the end
          * of a sibling do...while loop.  Ignore it.
          */
         if (!while_jumps_before_offset(devinfo, insn, offset, start_offset))
            continue;
         FALLTHROUGH;
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_HALT:
         if (depth == 0)
            return offset;
         break;
      default:
         break;
      }
   }

   return 0;
}
2943
2944/* There is no DO instruction on gfx6, so to find the end of the loop
2945 * we have to see if the loop is jumping back before our start
2946 * instruction.
2947 */
2948static int
2949brw_find_loop_end(struct brw_codegen *p, int start_offset)
2950{
2951   const struct intel_device_info *devinfo = p->devinfo;
2952   int offset;
2953   void *store = p->store;
2954
2955   assert(devinfo->ver >= 6);
2956
2957   /* Always start after the instruction (such as a WHILE) we're trying to fix
2958    * up.
2959    */
2960   for (offset = next_offset(devinfo, store, start_offset);
2961        offset < p->next_insn_offset;
2962        offset = next_offset(devinfo, store, offset)) {
2963      brw_inst *insn = store + offset;
2964
2965      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE) {
2966	 if (while_jumps_before_offset(devinfo, insn, offset, start_offset))
2967	    return offset;
2968      }
2969   }
2970   assert(!"not reached");
2971   return start_offset;
2972}
2973
/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 *
 * JIP/UIP are stored in units of (16 / brw_jump_scale) bytes; "scale"
 * below converts byte distances into those units.
 */
void
brw_set_uip_jip(struct brw_codegen *p, int start_offset)
{
   const struct intel_device_info *devinfo = p->devinfo;
   int offset;
   int br = brw_jump_scale(devinfo);
   int scale = 16 / br;
   void *store = p->store;

   /* Pre-gfx6 control flow uses different fixup mechanisms; nothing to do. */
   if (devinfo->ver < 6)
      return;

   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
      brw_inst *insn = store + offset;
      /* This pass walks in fixed 16-byte steps, so compacted instructions
       * must not appear yet.
       */
      assert(brw_inst_cmpt_control(devinfo, insn) == 0);

      int block_end_offset = brw_find_next_block_end(p, offset);
      switch (brw_inst_opcode(devinfo, insn)) {
      case BRW_OPCODE_BREAK:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 /* Gfx7 UIP points to WHILE; Gfx6 points just after it */
         brw_inst_set_uip(devinfo, insn,
	    (brw_find_loop_end(p, offset) - offset +
             (devinfo->ver == 6 ? 16 : 0)) / scale);
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_offset != 0);
         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
         brw_inst_set_uip(devinfo, insn,
            (brw_find_loop_end(p, offset) - offset) / scale);

         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;

      case BRW_OPCODE_ENDIF: {
         /* An ENDIF with no following block end jumps to the next
          * instruction (a distance of one instruction, i.e. 1 * br units).
          */
         int32_t jump = (block_end_offset == 0) ?
                        1 * br : (block_end_offset - offset) / scale;
         if (devinfo->ver >= 7)
            brw_inst_set_jip(devinfo, insn, jump);
         else
            brw_inst_set_gfx6_jump_count(devinfo, insn, jump);
	 break;
      }

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_offset == 0) {
            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
	 } else {
            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
	 }
         assert(brw_inst_uip(devinfo, insn) != 0);
         assert(brw_inst_jip(devinfo, insn) != 0);
	 break;

      default:
         break;
      }
   }
}
3049
/**
 * Emit an FF_SYNC URB message.
 *
 * NOTE(review): this appears to be the fixed-function synchronization
 * handshake used by geometry shaders (allocate requests a URB handle) —
 * confirm against brw_set_ff_sync_message and its callers.
 *
 * \param allocate         request URB entry allocation in the message
 * \param response_length  registers expected back from the message
 * \param eot              terminate the thread after this message
 */
void brw_ff_sync(struct brw_codegen *p,
		   struct brw_reg dest,
		   unsigned msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   unsigned response_length,
		   bool eot)
{
   const struct intel_device_info *devinfo = p->devinfo;
   brw_inst *insn;

   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (devinfo->ver < 6)
      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);

   brw_set_ff_sync_message(p,
			   insn,
			   allocate,
			   response_length,
			   eot);
}
3077
3078/**
3079 * Emit the SEND instruction necessary to generate stream output data on Gfx6
3080 * (for transform feedback).
3081 *
3082 * If send_commit_msg is true, this is the last piece of stream output data
3083 * from this thread, so send the data as a committed write.  According to the
3084 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
3085 *
3086 *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
3087 *   writes are complete by sending the final write as a committed write."
3088 */
3089void
3090brw_svb_write(struct brw_codegen *p,
3091              struct brw_reg dest,
3092              unsigned msg_reg_nr,
3093              struct brw_reg src0,
3094              unsigned binding_table_index,
3095              bool   send_commit_msg)
3096{
3097   const struct intel_device_info *devinfo = p->devinfo;
3098   assert(devinfo->ver == 6);
3099   const unsigned target_cache = GFX6_SFID_DATAPORT_RENDER_CACHE;
3100   brw_inst *insn;
3101
3102   gfx6_resolve_implied_move(p, &src0, msg_reg_nr);
3103
3104   insn = next_insn(p, BRW_OPCODE_SEND);
3105   brw_inst_set_sfid(devinfo, insn, target_cache);
3106   brw_set_dest(p, insn, dest);
3107   brw_set_src0(p, insn, src0);
3108   brw_set_desc(p, insn,
3109                brw_message_desc(devinfo, 1, send_commit_msg, true) |
3110                brw_dp_write_desc(devinfo, binding_table_index,
3111                                  0, /* msg_control: ignored */
3112                                  GFX6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
3113                                  send_commit_msg)); /* send_commit_msg */
3114}
3115
/* Number of registers needed per channel-vector of a surface message
 * payload or response: one register total for SIMD4x2 (exec_size == 0),
 * one register per channel up to SIMD8, and two per channel beyond that.
 */
static unsigned
brw_surface_payload_size(unsigned num_channels,
                         unsigned exec_size /**< 0 for SIMD4x2 */)
{
   if (exec_size == 0)
      return 1; /* SIMD4x2 packs everything into one register */

   const unsigned regs_per_channel = (exec_size <= 8) ? 1 : 2;
   return regs_per_channel * num_channels;
}
3127
/**
 * Emit an untyped atomic data-port message.
 *
 * \param atomic_op          data-port atomic operation encoding
 * \param msg_length         payload length in registers
 * \param response_expected  whether the pre-op value is returned
 * \param header_present     whether \p payload begins with a message header
 */
void
brw_untyped_atomic(struct brw_codegen *p,
                   struct brw_reg dst,
                   struct brw_reg payload,
                   struct brw_reg surface,
                   unsigned atomic_op,
                   unsigned msg_length,
                   bool response_expected,
                   bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   /* HSW+ moved these messages to data cache 1. */
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped atomic instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   /* exec_size of 0 encodes SIMD4x2 in the descriptor helpers. */
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned response_length =
      brw_surface_payload_size(response_expected, exec_size);
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, response_length, header_present) |
      brw_dp_untyped_atomic_desc(devinfo, exec_size, atomic_op,
                                 response_expected);
   /* Mask out unused components -- This is especially important in Align16
    * mode on generations that don't have native support for SIMD4x2 atomics,
    * because unused but enabled components will cause the dataport to perform
    * additional atomic operations on the addresses that happen to be in the
    * uninitialized Y, Z and W coordinates of the payload.
    */
   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
                                     payload, surface, desc);
}
3164
3165void
3166brw_untyped_surface_read(struct brw_codegen *p,
3167                         struct brw_reg dst,
3168                         struct brw_reg payload,
3169                         struct brw_reg surface,
3170                         unsigned msg_length,
3171                         unsigned num_channels)
3172{
3173   const struct intel_device_info *devinfo = p->devinfo;
3174   const unsigned sfid = (devinfo->verx10 >= 75 ?
3175                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3176                          GFX7_SFID_DATAPORT_DATA_CACHE);
3177   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
3178   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) : 0;
3179   const unsigned response_length =
3180      brw_surface_payload_size(num_channels, exec_size);
3181   const unsigned desc =
3182      brw_message_desc(devinfo, msg_length, response_length, false) |
3183      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, false);
3184
3185   brw_send_indirect_surface_message(p, sfid, dst, payload, surface, desc);
3186}
3187
/**
 * Emit an untyped surface write message storing \p num_channels channels
 * per element.  Writes return no data, so the response length is 0.
 */
void
brw_untyped_surface_write(struct brw_codegen *p,
                          struct brw_reg payload,
                          struct brw_reg surface,
                          unsigned msg_length,
                          unsigned num_channels,
                          bool header_present)
{
   const struct intel_device_info *devinfo = p->devinfo;
   /* HSW+ moved these messages to data cache 1. */
   const unsigned sfid = (devinfo->verx10 >= 75 ?
                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
                          GFX7_SFID_DATAPORT_DATA_CACHE);
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   /* SIMD4x2 untyped surface write instructions only exist on HSW+ */
   const bool has_simd4x2 = devinfo->verx10 >= 75;
   /* exec_size of 0 encodes SIMD4x2 in the descriptor helpers. */
   const unsigned exec_size = align1 ? 1 << brw_get_default_exec_size(p) :
                              has_simd4x2 ? 0 : 8;
   const unsigned desc =
      brw_message_desc(devinfo, msg_length, 0, header_present) |
      brw_dp_untyped_surface_rw_desc(devinfo, exec_size, num_channels, true);
   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
   const unsigned mask = !has_simd4x2 && !align1 ? WRITEMASK_X : WRITEMASK_XYZW;

   brw_send_indirect_surface_message(p, sfid, brw_writemask(brw_null_reg(), mask),
                                     payload, surface, desc);
}
3214
3215static void
3216brw_set_memory_fence_message(struct brw_codegen *p,
3217                             struct brw_inst *insn,
3218                             enum brw_message_target sfid,
3219                             bool commit_enable,
3220                             unsigned bti)
3221{
3222   const struct intel_device_info *devinfo = p->devinfo;
3223
3224   brw_set_desc(p, insn, brw_message_desc(
3225                   devinfo, 1, (commit_enable ? 1 : 0), true));
3226
3227   brw_inst_set_sfid(devinfo, insn, sfid);
3228
3229   switch (sfid) {
3230   case GFX6_SFID_DATAPORT_RENDER_CACHE:
3231      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_RC_MEMORY_FENCE);
3232      break;
3233   case GFX7_SFID_DATAPORT_DATA_CACHE:
3234      brw_inst_set_dp_msg_type(devinfo, insn, GFX7_DATAPORT_DC_MEMORY_FENCE);
3235      break;
3236   default:
3237      unreachable("Not reached");
3238   }
3239
3240   if (commit_enable)
3241      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
3242
3243   assert(devinfo->ver >= 11 || bti == 0);
3244   brw_inst_set_binding_table_index(devinfo, insn, bti);
3245}
3246
3247static void
3248gfx12_set_memory_fence_message(struct brw_codegen *p,
3249                               struct brw_inst *insn,
3250                               enum brw_message_target sfid)
3251{
3252   const unsigned mlen = 1; /* g0 header */
3253    /* Completion signaled by write to register. No data returned. */
3254   const unsigned rlen = 1;
3255
3256   brw_inst_set_sfid(p->devinfo, insn, sfid);
3257
3258   if (sfid == BRW_SFID_URB) {
3259      brw_set_desc(p, insn, brw_urb_fence_desc(p->devinfo) |
3260                            brw_message_desc(p->devinfo, mlen, rlen, false));
3261   } else {
3262      enum lsc_fence_scope scope = LSC_FENCE_THREADGROUP;
3263      enum lsc_flush_type flush_type = LSC_FLUSH_TYPE_NONE;
3264
3265      if (sfid == GFX12_SFID_TGM) {
3266         scope = LSC_FENCE_TILE;
3267         flush_type = LSC_FLUSH_TYPE_EVICT;
3268      }
3269
3270      brw_set_desc(p, insn, lsc_fence_msg_desc(p->devinfo, scope,
3271                                               flush_type, false) |
3272                            brw_message_desc(p->devinfo, mlen, rlen, false));
3273   }
3274}
3275
3276void
3277brw_memory_fence(struct brw_codegen *p,
3278                 struct brw_reg dst,
3279                 struct brw_reg src,
3280                 enum opcode send_op,
3281                 enum brw_message_target sfid,
3282                 bool commit_enable,
3283                 unsigned bti)
3284{
3285   const struct intel_device_info *devinfo = p->devinfo;
3286
3287   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
3288   src = retype(vec1(src), BRW_REGISTER_TYPE_UD);
3289
3290   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
3291    * message doesn't write anything back.
3292    */
3293   struct brw_inst *insn = next_insn(p, send_op);
3294   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3295   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3296   brw_set_dest(p, insn, dst);
3297   brw_set_src0(p, insn, src);
3298
3299   /* All DG2 hardware requires LSC for fence messages, even A-step */
3300   if (devinfo->has_lsc)
3301      gfx12_set_memory_fence_message(p, insn, sfid);
3302   else
3303      brw_set_memory_fence_message(p, insn, sfid, commit_enable, bti);
3304}
3305
3306void
3307brw_pixel_interpolator_query(struct brw_codegen *p,
3308                             struct brw_reg dest,
3309                             struct brw_reg mrf,
3310                             bool noperspective,
3311                             bool coarse_pixel_rate,
3312                             unsigned mode,
3313                             struct brw_reg data,
3314                             unsigned msg_length,
3315                             unsigned response_length)
3316{
3317   const struct intel_device_info *devinfo = p->devinfo;
3318   const uint16_t exec_size = brw_get_default_exec_size(p);
3319   const unsigned slot_group = brw_get_default_group(p) / 16;
3320   const unsigned simd_mode = (exec_size == BRW_EXECUTE_16);
3321   const unsigned desc =
3322      brw_message_desc(devinfo, msg_length, response_length, false) |
3323      brw_pixel_interp_desc(devinfo, mode, noperspective, coarse_pixel_rate,
3324                            simd_mode, slot_group);
3325
3326   /* brw_send_indirect_message will automatically use a direct send message
3327    * if data is actually immediate.
3328    */
3329   brw_send_indirect_message(p,
3330                             GFX7_SFID_PIXEL_INTERPOLATOR,
3331                             dest,
3332                             mrf,
3333                             vec1(data),
3334                             desc,
3335                             false);
3336}
3337
/**
 * Emit code that writes the index of the first enabled channel of the
 * current thread into the first component of \p dst.
 *
 * \param mask  Dispatch (or vector) mask, of type UD.  On the paths that
 *              read ce0 it is ANDed in to compensate for ce0 not reflecting
 *              the thread dispatch mask (see inline comments); an immediate
 *              of 0xffffffff skips that correction and assumes the dispatch
 *              mask is tightly packed.
 */
void
brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
                      struct brw_reg mask)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const unsigned exec_size = 1 << brw_get_default_exec_size(p);
   const unsigned qtr_control = brw_get_default_group(p) / 8;
   brw_inst *inst;

   assert(devinfo->ver >= 7);
   assert(mask.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);

   /* The flag register is only used on Gfx7 in align1 mode, so avoid setting
    * unnecessary bits in the instruction words, get the information we need
    * and reset the default flag register. This allows more instructions to be
    * compacted.
    */
   const unsigned flag_subreg = p->current->flag_subreg;
   brw_set_default_flag_reg(p, 0, 0);

   if (brw_get_default_access_mode(p) == BRW_ALIGN_1) {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->ver >= 8) {
         /* Getting the first active channel index is easy on Gfx8: Just find
          * the first bit set in the execution mask.  The register exists on
          * HSW already but it reads back as all ones when the current
          * instruction has execution masking disabled, so it's kind of
          * useless.
          */
         struct brw_reg exec_mask =
            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
            /* Unfortunately, ce0 does not take into account the thread
             * dispatch mask, which may be a problem in cases where it's not
             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
             * some n).  Combine ce0 with the given dispatch (or vector) mask
             * to mask off those channels which were never dispatched by the
             * hardware.
             */
            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
            exec_mask = vec1(dst);
         }

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first active channel relative to the
          * specified quarter control as result.
          */
         inst = brw_FBL(p, vec1(dst), exec_mask);
      } else {
         const struct brw_reg flag = brw_flag_subreg(flag_subreg);

         /* Clear the flag register before accumulating channel enables. */
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));

         /* Run enough instructions returning zero with execution masking and
          * a conditional modifier enabled in order to get the full execution
          * mask in f1.0.  We could use a single 32-wide move here if it
          * weren't because of the hardware bug that causes channel enables to
          * be applied incorrectly to the second half of 32-wide instructions
          * on Gfx7.
          */
         const unsigned lower_size = MIN2(16, exec_size);
         for (unsigned i = 0; i < exec_size / lower_size; i++) {
            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
                           brw_imm_uw(0));
            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
            brw_inst_set_flag_reg_nr(devinfo, inst, flag_subreg / 2);
            brw_inst_set_flag_subreg_nr(devinfo, inst, flag_subreg % 2);
         }

         /* Find the first bit set in the exec_size-wide portion of the flag
          * register that was updated by the last sequence of MOV
          * instructions.
          */
         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
      }
   } else {
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      if (devinfo->ver >= 8 &&
          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
         /* In SIMD4x2 mode the first active channel index is just the
          * negation of the first bit of the mask register.  Note that ce0
          * doesn't take into account the dispatch mask, so the Gfx7 path
          * should be used instead unless you have the guarantee that the
          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
          * for some n).
          */
         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
                        brw_imm_ud(1));

      } else {
         /* Overwrite the destination without and with execution masking to
          * find out which of the channels is active.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_4);
         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                 brw_imm_ud(1));

         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
                        brw_imm_ud(0));
         brw_pop_insn_state(p);
         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
      }
   }

   brw_pop_insn_state(p);
}
3460
/**
 * Emit code copying channel \p idx of \p src into \p dst (a "broadcast").
 *
 * \p src must be a direct GRF region without source modifiers and of the
 * same type as \p dst.  When \p idx is an immediate (or \p src is already
 * uniform) a plain MOV is emitted; otherwise the channel is fetched either
 * via register-indirect addressing (align1) or a flag-predicated SEL
 * (align16 SIMD4x2).
 */
void
brw_broadcast(struct brw_codegen *p,
              struct brw_reg dst,
              struct brw_reg src,
              struct brw_reg idx)
{
   const struct intel_device_info *devinfo = p->devinfo;
   const bool align1 = brw_get_default_access_mode(p) == BRW_ALIGN_1;
   brw_inst *inst;

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);

   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
          src.address_mode == BRW_ADDRESS_DIRECT);
   assert(!src.abs && !src.negate);
   assert(src.type == dst.type);

   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
       idx.file == BRW_IMMEDIATE_VALUE) {
      /* Trivial, the source is already uniform or the index is a constant.
       * We will typically not get here if the optimizer is doing its job, but
       * asserting would be mean.
       */
      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
      src = align1 ? stride(suboffset(src, i), 0, 1, 0) :
                     stride(suboffset(src, 4 * i), 0, 4, 1);

      if (type_sz(src.type) > 4 && !devinfo->has_64bit_float) {
         /* No native 64-bit float moves: copy the value as two dword
          * halves instead.
          */
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                    subscript(src, BRW_REGISTER_TYPE_D, 0));
         brw_set_default_swsb(p, tgl_swsb_null());
         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                    subscript(src, BRW_REGISTER_TYPE_D, 1));
      } else {
         brw_MOV(p, dst, src);
      }
   } else {
      /* From the Haswell PRM section "Register Region Restrictions":
       *
       *    "The lower bits of the AddressImmediate must not overflow to
       *    change the register address.  The lower 5 bits of Address
       *    Immediate when added to lower 5 bits of address register gives
       *    the sub-register offset. The upper bits of Address Immediate
       *    when added to upper bits of address register gives the register
       *    address. Any overflow from sub-register offset is dropped."
       *
       * Fortunately, for broadcast, we never have a sub-register offset so
       * this isn't an issue.
       */
      assert(src.subnr == 0);

      if (align1) {
         const struct brw_reg addr =
            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
         unsigned offset = src.nr * REG_SIZE + src.subnr;
         /* Limit in bytes of the signed indirect addressing immediate. */
         const unsigned limit = 512;

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);

         /* Take into account the component size and horizontal stride. */
         assert(src.vstride == src.hstride + src.width);
         brw_SHL(p, addr, vec1(idx),
                 brw_imm_ud(util_logbase2(type_sz(src.type)) +
                            src.hstride - 1));

         /* We can only address up to limit bytes using the indirect
          * addressing immediate, account for the difference if the source
          * register is above this limit.
          */
         if (offset >= limit) {
            brw_set_default_swsb(p, tgl_swsb_regdist(1));
            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
            offset = offset % limit;
         }

         brw_pop_insn_state(p);

         brw_set_default_swsb(p, tgl_swsb_regdist(1));

         /* Use indirect addressing to fetch the specified component. */
         if (type_sz(src.type) > 4 &&
             (devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
              !devinfo->has_64bit_float)) {
            /* From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of this issue, we do two integer MOVs
             * insead of one 64-bit MOV.  Because no double value should ever
             * cross a register boundary, it's safe to use the immediate
             * offset in the indirect here to handle adding 4 bytes to the
             * offset and avoid the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                       retype(brw_vec1_indirect(addr.subnr, offset),
                              BRW_REGISTER_TYPE_D));
            brw_set_default_swsb(p, tgl_swsb_null());
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                       retype(brw_vec1_indirect(addr.subnr, offset + 4),
                              BRW_REGISTER_TYPE_D));
         } else {
            brw_MOV(p, dst,
                    retype(brw_vec1_indirect(addr.subnr, offset), src.type));
         }
      } else {
         /* In SIMD4x2 mode the index can be either zero or one, replicate it
          * to all bits of a flag register,
          */
         inst = brw_MOV(p,
                        brw_null_reg(),
                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);

         /* and use predicated SEL to pick the right channel. */
         inst = brw_SEL(p, dst,
                        stride(suboffset(src, 4), 4, 4, 1),
                        stride(src, 4, 4, 1));
         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
      }
   }

   brw_pop_insn_state(p);
}
3594
3595/**
3596 * This instruction is generated as a single-channel align1 instruction by
3597 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
3598 *
3599 * We can't use the typed atomic op in the FS because that has the execution
3600 * mask ANDed with the pixel mask, but we just want to write the one dword for
3601 * all the pixels.
3602 *
3603 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
3604 * one u32.  So we use the same untyped atomic write message as the pixel
3605 * shader.
3606 *
3607 * The untyped atomic operation requires a BUFFER surface type with RAW
3608 * format, and is only accessible through the legacy DATA_CACHE dataport
3609 * messages.
3610 */
3611void brw_shader_time_add(struct brw_codegen *p,
3612                         struct brw_reg payload,
3613                         uint32_t surf_index)
3614{
3615   const struct intel_device_info *devinfo = p->devinfo;
3616   const unsigned sfid = (devinfo->verx10 >= 75 ?
3617                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
3618                          GFX7_SFID_DATAPORT_DATA_CACHE);
3619   assert(devinfo->ver >= 7);
3620
3621   brw_push_insn_state(p);
3622   brw_set_default_access_mode(p, BRW_ALIGN_1);
3623   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
3624   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
3625   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
3626
3627   /* We use brw_vec1_reg and unmasked because we want to increment the given
3628    * offset only once.
3629    */
3630   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
3631                                      BRW_ARF_NULL, 0));
3632   brw_set_src0(p, send, brw_vec1_reg(payload.file,
3633                                      payload.nr, 0));
3634   brw_set_desc(p, send, (brw_message_desc(devinfo, 2, 0, false) |
3635                          brw_dp_untyped_atomic_desc(devinfo, 1, BRW_AOP_ADD,
3636                                                     false)));
3637
3638   brw_inst_set_sfid(devinfo, send, sfid);
3639   brw_inst_set_binding_table_index(devinfo, send, surf_index);
3640
3641   brw_pop_insn_state(p);
3642}
3643
3644
3645/**
3646 * Emit the SEND message for a barrier
3647 */
3648void
3649brw_barrier(struct brw_codegen *p, struct brw_reg src)
3650{
3651   const struct intel_device_info *devinfo = p->devinfo;
3652   struct brw_inst *inst;
3653
3654   assert(devinfo->ver >= 7);
3655
3656   brw_push_insn_state(p);
3657   brw_set_default_access_mode(p, BRW_ALIGN_1);
3658   inst = next_insn(p, BRW_OPCODE_SEND);
3659   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
3660   brw_set_src0(p, inst, src);
3661   brw_set_src1(p, inst, brw_null_reg());
3662   brw_set_desc(p, inst, brw_message_desc(devinfo, 1, 0, false));
3663
3664   brw_inst_set_sfid(devinfo, inst, BRW_SFID_MESSAGE_GATEWAY);
3665   brw_inst_set_gateway_subfuncid(devinfo, inst,
3666                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
3667
3668   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
3669   brw_pop_insn_state(p);
3670}
3671
3672
3673/**
3674 * Emit the wait instruction for a barrier
3675 */
3676void
3677brw_WAIT(struct brw_codegen *p)
3678{
3679   const struct intel_device_info *devinfo = p->devinfo;
3680   struct brw_inst *insn;
3681
3682   struct brw_reg src = brw_notification_reg();
3683
3684   insn = next_insn(p, BRW_OPCODE_WAIT);
3685   brw_set_dest(p, insn, src);
3686   brw_set_src0(p, insn, src);
3687   brw_set_src1(p, insn, brw_null_reg());
3688
3689   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
3690   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
3691}
3692
3693void
3694brw_float_controls_mode(struct brw_codegen *p,
3695                        unsigned mode, unsigned mask)
3696{
3697   /* From the Skylake PRM, Volume 7, page 760:
3698    *  "Implementation Restriction on Register Access: When the control
3699    *   register is used as an explicit source and/or destination, hardware
3700    *   does not ensure execution pipeline coherency. Software must set the
3701    *   thread control field to ‘switch’ for an instruction that uses
3702    *   control register as an explicit operand."
3703    *
3704    * On Gfx12+ this is implemented in terms of SWSB annotations instead.
3705    */
3706   brw_set_default_swsb(p, tgl_swsb_regdist(1));
3707
3708   brw_inst *inst = brw_AND(p, brw_cr0_reg(0), brw_cr0_reg(0),
3709                            brw_imm_ud(~mask));
3710   brw_inst_set_exec_size(p->devinfo, inst, BRW_EXECUTE_1);
3711   if (p->devinfo->ver < 12)
3712      brw_inst_set_thread_control(p->devinfo, inst, BRW_THREAD_SWITCH);
3713
3714   if (mode) {
3715      brw_inst *inst_or = brw_OR(p, brw_cr0_reg(0), brw_cr0_reg(0),
3716                                 brw_imm_ud(mode));
3717      brw_inst_set_exec_size(p->devinfo, inst_or, BRW_EXECUTE_1);
3718      if (p->devinfo->ver < 12)
3719         brw_inst_set_thread_control(p->devinfo, inst_or, BRW_THREAD_SWITCH);
3720   }
3721
3722   if (p->devinfo->ver >= 12)
3723      brw_SYNC(p, TGL_SYNC_NOP);
3724}
3725
3726void
3727brw_update_reloc_imm(const struct intel_device_info *devinfo,
3728                     brw_inst *inst,
3729                     uint32_t value)
3730{
3731   /* Sanity check that the instruction is a MOV of an immediate */
3732   assert(brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MOV);
3733   assert(brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE);
3734
3735   /* If it was compacted, we can't safely rewrite */
3736   assert(brw_inst_cmpt_control(devinfo, inst) == 0);
3737
3738   brw_inst_set_imm_ud(devinfo, inst, value);
3739}
3740
3741/* A default value for constants that will be patched at run-time.
3742 * We pick an arbitrary value that prevents instruction compaction.
3743 */
3744#define DEFAULT_PATCH_IMM 0x4a7cc037
3745
3746void
3747brw_MOV_reloc_imm(struct brw_codegen *p,
3748                  struct brw_reg dst,
3749                  enum brw_reg_type src_type,
3750                  uint32_t id)
3751{
3752   assert(type_sz(src_type) == 4);
3753   assert(type_sz(dst.type) == 4);
3754
3755   brw_add_reloc(p, id, BRW_SHADER_RELOC_TYPE_MOV_IMM,
3756                 p->next_insn_offset, 0);
3757
3758   brw_MOV(p, dst, retype(brw_imm_ud(DEFAULT_PATCH_IMM), src_type));
3759}
3760