1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24/** @file brw_fs_generator.cpp
25 *
26 * This file supports generating code from the FS LIR to the actual
27 * native instructions.
28 */
29
30#include "brw_eu.h"
31#include "brw_fs.h"
32#include "brw_cfg.h"
33
34static enum brw_reg_file
35brw_file_from_reg(fs_reg *reg)
36{
37   switch (reg->file) {
38   case ARF:
39      return BRW_ARCHITECTURE_REGISTER_FILE;
40   case FIXED_GRF:
41   case VGRF:
42      return BRW_GENERAL_REGISTER_FILE;
43   case MRF:
44      return BRW_MESSAGE_REGISTER_FILE;
45   case IMM:
46      return BRW_IMMEDIATE_VALUE;
47   case BAD_FILE:
48   case ATTR:
49   case UNIFORM:
50      unreachable("not reached");
51   }
52   return BRW_ARCHITECTURE_REGISTER_FILE;
53}
54
55static struct brw_reg
56brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst,
57                    fs_reg *reg, bool compressed)
58{
59   struct brw_reg brw_reg;
60
61   switch (reg->file) {
62   case MRF:
63      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
64      /* Fallthrough */
65   case VGRF:
66      if (reg->stride == 0) {
67         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
68      } else {
69         /* From the Haswell PRM:
70          *
71          *  "VertStride must be used to cross GRF register boundaries. This
72          *   rule implies that elements within a 'Width' cannot cross GRF
73          *   boundaries."
74          *
75          * The maximum width value that could satisfy this restriction is:
76          */
77         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
78
79         /* Because the hardware can only split source regions at a whole
80          * multiple of width during decompression (i.e. vertically), clamp
81          * the value obtained above to the physical execution size of a
82          * single decompressed chunk of the instruction:
83          */
84         const unsigned phys_width = compressed ? inst->exec_size / 2 :
85                                     inst->exec_size;
86
87         /* XXX - The equation above is strictly speaking not correct on
88          *       hardware that supports unbalanced GRF writes -- On Gen9+
89          *       each decompressed chunk of the instruction may have a
90          *       different execution size when the number of components
91          *       written to each destination GRF is not the same.
92          */
93         if (reg->stride > 4) {
94            assert(reg != &inst->dst);
95            assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
96            brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0);
97            brw_reg = stride(brw_reg, reg->stride, 1, 0);
98         } else {
99            const unsigned width = MIN2(reg_width, phys_width);
100            brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
101            brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
102         }
103
104         if (devinfo->gen == 7 && !devinfo->is_haswell) {
105            /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
106             *  "Each DF (Double Float) operand uses an element size of 4 rather
107             *   than 8 and all regioning parameters are twice what the values
108             *   would be based on the true element size: ExecSize, Width,
109             *   HorzStride, and VertStride. Each DF operand uses a pair of
110             *   channels and all masking and swizzing should be adjusted
111             *   appropriately."
112             *
113             * From the IvyBridge PRM (Special Requirements for Handling Double
114             * Precision Data Types, page 71):
115             *  "In Align1 mode, all regioning parameters like stride, execution
116             *   size, and width must use the syntax of a pair of packed
117             *   floats. The offsets for these data types must be 64-bit
118             *   aligned. The execution size and regioning parameters are in terms
119             *   of floats."
120             *
121             * Summarized: when handling DF-typed arguments, ExecSize,
122             * VertStride, and Width must be doubled.
123             *
124             * It applies to BayTrail too.
125             */
126            if (type_sz(reg->type) == 8) {
127               brw_reg.width++;
128               if (brw_reg.vstride > 0)
129                  brw_reg.vstride++;
130               assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
131            }
132
133            /* When converting from DF->F, we set the destination stride to 2
134             * because each d2f conversion implicitly writes 2 floats, being
135             * the first one the converted value. IVB/BYT actually writes two
136             * F components per SIMD channel, and every other component is
137             * filled with garbage.
138             */
139            if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
140                type_sz(inst->dst.type) < 8) {
141               assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
142               brw_reg.hstride--;
143            }
144         }
145      }
146
147      brw_reg = retype(brw_reg, reg->type);
148      brw_reg = byte_offset(brw_reg, reg->offset);
149      brw_reg.abs = reg->abs;
150      brw_reg.negate = reg->negate;
151      break;
152   case ARF:
153   case FIXED_GRF:
154   case IMM:
155      assert(reg->offset == 0);
156      brw_reg = reg->as_brw_reg();
157      break;
158   case BAD_FILE:
159      /* Probably unused. */
160      brw_reg = brw_null_reg();
161      break;
162   case ATTR:
163   case UNIFORM:
164      unreachable("not reached");
165   }
166
167   /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
168    * region, but on IVB and BYT DF regions must be programmed in terms of
169    * floats. A <0,2,1> region accomplishes this.
170    */
171   if (devinfo->gen == 7 && !devinfo->is_haswell &&
172       type_sz(reg->type) == 8 &&
173       brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
174       brw_reg.width == BRW_WIDTH_1 &&
175       brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
176      brw_reg.width = BRW_WIDTH_2;
177      brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
178   }
179
180   return brw_reg;
181}
182
183fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
184                           void *mem_ctx,
185                           struct brw_stage_prog_data *prog_data,
186                           unsigned promoted_constants,
187                           bool runtime_check_aads_emit,
188                           gl_shader_stage stage)
189
190   : compiler(compiler), log_data(log_data),
191     devinfo(compiler->devinfo),
192     prog_data(prog_data),
193     promoted_constants(promoted_constants),
194     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
195     stage(stage), mem_ctx(mem_ctx)
196{
197   p = rzalloc(mem_ctx, struct brw_codegen);
198   brw_init_codegen(devinfo, p, mem_ctx);
199
200   /* In the FS code generator, we are very careful to ensure that we always
201    * set the right execution size so we don't need the EU code to "help" us
202    * by trying to infer it.  Sometimes, it infers the wrong thing.
203    */
204   p->automatic_exec_sizes = false;
205}
206
207fs_generator::~fs_generator()
208{
209}
210
211class ip_record : public exec_node {
212public:
213   DECLARE_RALLOC_CXX_OPERATORS(ip_record)
214
215   ip_record(int ip)
216   {
217      this->ip = ip;
218   }
219
220   int ip;
221};
222
223bool
224fs_generator::patch_discard_jumps_to_fb_writes()
225{
226   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
227      return false;
228
229   int scale = brw_jump_scale(p->devinfo);
230
231   /* There is a somewhat strange undocumented requirement of using
232    * HALT, according to the simulator.  If some channel has HALTed to
233    * a particular UIP, then by the end of the program, every channel
234    * must have HALTed to that UIP.  Furthermore, the tracking is a
235    * stack, so you can't do the final halt of a UIP after starting
236    * halting to a new UIP.
237    *
238    * Symptoms of not emitting this instruction on actual hardware
239    * included GPU hangs and sparkly rendering on the piglit discard
240    * tests.
241    */
242   brw_inst *last_halt = gen6_HALT(p);
243   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
244   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
245
246   int ip = p->nr_insn;
247
248   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
249      brw_inst *patch = &p->store[patch_ip->ip];
250
251      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
252      /* HALT takes a half-instruction distance from the pre-incremented IP. */
253      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
254   }
255
256   this->discard_halt_patches.make_empty();
257   return true;
258}
259
260void
261fs_generator::generate_send(fs_inst *inst,
262                            struct brw_reg dst,
263                            struct brw_reg desc,
264                            struct brw_reg ex_desc,
265                            struct brw_reg payload,
266                            struct brw_reg payload2)
267{
268   const bool dst_is_null = dst.file == BRW_ARCHITECTURE_REGISTER_FILE &&
269                            dst.nr == BRW_ARF_NULL;
270   const unsigned rlen = dst_is_null ? 0 : inst->size_written / REG_SIZE;
271
272   uint32_t desc_imm = inst->desc |
273      brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size);
274
275   uint32_t ex_desc_imm = brw_message_ex_desc(devinfo, inst->ex_mlen);
276
277   if (ex_desc.file != BRW_IMMEDIATE_VALUE || ex_desc.ud || ex_desc_imm) {
278      /* If we have any sort of extended descriptor, then we need SENDS.  This
279       * also covers the dual-payload case because ex_mlen goes in ex_desc.
280       */
281      brw_send_indirect_split_message(p, inst->sfid, dst, payload, payload2,
282                                      desc, desc_imm, ex_desc, ex_desc_imm,
283                                      inst->eot);
284      if (inst->check_tdr)
285         brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDSC);
286   } else {
287      brw_send_indirect_message(p, inst->sfid, dst, payload, desc, desc_imm,
288                                   inst->eot);
289      if (inst->check_tdr)
290         brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
291   }
292}
293
294void
295fs_generator::fire_fb_write(fs_inst *inst,
296                            struct brw_reg payload,
297                            struct brw_reg implied_header,
298                            GLuint nr)
299{
300   uint32_t msg_control;
301
302   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
303
304   if (devinfo->gen < 6) {
305      brw_push_insn_state(p);
306      brw_set_default_exec_size(p, BRW_EXECUTE_8);
307      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
308      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
309      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
310      brw_MOV(p, offset(retype(payload, BRW_REGISTER_TYPE_UD), 1),
311              offset(retype(implied_header, BRW_REGISTER_TYPE_UD), 1));
312      brw_pop_insn_state(p);
313   }
314
315   if (inst->opcode == FS_OPCODE_REP_FB_WRITE) {
316      assert(inst->group == 0 && inst->exec_size == 16);
317      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
318
319   } else if (prog_data->dual_src_blend) {
320      assert(inst->exec_size == 8);
321
322      if (inst->group % 16 == 0)
323         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
324      else if (inst->group % 16 == 8)
325         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
326      else
327         unreachable("Invalid dual-source FB write instruction group");
328
329   } else {
330      assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
331
332      if (inst->exec_size == 16)
333         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
334      else if (inst->exec_size == 8)
335         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
336      else
337         unreachable("Invalid FB write execution size");
338   }
339
340   /* We assume render targets start at 0, because headerless FB write
341    * messages set "Render Target Index" to 0.  Using a different binding
342    * table index would make it impossible to use headerless messages.
343    */
344   const uint32_t surf_index = inst->target;
345
346   brw_inst *insn = brw_fb_WRITE(p,
347                                 payload,
348                                 retype(implied_header, BRW_REGISTER_TYPE_UW),
349                                 msg_control,
350                                 surf_index,
351                                 nr,
352                                 0,
353                                 inst->eot,
354                                 inst->last_rt,
355                                 inst->header_size != 0);
356
357   if (devinfo->gen >= 6)
358      brw_inst_set_rt_slot_group(devinfo, insn, inst->group / 16);
359}
360
361void
362fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
363{
364   if (devinfo->gen < 8 && !devinfo->is_haswell) {
365      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
366   }
367
368   const struct brw_reg implied_header =
369      devinfo->gen < 6 ? payload : brw_null_reg();
370
371   if (inst->base_mrf >= 0)
372      payload = brw_message_reg(inst->base_mrf);
373
374   if (!runtime_check_aads_emit) {
375      fire_fb_write(inst, payload, implied_header, inst->mlen);
376   } else {
377      /* This can only happen in gen < 6 */
378      assert(devinfo->gen < 6);
379
380      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
381
382      /* Check runtime bit to detect if we have to send AA data or not */
383      brw_push_insn_state(p);
384      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
385      brw_set_default_exec_size(p, BRW_EXECUTE_1);
386      brw_AND(p,
387              v1_null_ud,
388              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
389              brw_imm_ud(1<<26));
390      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
391
392      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
393      brw_pop_insn_state(p);
394      {
395         /* Don't send AA data */
396         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
397      }
398      brw_land_fwd_jump(p, jmp);
399      fire_fb_write(inst, payload, implied_header, inst->mlen);
400   }
401}
402
403void
404fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
405                               struct brw_reg payload)
406{
407   assert(inst->size_written % REG_SIZE == 0);
408   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
409   /* We assume that render targets start at binding table index 0. */
410   const unsigned surf_index = inst->target;
411
412   gen9_fb_READ(p, dst, payload, surf_index,
413                inst->header_size, inst->size_written / REG_SIZE,
414                prog_data->persample_dispatch);
415}
416
417void
418fs_generator::generate_mov_indirect(fs_inst *inst,
419                                    struct brw_reg dst,
420                                    struct brw_reg reg,
421                                    struct brw_reg indirect_byte_offset)
422{
423   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
424   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
425   assert(!reg.abs && !reg.negate);
426   assert(reg.type == dst.type);
427
428   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
429
430   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
431      imm_byte_offset += indirect_byte_offset.ud;
432
433      reg.nr = imm_byte_offset / REG_SIZE;
434      reg.subnr = imm_byte_offset % REG_SIZE;
435      brw_MOV(p, dst, reg);
436   } else {
437      /* Prior to Broadwell, there are only 8 address registers. */
438      assert(inst->exec_size <= 8 || devinfo->gen >= 8);
439
440      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
441      struct brw_reg addr = vec8(brw_address_reg(0));
442
443      /* The destination stride of an instruction (in bytes) must be greater
444       * than or equal to the size of the rest of the instruction.  Since the
445       * address register is of type UW, we can't use a D-type instruction.
446       * In order to get around this, re retype to UW and use a stride.
447       */
448      indirect_byte_offset =
449         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
450
451      /* There are a number of reasons why we don't use the base offset here.
452       * One reason is that the field is only 9 bits which means we can only
453       * use it to access the first 16 GRFs.  Also, from the Haswell PRM
454       * section "Register Region Restrictions":
455       *
456       *    "The lower bits of the AddressImmediate must not overflow to
457       *    change the register address.  The lower 5 bits of Address
458       *    Immediate when added to lower 5 bits of address register gives
459       *    the sub-register offset. The upper bits of Address Immediate
460       *    when added to upper bits of address register gives the register
461       *    address. Any overflow from sub-register offset is dropped."
462       *
463       * Since the indirect may cause us to cross a register boundary, this
464       * makes the base offset almost useless.  We could try and do something
465       * clever where we use a actual base offset if base_offset % 32 == 0 but
466       * that would mean we were generating different code depending on the
467       * base offset.  Instead, for the sake of consistency, we'll just do the
468       * add ourselves.  This restriction is only listed in the Haswell PRM
469       * but empirical testing indicates that it applies on all older
470       * generations and is lifted on Broadwell.
471       *
472       * In the end, while base_offset is nice to look at in the generated
473       * code, using it saves us 0 instructions and would require quite a bit
474       * of case-by-case work.  It's just not worth it.
475       */
476      brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
477
478      if (type_sz(reg.type) > 4 &&
479          ((devinfo->gen == 7 && !devinfo->is_haswell) ||
480           devinfo->is_cherryview || gen_device_info_is_9lp(devinfo) ||
481           !devinfo->has_64bit_types)) {
482         /* IVB has an issue (which we found empirically) where it reads two
483          * address register components per channel for indirectly addressed
484          * 64-bit sources.
485          *
486          * From the Cherryview PRM Vol 7. "Register Region Restrictions":
487          *
488          *    "When source or destination datatype is 64b or operation is
489          *    integer DWord multiply, indirect addressing must not be used."
490          *
491          * To work around both of these, we do two integer MOVs insead of one
492          * 64-bit MOV.  Because no double value should ever cross a register
493          * boundary, it's safe to use the immediate offset in the indirect
494          * here to handle adding 4 bytes to the offset and avoid the extra
495          * ADD to the register file.
496          */
497         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
498                    retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
499         brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
500                    retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
501      } else {
502         struct brw_reg ind_src = brw_VxH_indirect(0, 0);
503
504         brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
505
506         if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
507             !inst->get_next()->is_tail_sentinel() &&
508             ((fs_inst *)inst->get_next())->mlen > 0) {
509            /* From the Sandybridge PRM:
510             *
511             *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
512             *    instruction that “indexed/indirect” source AND is followed
513             *    by a send, the instruction requires a “Switch”. This is to
514             *    avoid race condition where send may dispatch before MRF is
515             *    updated."
516             */
517            brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
518         }
519      }
520   }
521}
522
523void
524fs_generator::generate_shuffle(fs_inst *inst,
525                               struct brw_reg dst,
526                               struct brw_reg src,
527                               struct brw_reg idx)
528{
529   /* Ivy bridge has some strange behavior that makes this a real pain to
530    * implement for 64-bit values so we just don't bother.
531    */
532   assert(devinfo->gen >= 8 || devinfo->is_haswell || type_sz(src.type) <= 4);
533
534   /* Because we're using the address register, we're limited to 8-wide
535    * execution on gen7.  On gen8, we're limited to 16-wide by the address
536    * register file and 8-wide for 64-bit types.  We could try and make this
537    * instruction splittable higher up in the compiler but that gets weird
538    * because it reads all of the channels regardless of execution size.  It's
539    * easier just to split it here.
540    */
541   const unsigned lower_width =
542      (devinfo->gen <= 7 || type_sz(src.type) > 4) ?
543      8 : MIN2(16, inst->exec_size);
544
545   brw_set_default_exec_size(p, cvt(lower_width) - 1);
546   for (unsigned group = 0; group < inst->exec_size; group += lower_width) {
547      brw_set_default_group(p, group);
548
549      if ((src.vstride == 0 && src.hstride == 0) ||
550          idx.file == BRW_IMMEDIATE_VALUE) {
551         /* Trivial, the source is already uniform or the index is a constant.
552          * We will typically not get here if the optimizer is doing its job,
553          * but asserting would be mean.
554          */
555         const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
556         brw_MOV(p, suboffset(dst, group), stride(suboffset(src, i), 0, 1, 0));
557      } else {
558         /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
559         struct brw_reg addr = vec8(brw_address_reg(0));
560
561         struct brw_reg group_idx = suboffset(idx, group);
562
563         if (lower_width == 8 && group_idx.width == BRW_WIDTH_16) {
564            /* Things get grumpy if the register is too wide. */
565            group_idx.width--;
566            group_idx.vstride--;
567         }
568
569         assert(type_sz(group_idx.type) <= 4);
570         if (type_sz(group_idx.type) == 4) {
571            /* The destination stride of an instruction (in bytes) must be
572             * greater than or equal to the size of the rest of the
573             * instruction.  Since the address register is of type UW, we
574             * can't use a D-type instruction.  In order to get around this,
575             * re retype to UW and use a stride.
576             */
577            group_idx = retype(spread(group_idx, 2), BRW_REGISTER_TYPE_W);
578         }
579
580         /* Take into account the component size and horizontal stride. */
581         assert(src.vstride == src.hstride + src.width);
582         brw_SHL(p, addr, group_idx,
583                 brw_imm_uw(_mesa_logbase2(type_sz(src.type)) +
584                            src.hstride - 1));
585
586         /* Add on the register start offset */
587         brw_ADD(p, addr, addr, brw_imm_uw(src.nr * REG_SIZE + src.subnr));
588
589         if (type_sz(src.type) > 4 &&
590             ((devinfo->gen == 7 && !devinfo->is_haswell) ||
591              devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
592            /* IVB has an issue (which we found empirically) where it reads
593             * two address register components per channel for indirectly
594             * addressed 64-bit sources.
595             *
596             * From the Cherryview PRM Vol 7. "Register Region Restrictions":
597             *
598             *    "When source or destination datatype is 64b or operation is
599             *    integer DWord multiply, indirect addressing must not be
600             *    used."
601             *
602             * To work around both of these, we do two integer MOVs insead of
603             * one 64-bit MOV.  Because no double value should ever cross a
604             * register boundary, it's safe to use the immediate offset in the
605             * indirect here to handle adding 4 bytes to the offset and avoid
606             * the extra ADD to the register file.
607             */
608            struct brw_reg gdst = suboffset(dst, group);
609            struct brw_reg dst_d = retype(spread(gdst, 2),
610                                          BRW_REGISTER_TYPE_D);
611            brw_MOV(p, dst_d,
612                    retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
613            brw_MOV(p, byte_offset(dst_d, 4),
614                    retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
615         } else {
616            brw_MOV(p, suboffset(dst, group),
617                    retype(brw_VxH_indirect(0, 0), src.type));
618         }
619      }
620   }
621}
622
623void
624fs_generator::generate_quad_swizzle(const fs_inst *inst,
625                                    struct brw_reg dst, struct brw_reg src,
626                                    unsigned swiz)
627{
628   /* Requires a quad. */
629   assert(inst->exec_size >= 4);
630
631   if (src.file == BRW_IMMEDIATE_VALUE ||
632       has_scalar_region(src)) {
633      /* The value is uniform across all channels */
634      brw_MOV(p, dst, src);
635
636   } else if (devinfo->gen < 11 && type_sz(src.type) == 4) {
637      /* This only works on 8-wide 32-bit values */
638      assert(inst->exec_size == 8);
639      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
640      assert(src.vstride == src.width + 1);
641      brw_set_default_access_mode(p, BRW_ALIGN_16);
642      struct brw_reg swiz_src = stride(src, 4, 4, 1);
643      swiz_src.swizzle = swiz;
644      brw_MOV(p, dst, swiz_src);
645
646   } else {
647      assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);
648      assert(src.vstride == src.width + 1);
649      const struct brw_reg src_0 = suboffset(src, BRW_GET_SWZ(swiz, 0));
650
651      switch (swiz) {
652      case BRW_SWIZZLE_XXXX:
653      case BRW_SWIZZLE_YYYY:
654      case BRW_SWIZZLE_ZZZZ:
655      case BRW_SWIZZLE_WWWW:
656         brw_MOV(p, dst, stride(src_0, 4, 4, 0));
657         break;
658
659      case BRW_SWIZZLE_XXZZ:
660      case BRW_SWIZZLE_YYWW:
661         brw_MOV(p, dst, stride(src_0, 2, 2, 0));
662         break;
663
664      case BRW_SWIZZLE_XYXY:
665      case BRW_SWIZZLE_ZWZW:
666         assert(inst->exec_size == 4);
667         brw_MOV(p, dst, stride(src_0, 0, 2, 1));
668         break;
669
670      default:
671         assert(inst->force_writemask_all);
672         brw_set_default_exec_size(p, cvt(inst->exec_size / 4) - 1);
673
674         for (unsigned c = 0; c < 4; c++) {
675            brw_inst *insn = brw_MOV(
676               p, stride(suboffset(dst, c),
677                         4 * inst->dst.stride, 1, 4 * inst->dst.stride),
678               stride(suboffset(src, BRW_GET_SWZ(swiz, c)), 4, 1, 0));
679
680            brw_inst_set_no_dd_clear(devinfo, insn, c < 3);
681            brw_inst_set_no_dd_check(devinfo, insn, c > 0);
682         }
683
684         break;
685      }
686   }
687}
688
689void
690fs_generator::generate_urb_read(fs_inst *inst,
691                                struct brw_reg dst,
692                                struct brw_reg header)
693{
694   assert(inst->size_written % REG_SIZE == 0);
695   assert(header.file == BRW_GENERAL_REGISTER_FILE);
696   assert(header.type == BRW_REGISTER_TYPE_UD);
697
698   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
699   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
700   brw_set_src0(p, send, header);
701   brw_set_src1(p, send, brw_imm_ud(0u));
702
703   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
704   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);
705
706   if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
707      brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);
708
709   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
710   brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
711   brw_inst_set_header_present(p->devinfo, send, true);
712   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
713}
714
715void
716fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
717{
718   brw_inst *insn;
719
720    /* WaClearTDRRegBeforeEOTForNonPS.
721     *
722     *   WA: Clear tdr register before send EOT in all non-PS shader kernels
723     *
724     *   mov(8) tdr0:ud 0x0:ud {NoMask}"
725     */
726   if (inst->eot && p->devinfo->gen == 10) {
727      brw_push_insn_state(p);
728      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
729      brw_MOV(p, brw_tdr_reg(), brw_imm_uw(0));
730      brw_pop_insn_state(p);
731   }
732
733   insn = brw_next_insn(p, BRW_OPCODE_SEND);
734
735   brw_set_dest(p, insn, brw_null_reg());
736   brw_set_src0(p, insn, payload);
737   brw_set_src1(p, insn, brw_imm_ud(0u));
738
739   brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
740   brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);
741
742   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
743       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
744      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);
745
746   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
747       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
748      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);
749
750   brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
751   brw_inst_set_rlen(p->devinfo, insn, 0);
752   brw_inst_set_eot(p->devinfo, insn, inst->eot);
753   brw_inst_set_header_present(p->devinfo, insn, true);
754   brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
755}
756
757void
758fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
759{
760   struct brw_inst *insn;
761
762   insn = brw_next_insn(p, BRW_OPCODE_SEND);
763
764   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
765   brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
766   brw_set_src1(p, insn, brw_imm_ud(0u));
767
768   /* Terminate a compute shader by sending a message to the thread spawner.
769    */
770   brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
771   brw_inst_set_mlen(devinfo, insn, 1);
772   brw_inst_set_rlen(devinfo, insn, 0);
773   brw_inst_set_eot(devinfo, insn, inst->eot);
774   brw_inst_set_header_present(devinfo, insn, false);
775
776   brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
777   brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
778
779   /* Note that even though the thread has a URB resource associated with it,
780    * we set the "do not dereference URB" bit, because the URB resource is
781    * managed by the fixed-function unit, so it will free it automatically.
782    */
783   brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
784
785   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
786}
787
788void
789fs_generator::generate_barrier(fs_inst *, struct brw_reg src)
790{
791   brw_barrier(p, src);
792   brw_WAIT(p);
793}
794
795bool
796fs_generator::generate_linterp(fs_inst *inst,
797                               struct brw_reg dst, struct brw_reg *src)
798{
799   /* PLN reads:
800    *                      /   in SIMD16   \
801    *    -----------------------------------
802    *   | src1+0 | src1+1 | src1+2 | src1+3 |
803    *   |-----------------------------------|
804    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
805    *    -----------------------------------
806    *
807    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
808    *
809    *    -----------------------------------
810    *   | src1+0 | src1+1 | src1+2 | src1+3 |
811    *   |-----------------------------------|
812    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
813    *   |-----------------------------------|
814    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
815    *    -----------------------------------
816    *
817    * See also: emit_interpolation_setup_gen4().
818    */
819   struct brw_reg delta_x = src[0];
820   struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
821   struct brw_reg interp = src[1];
822   brw_inst *i[2];
823
824   /* fs_visitor::lower_linterp() will do the lowering to MAD instructions for
825    * us on gen11+
826    */
827   assert(devinfo->gen < 11);
828
829   if (devinfo->has_pln) {
830      if (devinfo->gen <= 6 && (delta_x.nr & 1) != 0) {
831         /* From the Sandy Bridge PRM Vol. 4, Pt. 2, Section 8.3.53, "Plane":
832          *
833          *    "[DevSNB]:<src1> must be even register aligned.
834          *
835          * This restriction is lifted on Ivy Bridge.
836          *
837          * This means that we need to split PLN into LINE+MAC on-the-fly.
838          * Unfortunately, the inputs are laid out for PLN and not LINE+MAC so
839          * we have to split into SIMD8 pieces.  For gen4 (!has_pln), the
840          * coordinate registers are laid out differently so we leave it as a
841          * SIMD16 instruction.
842          */
843         assert(inst->exec_size == 8 || inst->exec_size == 16);
844         assert(inst->group % 16 == 0);
845
846         brw_push_insn_state(p);
847         brw_set_default_exec_size(p, BRW_EXECUTE_8);
848
849         /* Thanks to two accumulators, we can emit all the LINEs and then all
850          * the MACs.  This improves parallelism a bit.
851          */
852         for (unsigned g = 0; g < inst->exec_size / 8; g++) {
853            brw_inst *line = brw_LINE(p, brw_null_reg(), interp,
854                                      offset(delta_x, g * 2));
855            brw_inst_set_group(devinfo, line, inst->group + g * 8);
856
857            /* LINE writes the accumulator automatically on gen4-5.  On Sandy
858             * Bridge and later, we have to explicitly enable it.
859             */
860            if (devinfo->gen >= 6)
861               brw_inst_set_acc_wr_control(p->devinfo, line, true);
862
863            /* brw_set_default_saturate() is called before emitting
864             * instructions, so the saturate bit is set in each instruction,
865             * so we need to unset it on the LINE instructions.
866             */
867            brw_inst_set_saturate(p->devinfo, line, false);
868         }
869
870         for (unsigned g = 0; g < inst->exec_size / 8; g++) {
871            brw_inst *mac = brw_MAC(p, offset(dst, g), suboffset(interp, 1),
872                                    offset(delta_x, g * 2 + 1));
873            brw_inst_set_group(devinfo, mac, inst->group + g * 8);
874            brw_inst_set_cond_modifier(p->devinfo, mac, inst->conditional_mod);
875         }
876
877         brw_pop_insn_state(p);
878
879         return true;
880      } else {
881         brw_PLN(p, dst, interp, delta_x);
882
883         return false;
884      }
885   } else {
886      i[0] = brw_LINE(p, brw_null_reg(), interp, delta_x);
887      i[1] = brw_MAC(p, dst, suboffset(interp, 1), delta_y);
888
889      brw_inst_set_cond_modifier(p->devinfo, i[1], inst->conditional_mod);
890
891      /* brw_set_default_saturate() is called before emitting instructions, so
892       * the saturate bit is set in each instruction, so we need to unset it on
893       * the first instruction.
894       */
895      brw_inst_set_saturate(p->devinfo, i[0], false);
896
897      return true;
898   }
899}
900
901void
902fs_generator::generate_get_buffer_size(fs_inst *inst,
903                                       struct brw_reg dst,
904                                       struct brw_reg src,
905                                       struct brw_reg surf_index)
906{
907   assert(devinfo->gen >= 7);
908   assert(surf_index.file == BRW_IMMEDIATE_VALUE);
909
910   uint32_t simd_mode;
911   int rlen = 4;
912
913   switch (inst->exec_size) {
914   case 8:
915      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
916      break;
917   case 16:
918      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
919      break;
920   default:
921      unreachable("Invalid width for texture instruction");
922   }
923
924   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
925      rlen = 8;
926      dst = vec16(dst);
927   }
928
929   brw_SAMPLE(p,
930              retype(dst, BRW_REGISTER_TYPE_UW),
931              inst->base_mrf,
932              src,
933              surf_index.ud,
934              0,
935              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
936              rlen, /* response length */
937              inst->mlen,
938              inst->header_size > 0,
939              simd_mode,
940              BRW_SAMPLER_RETURN_FORMAT_SINT32);
941}
942
/**
 * Emit a sampler message for a texturing instruction on pre-gen7 hardware.
 *
 * The sampler message type is chosen from the opcode (and, on gen4, also from
 * the execution size and the shadow-compare flag), a message header is set up
 * when inst->header_size is non-zero, and the SEND itself is produced through
 * brw_SAMPLE().
 *
 * \param inst           the texturing instruction being generated
 * \param dst            destination of the sampler response
 * \param surface_index  immediate surface index, relative to the binding
 *                       table section selected from the opcode
 * \param sampler_index  immediate UD sampler index
 */
void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   assert(devinfo->gen < 7);
   assert(inst->size_written % REG_SIZE == 0);
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;

   /* Sampler EOT message of less than the dispatch width would kill the
    * thread prematurely.
    */
   assert(!inst->eot || inst->exec_size == dispatch_width);

   /* Derive the sampler return format from the destination type; anything
    * that is not D or UD is treated as float.
    */
   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gen4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gen4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   /* Default SIMD mode follows the execution size; some gen4 cases below
    * override it.
    */
   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->gen >= 5) {
      /* Gen5-6: the message type depends only on the opcode and the
       * shadow-compare flag.
       */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
	 }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
	 }
	 break;
      case SHADER_OPCODE_TXS:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
	 break;
      case SHADER_OPCODE_TXD:
         assert(!inst->shadow_compare);
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
	 break;
      case SHADER_OPCODE_TXF:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
	 break;
      case SHADER_OPCODE_TXF_CMS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         assert(devinfo->gen == 6);
         assert(!inst->shadow_compare);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
	 unreachable("not reached");
      }
   } else {
      /* Gen4: several cases below select a SIMD16 message type and override
       * simd_mode to match, whatever the execution size is.  The asserts
       * document the message lengths each message type is expected to have.
       */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 /* Note that G45 and older determines shadow compare and dispatch width
	  * from message length for most messages.
	  */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXD:
	 /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
	 assert(inst->mlen == 7 || inst->mlen == 10);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
	 break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      case SHADER_OPCODE_TXS:
	 assert(inst->mlen == 3);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      default:
	 unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   /* A SIMD16 message (possibly forced above) gets a 16-wide destination. */
   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   struct brw_reg src = brw_null_reg();
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         assert(inst->base_mrf != -1);
         struct brw_reg header_reg = brw_message_reg(inst->base_mrf);

         /* The header setup below runs unmasked and uncompressed, with its
          * own execution sizes, inside a saved/restored instruction state.
          */
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         brw_set_default_exec_size(p, BRW_EXECUTE_1);
         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                       brw_imm_ud(inst->offset));
         }

         brw_pop_insn_state(p);
      }
   }

   /* TG4 surfaces are indexed relative to the gather section of the binding
    * table; every other opcode is relative to the regular texture section.
    */
   uint32_t base_binding_table_index;
   switch (inst->opcode) {
   case SHADER_OPCODE_TG4:
      base_binding_table_index = prog_data->binding_table.gather_texture_start;
      break;
   default:
      base_binding_table_index = prog_data->binding_table.texture_start;
      break;
   }

   assert(surface_index.file == BRW_IMMEDIATE_VALUE);
   assert(sampler_index.file == BRW_IMMEDIATE_VALUE);

   /* The response length is the number of whole registers the instruction
    * writes (size_written is asserted to be register-aligned above).
    */
   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surface_index.ud + base_binding_table_index,
              sampler_index.ud % 16,
              msg_type,
              inst->size_written / REG_SIZE,
              inst->mlen,
              inst->header_size != 0,
              simd_mode,
              return_format);
}
1173
1174
1175/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
1176 * looking like:
1177 *
1178 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
1179 *
1180 * Ideally, we want to produce:
1181 *
1182 *           DDX                     DDY
1183 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
1184 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
1185 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
1186 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
1187 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
1188 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
1189 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
1190 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
1191 *
1192 * and add another set of two more subspans if in 16-pixel dispatch mode.
1193 *
1194 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
1195 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
1196 * pair.  But the ideal approximation may impose a huge performance cost on
1197 * sample_d.  On at least Haswell, sample_d instruction does some
1198 * optimizations if the same LOD is used for all pixels in the subspan.
1199 *
1200 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
1201 * appropriate swizzling.
1202 */
1203void
1204fs_generator::generate_ddx(const fs_inst *inst,
1205                           struct brw_reg dst, struct brw_reg src)
1206{
1207   unsigned vstride, width;
1208
1209   if (devinfo->gen >= 8) {
1210      if (inst->opcode == FS_OPCODE_DDX_FINE) {
1211         /* produce accurate derivatives */
1212         vstride = BRW_VERTICAL_STRIDE_2;
1213         width = BRW_WIDTH_2;
1214      } else {
1215         /* replicate the derivative at the top-left pixel to other pixels */
1216         vstride = BRW_VERTICAL_STRIDE_4;
1217         width = BRW_WIDTH_4;
1218      }
1219
1220      struct brw_reg src0 = byte_offset(src, type_sz(src.type));;
1221      struct brw_reg src1 = src;
1222
1223      src0.vstride = vstride;
1224      src0.width   = width;
1225      src0.hstride = BRW_HORIZONTAL_STRIDE_0;
1226      src1.vstride = vstride;
1227      src1.width   = width;
1228      src1.hstride = BRW_HORIZONTAL_STRIDE_0;
1229
1230      brw_ADD(p, dst, src0, negate(src1));
1231   } else {
1232      /* On Haswell and earlier, the region used above appears to not work
1233       * correctly for compressed instructions.  At least on Haswell and
1234       * Iron Lake, compressed ALIGN16 instructions do work.  Since we
1235       * would have to split to SIMD8 no matter which method we choose, we
1236       * may as well use ALIGN16 on all platforms gen7 and earlier.
1237       */
1238      struct brw_reg src0 = stride(src, 4, 4, 1);
1239      struct brw_reg src1 = stride(src, 4, 4, 1);
1240      if (inst->opcode == FS_OPCODE_DDX_FINE) {
1241         src0.swizzle = BRW_SWIZZLE_XXZZ;
1242         src1.swizzle = BRW_SWIZZLE_YYWW;
1243      } else {
1244         src0.swizzle = BRW_SWIZZLE_XXXX;
1245         src1.swizzle = BRW_SWIZZLE_YYYY;
1246      }
1247
1248      brw_push_insn_state(p);
1249      brw_set_default_access_mode(p, BRW_ALIGN_16);
1250      brw_ADD(p, dst, negate(src0), src1);
1251      brw_pop_insn_state(p);
1252   }
1253}
1254
1255/* The negate_value boolean is used to negate the derivative computation for
1256 * FBOs, since they place the origin at the upper left instead of the lower
1257 * left.
1258 */
1259void
1260fs_generator::generate_ddy(const fs_inst *inst,
1261                           struct brw_reg dst, struct brw_reg src)
1262{
1263   const uint32_t type_size = type_sz(src.type);
1264
1265   if (inst->opcode == FS_OPCODE_DDY_FINE) {
1266      /* produce accurate derivatives.
1267       *
1268       * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU)
1269       * "Register Region Restrictions", Section "1. Special Restrictions":
1270       *
1271       *    "In Align16 mode, the channel selects and channel enables apply to
1272       *     a pair of half-floats, because these parameters are defined for
1273       *     DWord elements ONLY. This is applicable when both source and
1274       *     destination are half-floats."
1275       *
1276       * So for half-float operations we use the Gen11+ Align1 path. CHV
1277       * inherits its FP16 hardware from SKL, so it is not affected.
1278       */
1279      if (devinfo->gen >= 11 ||
1280          (devinfo->is_broadwell && src.type == BRW_REGISTER_TYPE_HF)) {
1281         src = stride(src, 0, 2, 1);
1282
1283         brw_push_insn_state(p);
1284         brw_set_default_exec_size(p, BRW_EXECUTE_4);
1285         for (uint32_t g = 0; g < inst->exec_size; g += 4) {
1286            brw_set_default_group(p, inst->group + g);
1287            brw_ADD(p, byte_offset(dst, g * type_size),
1288                       negate(byte_offset(src,  g * type_size)),
1289                       byte_offset(src, (g + 2) * type_size));
1290         }
1291         brw_pop_insn_state(p);
1292      } else {
1293         struct brw_reg src0 = stride(src, 4, 4, 1);
1294         struct brw_reg src1 = stride(src, 4, 4, 1);
1295         src0.swizzle = BRW_SWIZZLE_XYXY;
1296         src1.swizzle = BRW_SWIZZLE_ZWZW;
1297
1298         brw_push_insn_state(p);
1299         brw_set_default_access_mode(p, BRW_ALIGN_16);
1300         brw_ADD(p, dst, negate(src0), src1);
1301         brw_pop_insn_state(p);
1302      }
1303   } else {
1304      /* replicate the derivative at the top-left pixel to other pixels */
1305      if (devinfo->gen >= 8) {
1306         struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
1307         struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
1308
1309         brw_ADD(p, dst, negate(src0), src1);
1310      } else {
1311         /* On Haswell and earlier, the region used above appears to not work
1312          * correctly for compressed instructions.  At least on Haswell and
1313          * Iron Lake, compressed ALIGN16 instructions do work.  Since we
1314          * would have to split to SIMD8 no matter which method we choose, we
1315          * may as well use ALIGN16 on all platforms gen7 and earlier.
1316          */
1317         struct brw_reg src0 = stride(src, 4, 4, 1);
1318         struct brw_reg src1 = stride(src, 4, 4, 1);
1319         src0.swizzle = BRW_SWIZZLE_XXXX;
1320         src1.swizzle = BRW_SWIZZLE_ZZZZ;
1321
1322         brw_push_insn_state(p);
1323         brw_set_default_access_mode(p, BRW_ALIGN_16);
1324         brw_ADD(p, dst, negate(src0), src1);
1325         brw_pop_insn_state(p);
1326      }
1327   }
1328}
1329
1330void
1331fs_generator::generate_discard_jump(fs_inst *)
1332{
1333   assert(devinfo->gen >= 6);
1334
1335   /* This HALT will be patched up at FB write time to point UIP at the end of
1336    * the program, and at brw_uip_jip() JIP will be set to the end of the
1337    * current block (or the program).
1338    */
1339   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
1340   gen6_HALT(p);
1341}
1342
1343void
1344fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
1345{
1346   /* The 32-wide messages only respect the first 16-wide half of the channel
1347    * enable signals which are replicated identically for the second group of
1348    * 16 channels, so we cannot use them unless the write is marked
1349    * force_writemask_all.
1350    */
1351   const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
1352                               MIN2(16, inst->exec_size);
1353   const unsigned block_size = 4 * lower_size / REG_SIZE;
1354   assert(inst->mlen != 0);
1355
1356   brw_push_insn_state(p);
1357   brw_set_default_exec_size(p, cvt(lower_size) - 1);
1358   brw_set_default_compression(p, lower_size > 8);
1359
1360   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1361      brw_set_default_group(p, inst->group + lower_size * i);
1362
1363      brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
1364              retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
1365
1366      brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
1367                                    block_size,
1368                                    inst->offset + block_size * REG_SIZE * i);
1369   }
1370
1371   brw_pop_insn_state(p);
1372}
1373
1374void
1375fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
1376{
1377   assert(inst->exec_size <= 16 || inst->force_writemask_all);
1378   assert(inst->mlen != 0);
1379
1380   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
1381                                inst->exec_size / 8, inst->offset);
1382}
1383
1384void
1385fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
1386{
1387   assert(inst->exec_size <= 16 || inst->force_writemask_all);
1388
1389   gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
1390}
1391
1392void
1393fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
1394                                                  struct brw_reg dst,
1395                                                  struct brw_reg index,
1396                                                  struct brw_reg offset)
1397{
1398   assert(type_sz(dst.type) == 4);
1399   assert(inst->mlen != 0);
1400
1401   assert(index.file == BRW_IMMEDIATE_VALUE &&
1402	  index.type == BRW_REGISTER_TYPE_UD);
1403   uint32_t surf_index = index.ud;
1404
1405   assert(offset.file == BRW_IMMEDIATE_VALUE &&
1406	  offset.type == BRW_REGISTER_TYPE_UD);
1407   uint32_t read_offset = offset.ud;
1408
1409   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
1410			read_offset, surf_index);
1411}
1412
1413void
1414fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
1415                                                       struct brw_reg dst,
1416                                                       struct brw_reg index,
1417                                                       struct brw_reg payload)
1418{
1419   assert(index.type == BRW_REGISTER_TYPE_UD);
1420   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1421   assert(type_sz(dst.type) == 4);
1422
1423   if (index.file == BRW_IMMEDIATE_VALUE) {
1424      const uint32_t surf_index = index.ud;
1425
1426      brw_push_insn_state(p);
1427      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1428      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1429      brw_pop_insn_state(p);
1430
1431      brw_inst_set_sfid(devinfo, send, GEN6_SFID_DATAPORT_CONSTANT_CACHE);
1432      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
1433      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
1434      brw_set_desc(p, send,
1435                   brw_message_desc(devinfo, 1, DIV_ROUND_UP(inst->size_written,
1436                                                             REG_SIZE), true) |
1437                   brw_dp_read_desc(devinfo, surf_index,
1438                                    BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1439                                    GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1440                                    BRW_DATAPORT_READ_TARGET_DATA_CACHE));
1441
1442   } else {
1443      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1444
1445      brw_push_insn_state(p);
1446      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1447
1448      /* a0.0 = surf_index & 0xff */
1449      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1450      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1451      brw_set_dest(p, insn_and, addr);
1452      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1453      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1454
1455      /* dst = send(payload, a0.0 | <descriptor>) */
1456      brw_send_indirect_message(
1457         p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1458         retype(dst, BRW_REGISTER_TYPE_UD),
1459         retype(payload, BRW_REGISTER_TYPE_UD), addr,
1460         brw_message_desc(devinfo, 1,
1461                          DIV_ROUND_UP(inst->size_written, REG_SIZE), true) |
1462         brw_dp_read_desc(devinfo, 0 /* surface */,
1463                          BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1464                          GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1465                          BRW_DATAPORT_READ_TARGET_DATA_CACHE),
1466         false /* EOT */);
1467
1468      brw_pop_insn_state(p);
1469   }
1470}
1471
1472void
1473fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
1474                                                       struct brw_reg dst,
1475                                                       struct brw_reg index)
1476{
1477   assert(devinfo->gen < 7); /* Should use the gen7 variant. */
1478   assert(inst->header_size != 0);
1479   assert(inst->mlen);
1480
1481   assert(index.file == BRW_IMMEDIATE_VALUE &&
1482	  index.type == BRW_REGISTER_TYPE_UD);
1483   uint32_t surf_index = index.ud;
1484
1485   uint32_t simd_mode, rlen, msg_type;
1486   if (inst->exec_size == 16) {
1487      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1488      rlen = 8;
1489   } else {
1490      assert(inst->exec_size == 8);
1491      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1492      rlen = 4;
1493   }
1494
1495   if (devinfo->gen >= 5)
1496      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
1497   else {
1498      /* We always use the SIMD16 message so that we only have to load U, and
1499       * not V or R.
1500       */
1501      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1502      assert(inst->mlen == 3);
1503      assert(inst->size_written == 8 * REG_SIZE);
1504      rlen = 8;
1505      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1506   }
1507
1508   struct brw_reg header = brw_vec8_grf(0, 0);
1509   gen6_resolve_implied_move(p, &header, inst->base_mrf);
1510
1511   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1512   brw_inst_set_compression(devinfo, send, false);
1513   brw_inst_set_sfid(devinfo, send, BRW_SFID_SAMPLER);
1514   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1515   brw_set_src0(p, send, header);
1516   if (devinfo->gen < 6)
1517      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);
1518
1519   /* Our surface is set up as floats, regardless of what actual data is
1520    * stored in it.
1521    */
1522   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1523   brw_set_desc(p, send,
1524                brw_message_desc(devinfo, inst->mlen, rlen, inst->header_size) |
1525                brw_sampler_desc(devinfo, surf_index,
1526                                 0, /* sampler (unused) */
1527                                 msg_type, simd_mode, return_format));
1528}
1529
1530void
1531fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
1532                                                struct brw_reg dst,
1533                                                struct brw_reg src,
1534                                                struct brw_reg msg_data,
1535                                                unsigned msg_type)
1536{
1537   const bool has_payload = inst->src[0].file != BAD_FILE;
1538   assert(msg_data.type == BRW_REGISTER_TYPE_UD);
1539   assert(inst->size_written % REG_SIZE == 0);
1540
1541   brw_pixel_interpolator_query(p,
1542         retype(dst, BRW_REGISTER_TYPE_UW),
1543         /* If we don't have a payload, what we send doesn't matter */
1544         has_payload ? src : brw_vec8_grf(0, 0),
1545         inst->pi_noperspective,
1546         msg_type,
1547         msg_data,
1548         has_payload ? 2 * inst->exec_size / 8 : 1,
1549         inst->size_written / REG_SIZE);
1550}
1551
1552/* Sets vstride=1, width=4, hstride=0 of register src1 during
1553 * the ADD instruction.
1554 */
1555void
1556fs_generator::generate_set_sample_id(fs_inst *inst,
1557                                     struct brw_reg dst,
1558                                     struct brw_reg src0,
1559                                     struct brw_reg src1)
1560{
1561   assert(dst.type == BRW_REGISTER_TYPE_D ||
1562          dst.type == BRW_REGISTER_TYPE_UD);
1563   assert(src0.type == BRW_REGISTER_TYPE_D ||
1564          src0.type == BRW_REGISTER_TYPE_UD);
1565
1566   const struct brw_reg reg = stride(src1, 1, 4, 0);
1567   const unsigned lower_size = MIN2(inst->exec_size,
1568                                    devinfo->gen >= 8 ? 16 : 8);
1569
1570   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1571      brw_inst *insn = brw_ADD(p, offset(dst, i * lower_size / 8),
1572                               offset(src0, (src0.vstride == 0 ? 0 : (1 << (src0.vstride - 1)) *
1573                                             (i * lower_size / (1 << src0.width))) *
1574                                            type_sz(src0.type) / REG_SIZE),
1575                               suboffset(reg, i * lower_size / 4));
1576      brw_inst_set_exec_size(devinfo, insn, cvt(lower_size) - 1);
1577      brw_inst_set_group(devinfo, insn, inst->group + lower_size * i);
1578      brw_inst_set_compression(devinfo, insn, lower_size > 8);
1579   }
1580}
1581
1582void
1583fs_generator::generate_pack_half_2x16_split(fs_inst *,
1584                                            struct brw_reg dst,
1585                                            struct brw_reg x,
1586                                            struct brw_reg y)
1587{
1588   assert(devinfo->gen >= 7);
1589   assert(dst.type == BRW_REGISTER_TYPE_UD);
1590   assert(x.type == BRW_REGISTER_TYPE_F);
1591   assert(y.type == BRW_REGISTER_TYPE_F);
1592
1593   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1594    *
1595    *   Because this instruction does not have a 16-bit floating-point type,
1596    *   the destination data type must be Word (W).
1597    *
1598    *   The destination must be DWord-aligned and specify a horizontal stride
1599    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
1600    *   each destination channel and the upper word is not modified.
1601    */
1602   struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1603
1604   /* Give each 32-bit channel of dst the form below, where "." means
1605    * unchanged.
1606    *   0x....hhhh
1607    */
1608   brw_F32TO16(p, dst_w, y);
1609
1610   /* Now the form:
1611    *   0xhhhh0000
1612    */
1613   brw_SHL(p, dst, dst, brw_imm_ud(16u));
1614
1615   /* And, finally the form of packHalf2x16's output:
1616    *   0xhhhhllll
1617    */
1618   brw_F32TO16(p, dst_w, x);
1619}
1620
1621void
1622fs_generator::generate_shader_time_add(fs_inst *,
1623                                       struct brw_reg payload,
1624                                       struct brw_reg offset,
1625                                       struct brw_reg value)
1626{
1627   assert(devinfo->gen >= 7);
1628   brw_push_insn_state(p);
1629   brw_set_default_mask_control(p, true);
1630
1631   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1632   struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1633                                          offset.type);
1634   struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1635                                         value.type);
1636
1637   assert(offset.file == BRW_IMMEDIATE_VALUE);
1638   if (value.file == BRW_GENERAL_REGISTER_FILE) {
1639      value.width = BRW_WIDTH_1;
1640      value.hstride = BRW_HORIZONTAL_STRIDE_0;
1641      value.vstride = BRW_VERTICAL_STRIDE_0;
1642   } else {
1643      assert(value.file == BRW_IMMEDIATE_VALUE);
1644   }
1645
1646   /* Trying to deal with setup of the params from the IR is crazy in the FS8
1647    * case, and we don't really care about squeezing every bit of performance
1648    * out of this path, so we just emit the MOVs from here.
1649    */
1650   brw_MOV(p, payload_offset, offset);
1651   brw_MOV(p, payload_value, value);
1652   brw_shader_time_add(p, payload,
1653                       prog_data->binding_table.shader_time_start);
1654   brw_pop_insn_state(p);
1655}
1656
1657void
1658fs_generator::enable_debug(const char *shader_name)
1659{
1660   debug_flag = true;
1661   this->shader_name = shader_name;
1662}
1663
/**
 * Emit native code for every instruction of @p cfg into the codegen
 * buffer @c p.
 *
 * The program start is first padded with NOPs to a 64-byte boundary.  Then,
 * for each IR instruction, the default instruction state (compression,
 * group, access mode, predication, flag register, saturate, mask control,
 * accumulator write control and execution size) is set up and the opcode is
 * dispatched to the matching brw_*() emitter or generate_*() helper.  After
 * the loop brw_set_uip_jip() is run, the program is validated (always when
 * NDEBUG is not defined, otherwise only when debug output is enabled),
 * compacted, and statistics are reported.
 *
 * @param cfg             control flow graph whose instructions are emitted
 * @param dispatch_width  SIMD width; stored in this->dispatch_width and
 *                        printed in the statistics messages
 * @return offset in @c p of the first instruction of this program, i.e. the
 *         position right after the alignment padding
 */
int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
{
   /* align to 64 byte boundary. */
   while (p->next_insn_offset % 64)
      brw_NOP(p);

   this->dispatch_width = dispatch_width;

   /* Offset of the first instruction of this program.  It is the return
    * value and the start of the range that is fixed up, validated and
    * compacted once the main loop is done.
    */
   int start_offset = p->next_insn_offset;
   /* Counters that only feed the statistics messages at the end. */
   int spill_count = 0, fill_count = 0;
   int loop_count = 0;

   struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);

   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      struct brw_reg src[4], dst;
      /* Where the native instruction for this IR instruction will be placed;
       * used after the switch to patch bits on that instruction.
       */
      unsigned int last_insn_offset = p->next_insn_offset;
      /* Set from the return value of generate_linterp(); when true the
       * single-instruction fixups after the switch are skipped.
       */
      bool multiple_instructions_emitted = false;

      /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
       * "Register Region Restrictions" section: for BDW, SKL:
       *
       *    "A POW/FDIV operation must not be followed by an instruction
       *     that requires two destination registers."
       *
       * The documentation is often lacking annotations for Atom parts,
       * and empirically this affects CHV as well.
       */
      if (devinfo->gen >= 8 &&
          devinfo->gen <= 9 &&
          p->nr_insn > 1 &&
          brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
          brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
          inst->dst.component_size(inst->exec_size) > REG_SIZE) {
         brw_NOP(p);
         /* Step past the NOP so the fixups after the switch target the
          * instruction emitted for this IR instruction, not the NOP.
          */
         last_insn_offset = p->next_insn_offset;
      }

      if (unlikely(debug_flag))
         disasm_annotate(disasm_info, inst, p->next_insn_offset);

      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed.  For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */
      const bool compressed =
           inst->dst.component_size(inst->exec_size) > REG_SIZE;
      brw_set_default_compression(p, compressed);
      brw_set_default_group(p, inst->group);

      /* Convert the IR operands into hardware register descriptions. */
      for (unsigned int i = 0; i < inst->sources; i++) {
         src[i] = brw_reg_from_fs_reg(devinfo, inst,
                                      &inst->src[i], compressed);
	 /* The accumulator result appears to get used for the
	  * conditional modifier generation.  When negating a UD
	  * value, there is a 33rd bit generated for the sign in the
	  * accumulator value, so now you can't check, for example,
	  * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
	  */
	 assert(!inst->conditional_mod ||
		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
		!inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(devinfo, inst,
                                &inst->dst, compressed);

      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      /* On gen7 and above, hardware automatically adds the group onto the
       * flag subregister number.  On Sandy Bridge and older, we have to do it
       * ourselves.
       */
      const unsigned flag_subreg = inst->flag_subreg +
         (devinfo->gen >= 7 ? 0 : inst->group / 16);
      brw_set_default_flag_reg(p, flag_subreg / 2, flag_subreg % 2);
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
      brw_set_default_acc_write_control(p, inst->writes_accumulator);

      /* On gen7 parts other than Haswell the execution size is doubled when
       * the execution type or the destination type is 8 bytes wide.
       * NOTE(review): presumably because that hardware counts a 64-bit
       * channel as two 32-bit ones -- confirm against the PRM.
       */
      unsigned exec_size = inst->exec_size;
      if (devinfo->gen == 7 && !devinfo->is_haswell &&
          (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) {
         exec_size *= 2;
      }

      brw_set_default_exec_size(p, cvt(exec_size) - 1);

      assert(inst->force_writemask_all || inst->exec_size >= 4);
      assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);

      /* Dispatch on the IR opcode.  Most cases map onto one brw_*() emitter;
       * the rest go through a generate_*() helper of this class.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
	 brw_MOV(p, dst, src[0]);
	 break;
      case BRW_OPCODE_ADD:
	 brw_ADD(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_MUL:
	 brw_MUL(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_AVG:
	 brw_AVG(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_MACH:
	 brw_MACH(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_LINE:
         brw_LINE(p, dst, src[0], src[1]);
         break;

      /* The three-source opcodes below (MAD, LRP, CSEL, BFE, BFI2) switch
       * to Align16 access mode on everything older than gen10.
       */
      case BRW_OPCODE_MAD:
         assert(devinfo->gen >= 6);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_MAD(p, dst, src[0], src[1], src[2]);
	 break;

      case BRW_OPCODE_LRP:
         assert(devinfo->gen >= 6 && devinfo->gen <= 10);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_LRP(p, dst, src[0], src[1], src[2]);
	 break;

      case BRW_OPCODE_FRC:
	 brw_FRC(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDD:
	 brw_RNDD(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDE:
	 brw_RNDE(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDZ:
	 brw_RNDZ(p, dst, src[0]);
	 break;

      case BRW_OPCODE_AND:
	 brw_AND(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_OR:
	 brw_OR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_XOR:
	 brw_XOR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_NOT:
	 brw_NOT(p, dst, src[0]);
	 break;
      case BRW_OPCODE_ASR:
	 brw_ASR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_SHR:
	 brw_SHR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_SHL:
	 brw_SHL(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_F32TO16:
         assert(devinfo->gen >= 7);
         brw_F32TO16(p, dst, src[0]);
         break;
      case BRW_OPCODE_F16TO32:
         assert(devinfo->gen >= 7);
         brw_F16TO32(p, dst, src[0]);
         break;
      case BRW_OPCODE_CMP:
         if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
             dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
             * implemented in the compiler is not sufficient. Overriding the
             * type when the destination is the null register is necessary but
             * not sufficient by itself.
             */
            assert(dst.nr == BRW_ARF_NULL);
            dst.type = BRW_REGISTER_TYPE_D;
         }
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
	 break;
      case BRW_OPCODE_SEL:
	 brw_SEL(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_CSEL:
         assert(devinfo->gen >= 8);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_CSEL(p, dst, src[0], src[1], src[2]);
         break;
      case BRW_OPCODE_BFREV:
         assert(devinfo->gen >= 7);
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                   retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         assert(devinfo->gen >= 7);
         brw_FBH(p, retype(dst, src[0].type), src[0]);
         break;
      case BRW_OPCODE_FBL:
         assert(devinfo->gen >= 7);
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
                 retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_LZD:
         brw_LZD(p, dst, src[0]);
         break;
      case BRW_OPCODE_CBIT:
         assert(devinfo->gen >= 7);
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
                  retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_ADDC:
         assert(devinfo->gen >= 7);
         brw_ADDC(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SUBB:
         assert(devinfo->gen >= 7);
         brw_SUBB(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MAC:
         brw_MAC(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFE:
         assert(devinfo->gen >= 7);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFE(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_BFI1:
         assert(devinfo->gen >= 7);
         brw_BFI1(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFI2:
         assert(devinfo->gen >= 7);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFI2(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_IF:
	 if (inst->src[0].file != BAD_FILE) {
	    /* The instruction has an embedded compare (only allowed on gen6) */
	    assert(devinfo->gen == 6);
	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
	 } else {
	    brw_IF(p, brw_get_default_exec_size(p));
	 }
	 break;

      case BRW_OPCODE_ELSE:
	 brw_ELSE(p);
	 break;
      case BRW_OPCODE_ENDIF:
	 brw_ENDIF(p);
	 break;

      case BRW_OPCODE_DO:
	 brw_DO(p, brw_get_default_exec_size(p));
	 break;

      case BRW_OPCODE_BREAK:
	 brw_BREAK(p);
	 break;
      case BRW_OPCODE_CONTINUE:
         brw_CONT(p);
	 break;

      case BRW_OPCODE_WHILE:
	 brw_WHILE(p);
         /* One loop is counted per WHILE for the statistics. */
         loop_count++;
	 break;

      /* Single-operand math: gen6+ uses gen6_math() with a null second
       * source and no message (mlen == 0); older hardware uses gen4_math()
       * with a message starting at base_mrf.
       */
      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
	 if (devinfo->gen >= 6) {
            assert(inst->mlen == 0);
            assert(devinfo->gen >= 7 || inst->exec_size == 8);
            gen6_math(p, dst, brw_math_function(inst->opcode),
                      src[0], brw_null_reg());
	 } else {
            assert(inst->mlen >= 1);
            assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
            gen4_math(p, dst,
                      brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
	 }
	 break;
      /* Two-operand math.  Note that the gen4_math() path is only handed
       * src[0] here, exactly as in the single-operand case above.
       */
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 6) {
            assert(inst->mlen == 0);
            assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
                   inst->exec_size == 8);
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
         } else {
            assert(inst->mlen >= 1);
            assert(inst->exec_size == 8);
            gen4_math(p, dst, brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
	 }
	 break;
      case FS_OPCODE_LINTERP:
	 multiple_instructions_emitted = generate_linterp(inst, dst, src);
	 break;
      /* PIXEL_X and PIXEL_Y read the same UW source through the same region
       * and differ only in the starting element: 0 for X, 4 for Y.
       */
      case FS_OPCODE_PIXEL_X:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 0 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;
      case FS_OPCODE_PIXEL_Y:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 4 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;

      case SHADER_OPCODE_SEND:
         /* src[3] is only passed on when the instruction has a non-zero
          * ex_mlen; otherwise the null register takes its place.
          */
         generate_send(inst, dst, src[0], src[1], src[2],
                       inst->ex_mlen > 0 ? src[3] : brw_null_reg());
         break;

      case SHADER_OPCODE_GET_BUFFER_SIZE:
         generate_get_buffer_size(inst, dst, src[0], src[1]);
         break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_LOD:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_SAMPLEINFO:
         /* src[0] must be unset here; only src[1] and src[2] are used. */
         assert(inst->src[0].file == BAD_FILE);
         generate_tex(inst, dst, src[1], src[2]);
         break;

      case FS_OPCODE_DDX_COARSE:
      case FS_OPCODE_DDX_FINE:
         generate_ddx(inst, dst, src[0]);
         break;
      case FS_OPCODE_DDY_COARSE:
      case FS_OPCODE_DDY_FINE:
         generate_ddy(inst, dst, src[0]);
	 break;

      /* Scratch writes are counted as spills and scratch reads as fills. */
      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
	 generate_scratch_write(inst, src[0]);
         spill_count++;
	 break;

      case SHADER_OPCODE_GEN4_SCRATCH_READ:
	 generate_scratch_read(inst, dst);
         fill_count++;
	 break;

      case SHADER_OPCODE_GEN7_SCRATCH_READ:
	 generate_scratch_read_gen7(inst, dst);
         fill_count++;
	 break;

      case SHADER_OPCODE_MOV_INDIRECT:
         generate_mov_indirect(inst, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_URB_READ_SIMD8:
      case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
         generate_urb_read(inst, dst, src[0]);
         break;

      case SHADER_OPCODE_URB_WRITE_SIMD8:
      case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
	 generate_urb_write(inst, src[0]);
	 break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         assert(inst->force_writemask_all);
	 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
	 break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         assert(inst->force_writemask_all);
	 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
	 break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
	 generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
	 break;

      case FS_OPCODE_REP_FB_WRITE:
      case FS_OPCODE_FB_WRITE:
	 generate_fb_write(inst, src[0]);
	 break;

      case FS_OPCODE_FB_READ:
         generate_fb_read(inst, dst, src[0]);
         break;

      case FS_OPCODE_DISCARD_JUMP:
         generate_discard_jump(inst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;

      case SHADER_OPCODE_MEMORY_FENCE:
         /* The immediate in src[1] is forwarded as the last argument of
          * brw_memory_fence().
          */
         assert(src[1].file == BRW_IMMEDIATE_VALUE);
         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud);
         break;

      case SHADER_OPCODE_INTERLOCK:
         assert(devinfo->gen >= 9);
         /* The interlock is basically a memory fence issued via sendc */
         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false);
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
         /* Mask selection: all ones when the stage has packed dispatch,
          * otherwise the vmask register for fragment shaders and the dmask
          * register for every other stage.
          */
         const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, stage,
                                          prog_data) ? brw_imm_ud(~0u) :
            stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
            brw_dmask_reg();
         brw_find_live_channel(p, dst, mask);
         break;
      }

      case SHADER_OPCODE_BROADCAST:
         assert(inst->force_writemask_all);
         brw_broadcast(p, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_SHUFFLE:
         generate_shuffle(inst, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_SEL_EXEC:
         /* Two MOVs to the same destination: src[1] is written first with
          * masking disabled, then src[0] with masking enabled.
          */
         assert(inst->force_writemask_all);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_MOV(p, dst, src[1]);
         brw_set_default_mask_control(p, BRW_MASK_ENABLE);
         brw_MOV(p, dst, src[0]);
         break;

      case SHADER_OPCODE_QUAD_SWIZZLE:
         assert(src[1].file == BRW_IMMEDIATE_VALUE);
         assert(src[1].type == BRW_REGISTER_TYPE_UD);
         generate_quad_swizzle(inst, dst, src[0], src[1].ud);
         break;

      case SHADER_OPCODE_CLUSTER_BROADCAST: {
         assert(src[0].type == dst.type);
         assert(!src[0].negate && !src[0].abs);
         assert(src[1].file == BRW_IMMEDIATE_VALUE);
         assert(src[1].type == BRW_REGISTER_TYPE_UD);
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         assert(src[2].type == BRW_REGISTER_TYPE_UD);
         const unsigned component = src[1].ud;
         const unsigned cluster_size = src[2].ud;
         /* Region starting at the requested component with vstride and width
          * equal to the cluster size and a horizontal stride of 0.
          */
         struct brw_reg strided = stride(suboffset(src[0], component),
                                         cluster_size, cluster_size, 0);
         if (type_sz(src[0].type) > 4 &&
             (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
            /* IVB has an issue (which we found empirically) where it reads
             * two address register components per channel for indirectly
             * addressed 64-bit sources.
             *
             * From the Cherryview PRM Vol 7. "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, indirect addressing must not be
             *    used."
             *
             * To work around both of these, we do two integer MOVs instead of
             * one 64-bit MOV.  Because no double value should ever cross a
             * register boundary, it's safe to use the immediate offset in the
             * indirect here to handle adding 4 bytes to the offset and avoid
             * the extra ADD to the register file.
             */
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
                       subscript(strided, BRW_REGISTER_TYPE_D, 0));
            brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
                       subscript(strided, BRW_REGISTER_TYPE_D, 1));
         } else {
            brw_MOV(p, dst, strided);
         }
         break;
      }

      case FS_OPCODE_SET_SAMPLE_ID:
         generate_set_sample_id(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
          generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
          break;

      case FS_OPCODE_PLACEHOLDER_HALT:
         /* This is the place where the final HALT needs to be inserted if
          * we've emitted any discards.  If not, this will emit no code.
          */
         if (!patch_discard_jumps_to_fb_writes()) {
            if (unlikely(debug_flag)) {
               disasm_info->use_tail = true;
            }
         }
         break;

      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
         break;

      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
         break;

      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
         break;

      case CS_OPCODE_CS_TERMINATE:
         generate_cs_terminate(inst, src[0]);
         break;

      case SHADER_OPCODE_BARRIER:
	 generate_barrier(inst, src[0]);
	 break;

      case BRW_OPCODE_DIM:
         /* Haswell only.  The DF source is handed to brw_DIM() retyped as F
          * while the destination stays DF.
          */
         assert(devinfo->is_haswell);
         assert(src[0].type == BRW_REGISTER_TYPE_DF);
         assert(dst.type == BRW_REGISTER_TYPE_DF);
         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
         break;

      case SHADER_OPCODE_RND_MODE:
         assert(src[0].file == BRW_IMMEDIATE_VALUE);
         brw_rounding_mode(p, (brw_rnd_mode) src[0].d);
         break;

      default:
         unreachable("Unsupported opcode");

      case SHADER_OPCODE_LOAD_PAYLOAD:
         unreachable("Should be lowered by lower_load_payload()");
      }

      /* The fixups below patch exactly one native instruction, so they are
       * skipped when the emitter reported that it produced several.
       */
      if (multiple_instructions_emitted)
         continue;

      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         /* Exactly one 16-byte instruction must have been emitted for this
          * IR instruction.
          */
         assert(p->next_insn_offset == last_insn_offset + 16 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

         /* p->store is indexed in instructions, offsets are in bytes. */
         brw_inst *last = &p->store[last_insn_offset / 16];

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
      }
   }

   brw_set_uip_jip(p, start_offset);

   /* end of program sentinel */
   disasm_new_inst_group(disasm_info, p->next_insn_offset);

   /* Without NDEBUG the validator always runs and its result is kept for the
    * assert() further down.  With NDEBUG the call below becomes the body of
    * the if statement, so it only runs when debug output is enabled and its
    * result is discarded.
    */
#ifndef NDEBUG
   bool validated =
#else
   if (unlikely(debug_flag))
#endif
      brw_validate_instructions(devinfo, p->store,
                                start_offset,
                                p->next_insn_offset,
                                disasm_info);

   /* Program size in bytes before and after instruction compaction. */
   int before_size = p->next_insn_offset - start_offset;
   brw_compact_instructions(p, start_offset, disasm_info);
   int after_size = p->next_insn_offset - start_offset;

   if (unlikely(debug_flag)) {
      /* The instruction count is the uncompacted size divided by the 16
       * bytes each instruction occupies; the percentage is the share of
       * bytes removed by compaction.
       */
      fprintf(stderr, "Native code for %s\n"
              "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
              " bytes (%.0f%%)\n",
              shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
              spill_count, fill_count, promoted_constants, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, disasm_info);
   }
   ralloc_free(disasm_info);
   /* The validation result is only checked here, after the optional
    * assembly dump above has run.
    */
   assert(validated);

   /* Same statistics as above, sent to the compiler's debug log callback
    * regardless of debug_flag.
    */
   compiler->shader_debug_log(log_data,
                              "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
                              "%d:%d spills:fills, Promoted %u constants, "
                              "compacted %d to %d bytes.",
                              _mesa_shader_stage_to_abbrev(stage),
                              dispatch_width, before_size / 16,
                              loop_count, cfg->cycle_count, spill_count,
                              fill_count, promoted_constants, before_size,
                              after_size);

   return start_offset;
}
2302
2303const unsigned *
2304fs_generator::get_assembly()
2305{
2306   return brw_get_program(p, &prog_data->program_size);
2307}
2308