brw_fs_builder.h revision 01e04c3f
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
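
      /* Rough usage sketch of the constructors above, assuming a
       * hypothetical pass with a visitor "v", a basic block "block" and an
       * instruction "inst" it wants to rewrite:
       *
       *    const fs_builder bld = fs_builder(v, v->dispatch_width).at_end();
       *    const fs_builder ibld = bld.at(block, inst);
       *    ibld.MOV(some_tmp, some_src);   // lands right before "inst"
       *
       * The names "v", "some_tmp" and "some_src" are placeholders for the
       * example.
       */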

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }
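
      /* Rough sketch of how group() is typically used: on a SIMD16 builder,
       * group(8, 0) and group(8, 1) address the low and high halves of the
       * channels, e.g.
       *
       *    bld.group(8, 0).MOV(dst, src);
       *    bld.group(8, 1).MOV(horiz_offset(dst, 8), horiz_offset(src, 8));
       *
       * where "dst" and "src" are assumed to be SIMD16-sized registers.
       * half() below is shorthand for the width-8 case.
       */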

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
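
      /* Worked example of the allocation size above: on a SIMD16 builder,
       * vgrf(BRW_REGISTER_TYPE_F, 2) requests 2 * 4 bytes * 16 channels =
       * 128 bytes, i.e. DIV_ROUND_UP(128, REG_SIZE) = 4 GRFs with the usual
       * 32-byte register size.  Passing n == 0 returns a null register of
       * the requested type instead of allocating anything.
       */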

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            return brw_imm_d(0xffffffff);
         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
            return brw_flag_reg(0, 1);
         } else {
            assert(shader->devinfo->gen >= 6 && dispatch_width() <= 16);
            return retype(brw_vec1_grf((_group >= 16 ? 2 : 1), 7),
                          BRW_REGISTER_TYPE_UD);
         }
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
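
      /* Rough sketch: because SEL with a conditional mod picks src0 whenever
       * "src0 <mod> src1" holds, BRW_CONDITIONAL_GE gives a maximum and
       * BRW_CONDITIONAL_L a minimum, e.g.
       *
       *    bld.emit_minmax(dst, a, b, BRW_CONDITIONAL_GE);   // dst = max(a, b)
       *
       * with "a" and "b" standing in for arbitrary sources.
       */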

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch. Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)->flag_subreg = 2;
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
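
      /* Rough sketch of the usual consumer: a send-like message that needs a
       * dynamically uniform descriptor, e.g. a divergent surface index.
       * With a hypothetical "surface" register,
       *
       *    const src_reg usurface = bld.emit_uniformize(surface);
       *
       * yields a value that reads the same in every channel and can be used
       * as a scalar surface or sampler index source.
       */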

      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               src_reg left_comp = component(left, half_width - 1);
               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            dst_reg left = horiz_stride(tmp, 2);
            dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);

            /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *    integer DWord multiply, regioning in Align1 must follow
             *    these rules:
             *
             *    [...]
             *
             *    3. Source and Destination offset must be the same, except
             *       the case of scalar source."
             *
             * In order to work around this, we create a temporary register
             * and shift left over to match right.  If we have a 64-bit type,
             * we have to use two integer MOVs instead of a 64-bit MOV.
             */
            if (need_matching_subreg_offset(opcode, tmp.type)) {
               dst_reg tmp2 = vgrf(tmp.type);
               dst_reg new_left = horiz_stride(horiz_offset(tmp2, 1), 2);
               if (type_sz(tmp.type) > 4) {
                  ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 0),
                           subscript(left, BRW_REGISTER_TYPE_D, 0));
                  ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 1),
                           subscript(left, BRW_REGISTER_TYPE_D, 1));
               } else {
                  ubld.MOV(new_left, left);
               }
               left = new_left;
            }
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4 &&
                !need_matching_subreg_offset(opcode, tmp.type)) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);

               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               right = horiz_stride(horiz_offset(tmp, 3), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);

               for (unsigned i = 0; i < dispatch_width(); i += 4) {
                  src_reg left = component(tmp, i + 1);
                  dst_reg right = horiz_offset(tmp, i + 2);
                  set_condmod(mod, ubld.emit(opcode, right, left, right));
               }
            }
         }

         if (cluster_size > 4) {
            const fs_builder ubld = exec_all().group(4, 0);
            src_reg left = component(tmp, 3);
            dst_reg right = horiz_offset(tmp, 4);
            set_condmod(mod, ubld.emit(opcode, right, left, right));

            if (dispatch_width() > 8) {
               left = component(tmp, 8 + 3);
               right = horiz_offset(tmp, 8 + 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }
         }

         if (cluster_size > 8 && dispatch_width() > 8) {
            const fs_builder ubld = exec_all().group(8, 0);
            src_reg left = component(tmp, 7);
            dst_reg right = horiz_offset(tmp, 8);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }
      }
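
      /* Worked example of what emit_scan() leaves in \p tmp: an inclusive
       * scan of \p opcode over each cluster of channels.  For an add-scan
       * with cluster_size == 4 over channel values {a, b, c, d, e, f, g, h},
       * the register ends up holding
       *
       *    {a, a+b, a+b+c, a+b+c+d, e, e+f, e+f+g, e+f+g+h}
       *
       * The stride-2 pass combines adjacent pairs and the later passes
       * propagate the partial results across the rest of each cluster.  A
       * hypothetical caller would first copy its source into \p tmp, e.g.
       *
       *    bld.MOV(tmp, src);
       *    bld.emit_scan(BRW_OPCODE_ADD, tmp, 4, BRW_CONDITIONAL_NONE);
       */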

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }
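
      /* Rough sketch: CMP() is normally paired with something that consumes
       * the flag register it writes, e.g. a predicated IF:
       *
       *    bld.CMP(bld.null_reg_f(), a, b, BRW_CONDITIONAL_L);
       *    bld.IF(BRW_PREDICATE_NORMAL);
       *    ...
       *    bld.emit(BRW_OPCODE_ENDIF);
       *
       * with "a" and "b" standing in for arbitrary float sources.
       */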

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(src1, BRW_REGISTER_TYPE_F),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
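
      /* Both branches above compute the same thing,
       *
       *    dst = x * (1 - a) + y * a
       *
       * so dst == x when a == 0 and dst == y when a == 1; the operand
       * reordering in the first branch only adapts to the argument order the
       * hardware LRP instruction expects.
       */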

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }
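
      /* Rough sketch: LOAD_PAYLOAD() is how multi-register message payloads
       * are assembled before a send, e.g. a single header GRF followed by
       * per-channel data:
       *
       *    const src_reg srcs[] = { header, coord_x, coord_y };
       *    bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 1);
       *
       * where "header", "coord_x", "coord_y" and the suitably sized
       * "payload" register are placeholders for the example.
       */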

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math instructions.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
       *
       *    "When source or destination datatype is 64b or operation is
       *    integer DWord multiply, regioning in Align1 must follow
       *    these rules:
       *
       *    [...]
       *
       *    3. Source and Destination offset must be the same, except
       *       the case of scalar source."
       *
       * This helper just detects when we're in this case.
       */
      bool
      need_matching_subreg_offset(enum opcode opcode,
                                  enum brw_reg_type type) const
      {
         if (!shader->devinfo->is_cherryview &&
             !gen_device_info_is_9lp(shader->devinfo))
            return false;

         if (type_sz(type) > 4)
            return true;

         if (opcode == BRW_OPCODE_MUL &&
             !brw_reg_type_is_floating_point(type))
            return true;

         return false;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif