/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

31namespace brw {
32   /**
33    * Toolbox to assemble an FS IR program out of individual instructions.
34    *
35    * This object is meant to have an interface consistent with
36    * brw::vec4_builder.  They cannot be fully interchangeable because
37    * brw::fs_builder generates scalar code while brw::vec4_builder generates
38    * vector code.
39    */
40   class fs_builder {
41   public:
42      /** Type used in this IR to represent a source of an instruction. */
43      typedef fs_reg src_reg;
44
45      /** Type used in this IR to represent the destination of an instruction. */
46      typedef fs_reg dst_reg;
47
48      /** Type used in this IR to represent an instruction. */
49      typedef fs_inst instruction;
50
51      /**
52       * Construct an fs_builder that inserts instructions into \p shader.
53       * \p dispatch_width gives the native execution width of the program.
54       */
55      fs_builder(backend_shader *shader,
56                 unsigned dispatch_width) :
57         shader(shader), block(NULL), cursor(NULL),
58         _dispatch_width(dispatch_width),
59         _group(0),
60         force_writemask_all(false),
61         annotation()
62      {
63      }
64
65      /**
66       * Construct an fs_builder that inserts instructions into \p shader
67       * before instruction \p inst in basic block \p block.  The default
68       * execution controls and debug annotation are initialized from the
69       * instruction passed as argument.
70       */
71      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
72         shader(shader), block(block), cursor(inst),
73         _dispatch_width(inst->exec_size),
74         _group(inst->group),
75         force_writemask_all(inst->force_writemask_all)
76      {
77         annotation.str = inst->annotation;
78         annotation.ir = inst->ir;
79      }
80
81      /**
82       * Construct an fs_builder that inserts instructions before \p cursor in
83       * basic block \p block, inheriting other code generation parameters
84       * from this.
85       */
86      fs_builder
87      at(bblock_t *block, exec_node *cursor) const
88      {
89         fs_builder bld = *this;
90         bld.block = block;
91         bld.cursor = cursor;
92         return bld;
93      }
94
95      /**
96       * Construct an fs_builder appending instructions at the end of the
97       * instruction list of the shader, inheriting other code generation
98       * parameters from this.
99       */
100      fs_builder
101      at_end() const
102      {
103         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
104      }
105
106      /**
107       * Construct a builder specifying the default SIMD width and group of
108       * channel enable signals, inheriting other code generation parameters
109       * from this.
110       *
111       * \p n gives the default SIMD width, \p i gives the slot group used for
112       * predication and control flow masking in multiples of \p n channels.
113       */
114      fs_builder
115      group(unsigned n, unsigned i) const
116      {
117         fs_builder bld = *this;
118
119         if (n <= dispatch_width() && i < dispatch_width() / n) {
120            bld._group += i * n;
121         } else {
122            /* The requested channel group isn't a subset of the channel group
123             * of this builder, which means that the resulting instructions
124             * would use (potentially undefined) channel enable signals not
125             * specified by the parent builder.  That's only valid if the
126             * instruction doesn't have per-channel semantics, in which case
127             * we should clear off the default group index in order to prevent
128             * emitting instructions with channel group not aligned to their
129             * own execution size.
130             */
131            assert(force_writemask_all);
132            bld._group = 0;
133         }
134
135         bld._dispatch_width = n;
136         return bld;
137      }
138
139      /**
140       * Alias for group() with width equal to eight.
141       */
142      fs_builder
143      half(unsigned i) const
144      {
145         return group(8, i);
146      }
147
148      /**
149       * Construct a builder with per-channel control flow execution masking
150       * disabled if \p b is true.  If control flow execution masking is
151       * already disabled this has no effect.
152       */
153      fs_builder
154      exec_all(bool b = true) const
155      {
156         fs_builder bld = *this;
157         if (b)
158            bld.force_writemask_all = true;
159         return bld;
160      }
161
162      /**
163       * Construct a builder with the given debug annotation info.
164       */
165      fs_builder
166      annotate(const char *str, const void *ir = NULL) const
167      {
168         fs_builder bld = *this;
169         bld.annotation.str = str;
170         bld.annotation.ir = ir;
171         return bld;
172      }
173
174      /**
175       * Get the SIMD width in use.
176       */
177      unsigned
178      dispatch_width() const
179      {
180         return _dispatch_width;
181      }
182
183      /**
184       * Get the channel group in use.
185       */
186      unsigned
187      group() const
188      {
189         return _group;
190      }
191
192      /**
193       * Allocate a virtual register of natural vector size (one for this IR)
194       * and SIMD width.  \p n gives the amount of space to allocate in
195       * dispatch_width units (which is just enough space for one logical
196       * component in this IR).
197       */
198      dst_reg
199      vgrf(enum brw_reg_type type, unsigned n = 1) const
200      {
201         assert(dispatch_width() <= 32);
202
203         if (n > 0)
204            return dst_reg(VGRF, shader->alloc.allocate(
205                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
206                                           REG_SIZE)),
207                           type);
208         else
209            return retype(null_reg_ud(), type);
210      }
211
212      /**
213       * Create a null register of floating type.
214       */
215      dst_reg
216      null_reg_f() const
217      {
218         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
219      }
220
221      dst_reg
222      null_reg_df() const
223      {
224         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
225      }
226
227      /**
228       * Create a null register of signed integer type.
229       */
230      dst_reg
231      null_reg_d() const
232      {
233         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
234      }
235
236      /**
237       * Create a null register of unsigned integer type.
238       */
239      dst_reg
240      null_reg_ud() const
241      {
242         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
243      }
244
245      /**
246       * Get the mask of SIMD channels enabled by dispatch and not yet
247       * disabled by discard.
248       */
249      src_reg
250      sample_mask_reg() const
251      {
252         if (shader->stage != MESA_SHADER_FRAGMENT) {
253            return brw_imm_d(0xffffffff);
254         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
255            return brw_flag_reg(0, 1);
256         } else {
257            assert(shader->devinfo->gen >= 6 && dispatch_width() <= 16);
258            return retype(brw_vec1_grf((_group >= 16 ? 2 : 1), 7),
259                          BRW_REGISTER_TYPE_UD);
260         }
261      }
262
263      /**
264       * Insert an instruction into the program.
265       */
266      instruction *
267      emit(const instruction &inst) const
268      {
269         return emit(new(shader->mem_ctx) instruction(inst));
270      }
271
272      /**
273       * Create and insert a nullary control instruction into the program.
274       */
275      instruction *
276      emit(enum opcode opcode) const
277      {
278         return emit(instruction(opcode, dispatch_width()));
279      }
280
281      /**
282       * Create and insert a nullary instruction into the program.
283       */
284      instruction *
285      emit(enum opcode opcode, const dst_reg &dst) const
286      {
287         return emit(instruction(opcode, dispatch_width(), dst));
288      }
289
290      /**
291       * Create and insert a unary instruction into the program.
292       */
293      instruction *
294      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
295      {
296         switch (opcode) {
297         case SHADER_OPCODE_RCP:
298         case SHADER_OPCODE_RSQ:
299         case SHADER_OPCODE_SQRT:
300         case SHADER_OPCODE_EXP2:
301         case SHADER_OPCODE_LOG2:
302         case SHADER_OPCODE_SIN:
303         case SHADER_OPCODE_COS:
304            return emit(instruction(opcode, dispatch_width(), dst,
305                                    fix_math_operand(src0)));
306
307         default:
308            return emit(instruction(opcode, dispatch_width(), dst, src0));
309         }
310      }
311
312      /**
313       * Create and insert a binary instruction into the program.
314       */
315      instruction *
316      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
317           const src_reg &src1) const
318      {
319         switch (opcode) {
320         case SHADER_OPCODE_POW:
321         case SHADER_OPCODE_INT_QUOTIENT:
322         case SHADER_OPCODE_INT_REMAINDER:
323            return emit(instruction(opcode, dispatch_width(), dst,
324                                    fix_math_operand(src0),
325                                    fix_math_operand(fix_byte_src(src1))));
326
327         default:
328            return emit(instruction(opcode, dispatch_width(), dst,
329                                    src0, fix_byte_src(src1)));
330
331         }
332      }
333
334      /**
335       * Create and insert a ternary instruction into the program.
336       */
337      instruction *
338      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
339           const src_reg &src1, const src_reg &src2) const
340      {
341         switch (opcode) {
342         case BRW_OPCODE_BFE:
343         case BRW_OPCODE_BFI2:
344         case BRW_OPCODE_MAD:
345         case BRW_OPCODE_LRP:
346            return emit(instruction(opcode, dispatch_width(), dst,
347                                    fix_3src_operand(src0),
348                                    fix_3src_operand(fix_byte_src(src1)),
349                                    fix_3src_operand(fix_byte_src(src2))));
350
351         default:
352            return emit(instruction(opcode, dispatch_width(), dst,
353                                    src0, fix_byte_src(src1), fix_byte_src(src2)));
354         }
355      }
356
357      /**
358       * Create and insert an instruction with a variable number of sources
359       * into the program.
360       */
361      instruction *
362      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
363           unsigned n) const
364      {
365         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
366      }
367
368      /**
369       * Insert a preallocated instruction into the program.
370       */
371      instruction *
372      emit(instruction *inst) const
373      {
374         assert(inst->exec_size <= 32);
375         assert(inst->exec_size == dispatch_width() ||
376                force_writemask_all);
377
378         inst->group = _group;
379         inst->force_writemask_all = force_writemask_all;
380         inst->annotation = annotation.str;
381         inst->ir = annotation.ir;
382
383         if (block)
384            static_cast<instruction *>(cursor)->insert_before(block, inst);
385         else
386            cursor->insert_before(inst);
387
388         return inst;
389      }
390
391      /**
392       * Select \p src0 if the comparison of both sources with the given
393       * conditional mod evaluates to true, otherwise select \p src1.
394       *
395       * Generally useful to get the minimum or maximum of two values.
396       */
397      instruction *
398      emit_minmax(const dst_reg &dst, const src_reg &src0,
399                  const src_reg &src1, brw_conditional_mod mod) const
400      {
401         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
402
403         /* In some cases we can't have bytes as operand for src1, so use the
404          * same type for both operand.
405          */
406         return set_condmod(mod, SEL(dst, fix_unsigned_negate(fix_byte_src(src0)),
407                                     fix_unsigned_negate(fix_byte_src(src1))));
408      }
409
410      /**
411       * Copy any live channel from \p src to the first channel of the result.
412       */
413      src_reg
414      emit_uniformize(const src_reg &src) const
415      {
416         /* FIXME: We use a vector chan_index and dst to allow constant and
417          * copy propagration to move result all the way into the consuming
418          * instruction (typically a surface index or sampler index for a
419          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
420          * dispatch. Once we teach const/copy propagation about scalars we
421          * should go back to scalar destinations here.
422          */
423         const fs_builder ubld = exec_all();
424         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
425         const dst_reg dst = vgrf(src.type);
426
427         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)->flag_subreg = 2;
428         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
429
430         return src_reg(component(dst, 0));
431      }
432
433      src_reg
434      move_to_vgrf(const src_reg &src, unsigned num_components) const
435      {
436         src_reg *const src_comps = new src_reg[num_components];
437         for (unsigned i = 0; i < num_components; i++)
438            src_comps[i] = offset(src, dispatch_width(), i);
439
440         const dst_reg dst = vgrf(src.type, num_components);
441         LOAD_PAYLOAD(dst, src_comps, num_components, 0);
442
443         delete[] src_comps;
444
445         return src_reg(dst);
446      }
447
448      void
449      emit_scan(enum opcode opcode, const dst_reg &tmp,
450                unsigned cluster_size, brw_conditional_mod mod) const
451      {
452         assert(dispatch_width() >= 8);
453
454         /* The instruction splitting code isn't advanced enough to split
455          * these so we need to handle that ourselves.
456          */
457         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
458            const unsigned half_width = dispatch_width() / 2;
459            const fs_builder ubld = exec_all().group(half_width, 0);
460            dst_reg left = tmp;
461            dst_reg right = horiz_offset(tmp, half_width);
462            ubld.emit_scan(opcode, left, cluster_size, mod);
463            ubld.emit_scan(opcode, right, cluster_size, mod);
464            if (cluster_size > half_width) {
465               src_reg left_comp = component(left, half_width - 1);
466               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
467            }
468            return;
469         }
470
471         if (cluster_size > 1) {
472            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
473            const dst_reg left = horiz_stride(tmp, 2);
474            const dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
475            set_condmod(mod, ubld.emit(opcode, right, left, right));
476         }
477
478         if (cluster_size > 2) {
479            if (type_sz(tmp.type) <= 4) {
480               const fs_builder ubld =
481                  exec_all().group(dispatch_width() / 4, 0);
482               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);
483
484               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
485               set_condmod(mod, ubld.emit(opcode, right, left, right));
486
487               right = horiz_stride(horiz_offset(tmp, 3), 4);
488               set_condmod(mod, ubld.emit(opcode, right, left, right));
489            } else {
490               /* For 64-bit types, we have to do things differently because
491                * the code above would land us with destination strides that
492                * the hardware can't handle.  Fortunately, we'll only be
493                * 8-wide in that case and it's the same number of
494                * instructions.
495                */
496               const fs_builder ubld = exec_all().group(2, 0);
497
498               for (unsigned i = 0; i < dispatch_width(); i += 4) {
499                  src_reg left = component(tmp, i + 1);
500                  dst_reg right = horiz_offset(tmp, i + 2);
501                  set_condmod(mod, ubld.emit(opcode, right, left, right));
502               }
503            }
504         }
505
506         if (cluster_size > 4) {
507            const fs_builder ubld = exec_all().group(4, 0);
508            src_reg left = component(tmp, 3);
509            dst_reg right = horiz_offset(tmp, 4);
510            set_condmod(mod, ubld.emit(opcode, right, left, right));
511
512            if (dispatch_width() > 8) {
513               left = component(tmp, 8 + 3);
514               right = horiz_offset(tmp, 8 + 4);
515               set_condmod(mod, ubld.emit(opcode, right, left, right));
516            }
517         }
518
519         if (cluster_size > 8 && dispatch_width() > 8) {
520            const fs_builder ubld = exec_all().group(8, 0);
521            src_reg left = component(tmp, 7);
522            dst_reg right = horiz_offset(tmp, 8);
523            set_condmod(mod, ubld.emit(opcode, right, left, right));
524         }
525      }
526
527      /**
528       * Assorted arithmetic ops.
529       * @{
530       */
531#define ALU1(op)                                        \
532      instruction *                                     \
533      op(const dst_reg &dst, const src_reg &src0) const \
534      {                                                 \
535         return emit(BRW_OPCODE_##op, dst, src0);       \
536      }
537
538#define ALU2(op)                                                        \
539      instruction *                                                     \
540      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
541      {                                                                 \
542         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
543      }
544
545#define ALU2_ACC(op)                                                    \
546      instruction *                                                     \
547      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
548      {                                                                 \
549         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
550         inst->writes_accumulator = true;                               \
551         return inst;                                                   \
552      }
553
554#define ALU3(op)                                                        \
555      instruction *                                                     \
556      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
557         const src_reg &src2) const                                     \
558      {                                                                 \
559         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
560      }
561
562      ALU2(ADD)
563      ALU2_ACC(ADDC)
564      ALU2(AND)
565      ALU2(ASR)
566      ALU2(AVG)
567      ALU3(BFE)
568      ALU2(BFI1)
569      ALU3(BFI2)
570      ALU1(BFREV)
571      ALU1(CBIT)
572      ALU2(CMPN)
573      ALU1(DIM)
574      ALU2(DP2)
575      ALU2(DP3)
576      ALU2(DP4)
577      ALU2(DPH)
578      ALU1(F16TO32)
579      ALU1(F32TO16)
580      ALU1(FBH)
581      ALU1(FBL)
582      ALU1(FRC)
583      ALU2(LINE)
584      ALU1(LZD)
585      ALU2(MAC)
586      ALU2_ACC(MACH)
587      ALU3(MAD)
588      ALU1(MOV)
589      ALU2(MUL)
590      ALU1(NOT)
591      ALU2(OR)
592      ALU2(PLN)
593      ALU1(RNDD)
594      ALU1(RNDE)
595      ALU1(RNDU)
596      ALU1(RNDZ)
597      ALU2(SAD2)
598      ALU2_ACC(SADA2)
599      ALU2(SEL)
600      ALU2(SHL)
601      ALU2(SHR)
602      ALU2_ACC(SUBB)
603      ALU2(XOR)
604
605#undef ALU3
606#undef ALU2_ACC
607#undef ALU2
608#undef ALU1
609      /** @} */
610
611      /**
612       * CMP: Sets the low bit of the destination channels with the result
613       * of the comparison, while the upper bits are undefined, and updates
614       * the flag register with the packed 16 bits of the result.
615       */
616      instruction *
617      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
618          brw_conditional_mod condition) const
619      {
620         /* Take the instruction:
621          *
622          * CMP null<d> src0<f> src1<f>
623          *
624          * Original gen4 does type conversion to the destination type
625          * before comparison, producing garbage results for floating
626          * point comparisons.
627          *
628          * The destination type doesn't matter on newer generations,
629          * so we set the type to match src0 so we can compact the
630          * instruction.
631          */
632         return set_condmod(condition,
633                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
634                                 fix_unsigned_negate(src0),
635                                 fix_unsigned_negate(src1)));
636      }
637
638      /**
639       * Gen4 predicated IF.
640       */
641      instruction *
642      IF(brw_predicate predicate) const
643      {
644         return set_predicate(predicate, emit(BRW_OPCODE_IF));
645      }
646
647      /**
648       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
649       */
650      instruction *
651      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
652           const src_reg &src2, brw_conditional_mod condition) const
653      {
654         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
655          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
656          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
657          */
658         assert(src2.type == BRW_REGISTER_TYPE_F);
659
660         return set_condmod(condition,
661                            emit(BRW_OPCODE_CSEL,
662                                 retype(dst, BRW_REGISTER_TYPE_F),
663                                 retype(src0, BRW_REGISTER_TYPE_F),
664                                 retype(fix_byte_src(src1), BRW_REGISTER_TYPE_F),
665                                 fix_byte_src(src2)));
666      }
667
668      /**
669       * Emit a linear interpolation instruction.
670       */
671      instruction *
672      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
673          const src_reg &a) const
674      {
675         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
676            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
677             * we need to reorder the operands.
678             */
679            return emit(BRW_OPCODE_LRP, dst, a, y, x);
680
681         } else {
682            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
683            const dst_reg y_times_a = vgrf(dst.type);
684            const dst_reg one_minus_a = vgrf(dst.type);
685            const dst_reg x_times_one_minus_a = vgrf(dst.type);
686
687            MUL(y_times_a, y, a);
688            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
689            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
690            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
691         }
692      }
693
694      /**
695       * Collect a number of registers in a contiguous range of registers.
696       */
697      instruction *
698      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
699                   unsigned sources, unsigned header_size) const
700      {
701         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
702         inst->header_size = header_size;
703         inst->size_written = header_size * REG_SIZE;
704         for (unsigned i = header_size; i < sources; i++) {
705            inst->size_written +=
706               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
707                     REG_SIZE);
708         }
709
710         return inst;
711      }
712
713      backend_shader *shader;
714
715      /**
716       * Byte sized operands are not supported for src1 on Gen11+.
717       */
718      src_reg
719      fix_byte_src(const src_reg &src) const
720      {
721         if ((shader->devinfo->gen < 11 && !shader->devinfo->is_geminilake) ||
722             type_sz(src.type) != 1)
723            return src;
724
725         dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ?
726                             BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D);
727         MOV(temp, src);
728         return src_reg(temp);
729      }
730
731   private:
732      /**
733       * Workaround for negation of UD registers.  See comment in
734       * fs_generator::generate_code() for more details.
735       */
736      src_reg
737      fix_unsigned_negate(const src_reg &src) const
738      {
739         if (src.type == BRW_REGISTER_TYPE_UD &&
740             src.negate) {
741            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
742            MOV(temp, src);
743            return src_reg(temp);
744         } else {
745            return src;
746         }
747      }
748
749      /**
750       * Workaround for source register modes not supported by the ternary
751       * instruction encoding.
752       */
753      src_reg
754      fix_3src_operand(const src_reg &src) const
755      {
756         switch (src.file) {
757         case FIXED_GRF:
758            /* FINISHME: Could handle scalar region, other stride=1 regions */
759            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
760                src.width != BRW_WIDTH_8 ||
761                src.hstride != BRW_HORIZONTAL_STRIDE_1)
762               break;
763            /* fallthrough */
764         case ATTR:
765         case VGRF:
766         case UNIFORM:
767         case IMM:
768            return src;
769         default:
770            break;
771         }
772
773         dst_reg expanded = vgrf(src.type);
774         MOV(expanded, src);
775         return expanded;
776      }
777
778      /**
779       * Workaround for source register modes not supported by the math
780       * instruction.
781       */
782      src_reg
783      fix_math_operand(const src_reg &src) const
784      {
785         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
786          * might be able to do better by doing execsize = 1 math and then
787          * expanding that result out, but we would need to be careful with
788          * masking.
789          *
790          * Gen6 hardware ignores source modifiers (negate and abs) on math
791          * instructions, so we also move to a temp to set those up.
792          *
793          * Gen7 relaxes most of the above restrictions, but still can't use IMM
794          * operands to math
795          */
796         if ((shader->devinfo->gen == 6 &&
797              (src.file == IMM || src.file == UNIFORM ||
798               src.abs || src.negate)) ||
799             (shader->devinfo->gen == 7 && src.file == IMM)) {
800            const dst_reg tmp = vgrf(src.type);
801            MOV(tmp, src);
802            return tmp;
803         } else {
804            return src;
805         }
806      }
807
808      bblock_t *block;
809      exec_node *cursor;
810
811      unsigned _dispatch_width;
812      unsigned _group;
813      bool force_writemask_all;
814
815      /** Debug annotation info. */
816      struct {
817         const char *str;
818         const void *ir;
819      } annotation;
820   };
821}

#endif
