brw_fs_builder.h revision 7ec681f3
/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
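
      /* Illustrative usage (editor's sketch, not part of the original
       * header; names are hypothetical): a pass appending code at the end of
       * the program and a pass rewriting an existing instruction in place
       * would typically obtain builders like
       *
       *    const fs_builder bld = fs_builder(s, 16).at_end();
       *    const fs_builder ibld(s, block, inst);
       *
       * where s, block and inst are the shader, basic block and instruction
       * being processed.
       */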

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with a channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }
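
      /* Editor's sketch of a common pattern (not from the original source;
       * names are hypothetical): lowering a SIMD16 operation into two SIMD8
       * halves by selecting each channel group in turn:
       *
       *    for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
       *       const fs_builder hbld = bld.group(8, i);
       *       hbld.MOV(half_dst[i], half_src[i]);  // acts on channels 8*i..8*i+7
       *    }
       */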

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      quarter(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }
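
      /* Worked example of the size computation above (editor's note): with
       * dispatch_width() == 16 and type == BRW_REGISTER_TYPE_F, vgrf(type)
       * allocates DIV_ROUND_UP(1 * 4 * 16, REG_SIZE) = 2 GRFs, assuming the
       * usual 32-byte REG_SIZE, i.e. just enough space for one 16-wide float
       * vector.
       */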

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         /* Use the emit() methods for specific operand counts to ensure that
          * opcode-specific operand fixups occur.
          */
         if (n == 2) {
            return emit(opcode, dst, srcs[0], srcs[1]);
         } else if (n == 3) {
            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
         } else {
            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as an operand for src1, so use
          * the same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }
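
      /* Editor's example (operands hypothetical): clamping a value to a
       * range with two emit_minmax() calls,
       *
       *    bld.emit_minmax(tmp, x, lo, BRW_CONDITIONAL_GE);  // tmp = max(x, lo)
       *    bld.emit_minmax(dst, tmp, hi, BRW_CONDITIONAL_L); // dst = min(tmp, hi)
       */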

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
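
      /* Editor's sketch (names hypothetical): a typical consumer is a send
       * with a dynamically uniform descriptor, e.g.
       *
       *    const src_reg surface = bld.emit_uniformize(nonuniform_surface);
       *
       * after which surface reads the same value for every channel and can
       * be used where scalar semantics are required.
       */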

      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      void
      emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
                     const dst_reg &tmp,
                     unsigned left_offset, unsigned left_stride,
                     unsigned right_offset, unsigned right_stride) const
      {
         dst_reg left, right;
         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
         if ((tmp.type == BRW_REGISTER_TYPE_Q ||
              tmp.type == BRW_REGISTER_TYPE_UQ) &&
             !shader->devinfo->has_64bit_int) {
            switch (opcode) {
            case BRW_OPCODE_MUL:
               /* This will get lowered by integer MUL lowering */
               set_condmod(mod, emit(opcode, right, left, right));
               break;

            case BRW_OPCODE_SEL: {
               /* In order for the comparisons to work out right, we need our
                * comparisons to be strict.
                */
               assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
               if (mod == BRW_CONDITIONAL_GE)
                  mod = BRW_CONDITIONAL_G;

               /* We treat the bottom 32 bits as unsigned regardless of
                * whether or not the integer as a whole is signed.
                */
               dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0);
               dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0);

               /* The upper bits get the same sign as the 64-bit type */
               brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type);
               dst_reg right_high = subscript(right, type32, 1);
               dst_reg left_high = subscript(left, type32, 1);

               /* Build up our comparison:
                *
                *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
                */
               CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD),
                                  retype(right_low, BRW_REGISTER_TYPE_UD), mod);
               set_predicate(BRW_PREDICATE_NORMAL,
                             CMP(null_reg_ud(), left_high, right_high,
                                 BRW_CONDITIONAL_EQ));
               set_predicate_inv(BRW_PREDICATE_NORMAL, true,
                                 CMP(null_reg_ud(), left_high, right_high, mod));

               /* We could use selects here or we could use predicated MOVs
                * because the destination and second source (if it were a SEL)
                * are the same.
                */
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
               break;
            }

            default:
               unreachable("Unsupported 64-bit scan op");
            }
         } else {
            set_condmod(mod, emit(opcode, right, left, right));
         }
      }
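
      /* Editor's note on the region arithmetic above: with a builder of
       * width n, emit_scan_step() computes, for each i in [0, n),
       *
       *    tmp[right_offset + i*right_stride] =
       *       op(tmp[left_offset + i*left_stride],
       *          tmp[right_offset + i*right_stride])
       *
       * i.e. it combines two strided slices of tmp in place.
       */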

      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these, so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               ubld.emit_scan_step(opcode, mod, tmp,
                                   half_width - 1, 0, half_width, 1);
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);
               for (unsigned i = 0; i < dispatch_width(); i += 4)
                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
            }
         }

         for (unsigned i = 4;
              i < MIN2(cluster_size, dispatch_width());
              i *= 2) {
            const fs_builder ubld = exec_all().group(i, 0);
            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);

            if (dispatch_width() > i * 2)
               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);

            if (dispatch_width() > i * 4) {
               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
            }
         }
      }
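
      /* Worked example (editor's illustration): for a SIMD8 scan with
       * cluster_size == 8 and a 32-bit type, the steps above combine the
       * elements of tmp as
       *
       *    group(4, 0): tmp[1,3,5,7] = op(tmp[0,2,4,6], tmp[1,3,5,7])
       *    group(2, 0): tmp[2,3] = op(tmp[1], tmp[2,3])
       *                 tmp[6,7] = op(tmp[5], tmp[6,7])
       *    group(4, 0): tmp[4..7] = op(tmp[3], tmp[4..7])
       *
       * leaving tmp[i] = op(tmp[0], ..., tmp[i]) in every channel, i.e. an
       * inclusive scan.
       */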

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU3(ADD3)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU3(DP4A)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(ROL)
      ALU2(ROR)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */
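
      /* Editor's example (operands hypothetical): each helper above expands
       * to an emit() call with the matching opcode, so
       *
       *    bld.ADD(dst, src0, src1);
       *    bld.MAD(dst, c, a, b);
       *
       * emit BRW_OPCODE_ADD and BRW_OPCODE_MAD instructions with the
       * builder's default execution controls applied.
       */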

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(src1, BRW_REGISTER_TYPE_F),
                                 src2));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }
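
      /* Editor's note: both paths above compute the usual linear
       * interpolation
       *
       *    dst = x * (1 - a) + y * a
       *
       * which yields x at a == 0 and y at a == 1.
       */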

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }
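
      /* Editor's sketch (names hypothetical): gathering one header register
       * followed by two SIMD-wide components,
       *
       *    const src_reg srcs[] = { header, comp0, comp1 };
       *    bld.LOAD_PAYLOAD(payload, srcs, 3, 1);
       *
       * copies the sources into consecutive registers of payload, with
       * size_written covering the header plus each component rounded up to
       * a whole number of registers.
       */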

      instruction *
      UNDEF(const dst_reg &dst) const
      {
         assert(dst.file == VGRF);
         instruction *inst = emit(SHADER_OPCODE_UNDEF,
                                  retype(dst, BRW_REGISTER_TYPE_UD));
         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;

         return inst;
      }

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            FALLTHROUGH;
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gfx6 math, so expand it out. We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gfx6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gfx7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math.
          */
         if ((shader->devinfo->ver == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->ver == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif