1/* -*- c++ -*- */
2/*
3 * Copyright © 2010-2015 Intel Corporation
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 */
24
25#ifndef BRW_VEC4_BUILDER_H
26#define BRW_VEC4_BUILDER_H
27
28#include "brw_ir_vec4.h"
29#include "brw_ir_allocator.h"
30
namespace brw {
   /**
    * Toolbox to assemble a VEC4 IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::fs_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    *
    * Builders are cheap value objects: the chaining helpers (at(), group(),
    * exec_all(), annotate(), ...) each return a modified copy rather than
    * mutating *this, so a builder can be freely specialized per emission
    * site without affecting the caller's builder.
    */
   class vec4_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef brw::src_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef brw::dst_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef vec4_instruction instruction;

      /**
       * Construct a vec4_builder that inserts instructions into \p shader.
       *
       * The insertion point (block/cursor) is left unset; use at() or
       * at_end() before emitting.  \p dispatch_width defaults to 8 channels.
       */
      vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width), _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct a vec4_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size), _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct a vec4_builder that inserts instructions before \p cursor
       * in basic block \p block, inheriting other code generation parameters
       * from this.
       */
      vec4_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         vec4_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct a vec4_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      vec4_builder
      at_end() const
      {
         /* Inserting before the tail sentinel of the exec_list is equivalent
          * to appending.  The cast drops const-ness of the sentinel; block is
          * NULL since no CFG exists yet at this point.
          */
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      vec4_builder
      group(unsigned n, unsigned i) const
      {
         /* Narrowing only: the requested group must fit inside the current
          * dispatch width unless channel masking is disabled entirely.
          */
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         vec4_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      vec4_builder
      exec_all(bool b = true) const
      {
         vec4_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      vec4_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         vec4_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (four for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for four logical
       * components in this IR).
       *
       * The DIV_ROUND_UP term scales the allocation for types wider than
       * 32 bits (e.g. doubles need twice the space).  \p n == 0 yields a
       * null register of the requested type instead of an allocation.
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return retype(dst_reg(VGRF, shader->alloc.allocate(
                                     n * DIV_ROUND_UP(type_sz(type), 4))),
                           type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_vec(dispatch_width()),
                               BRW_REGISTER_TYPE_UD));
      }

      /**
       * Insert an instruction into the program.
       *
       * Copy-constructs a heap instruction out of \p inst on the shader's
       * memory context and defers to emit(instruction *).
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       *
       * Unary math opcodes get their operand legalized and the resulting
       * instruction patched up for hardware-generation quirks.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0))));

         default:
            return emit(instruction(opcode, dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       *
       * Binary math opcodes get both operands legalized and the resulting
       * instruction patched up for hardware-generation quirks.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return fix_math_instruction(
               emit(instruction(opcode, dst,
                                fix_math_operand(src0),
                                fix_math_operand(src1))));

         default:
            return emit(instruction(opcode, dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       *
       * Three-source opcodes have a restricted encoding, so each operand is
       * legalized through fix_3src_operand() first.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dst, src0, src1, src2));
         }
      }

      /**
       * Insert a preallocated instruction into the program.
       *
       * Stamps the builder's default execution controls and debug annotation
       * onto \p inst, then links it in at the current insertion point.
       * Returns \p inst for convenience.
       */
      instruction *
      emit(instruction *inst) const
      {
         inst->exec_size = dispatch_width();
         inst->group = group();
         inst->force_writemask_all = force_writemask_all;
         inst->size_written = inst->exec_size * type_sz(inst->dst.type);
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         /* When the basic block is known, insert through the instruction so
          * the CFG bookkeeping stays consistent; otherwise just splice into
          * the exec_list before the cursor node.
          */
         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         /* Only >= (max) and < (min) make sense for a SEL-based min/max. */
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
       *
       * Both helper instructions are emitted with execution masking disabled
       * so the result is valid regardless of channel enables.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         const vec4_builder ubld = exec_all();
         const dst_reg chan_index =
            writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));

         return src_reg(dst);
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU3(CSEL)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
       */
      instruction *
      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMPN null<d> src0<f> src1<f>
          *
          * Original gfx4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gfx4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * Gfx6 IF with embedded comparison.
       */
      instruction *
      IF(const src_reg &src0, const src_reg &src1,
         brw_conditional_mod condition) const
      {
         /* The embedded-comparison form of IF only exists on gfx6. */
         assert(shader->devinfo->ver == 6);
         return set_condmod(condition,
                            emit(BRW_OPCODE_IF,
                                 null_reg_d(),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
          * we need to reorder the operands.
          */
         assert(shader->devinfo->ver >= 6 && shader->devinfo->ver <= 9);
         return emit(BRW_OPCODE_LRP, dst, a, y, x);
      }

      /** Shader the builder emits code into. */
      backend_shader *shader;

   protected:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for the details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         /* Resolve the negation into a temporary so the consuming
          * instruction sees a plain UD source.
          */
         if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for register access modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
          * able to use vertical stride of zero to replicate the vec4 uniform, like
          *
          *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
          *
          * But you can't, since vertical stride is always four in three-source
          * instructions. Instead, insert a MOV instruction to do the replication so
          * that the three-source instruction can consume it.
          */

         /* The MOV is only needed if the source is a uniform or immediate. */
         if (src.file != UNIFORM && src.file != IMM)
            return src;

         /* Single-value-swizzled uniforms can be encoded directly. */
         if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
            return src;

         const dst_reg expanded = vgrf(src.type);
         emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
         return src_reg(expanded);
      }

      /**
       * Workaround for register access modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* The gfx6 math instruction ignores the source modifiers --
          * swizzle, abs, negate, and at least some parts of the register
          * region description.
          *
          * Rather than trying to enumerate all these cases, *always* expand the
          * operand to a temp GRF for gfx6.
          *
          * For gfx7, keep the operand as-is, except if immediate, which gfx7 still
          * can't use.
          */
         if (shader->devinfo->ver == 6 ||
             (shader->devinfo->ver == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return src_reg(tmp);
         } else {
            return src;
         }
      }

      /**
       * Workaround other weirdness of the math instruction.
       */
      instruction *
      fix_math_instruction(instruction *inst) const
      {
         if (shader->devinfo->ver == 6 &&
             inst->dst.writemask != WRITEMASK_XYZW) {
            /* Gfx6 math apparently can't write a partial writemask: compute
             * into a full temporary, then copy the requested channels to the
             * real destination.  Note the MOV is emitted *after* inst in the
             * instruction stream since inst was already inserted.
             */
            const dst_reg tmp = vgrf(inst->dst.type);
            MOV(inst->dst, src_reg(tmp));
            inst->dst = tmp;

         } else if (shader->devinfo->ver < 6) {
            /* Prior to gfx6 math is a send-like message, so set up the
             * message header and length from the number of present sources.
             */
            const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
            inst->base_mrf = 1;
            inst->mlen = sources;
         }

         return inst;
      }

      /* Current insertion point: instructions go immediately before
       * \c cursor, inside basic block \c block (NULL when no CFG exists).
       */
      bblock_t *block;
      exec_node *cursor;

      /* Default execution controls stamped onto every emitted instruction. */
      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}
645
646#endif
647