/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24b8e80941Smrg
25b8e80941Smrg#ifndef BRW_FS_BUILDER_H
26b8e80941Smrg#define BRW_FS_BUILDER_H
27b8e80941Smrg
28b8e80941Smrg#include "brw_ir_fs.h"
29b8e80941Smrg#include "brw_shader.h"
30b8e80941Smrg
31b8e80941Smrgnamespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
40b8e80941Smrg   class fs_builder {
41b8e80941Smrg   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;
50b8e80941Smrg
51b8e80941Smrg      /**
52b8e80941Smrg       * Construct an fs_builder that inserts instructions into \p shader.
53b8e80941Smrg       * \p dispatch_width gives the native execution width of the program.
54b8e80941Smrg       */
55b8e80941Smrg      fs_builder(backend_shader *shader,
56b8e80941Smrg                 unsigned dispatch_width) :
57b8e80941Smrg         shader(shader), block(NULL), cursor(NULL),
58b8e80941Smrg         _dispatch_width(dispatch_width),
59b8e80941Smrg         _group(0),
60b8e80941Smrg         force_writemask_all(false),
61b8e80941Smrg         annotation()
62b8e80941Smrg      {
63b8e80941Smrg      }
64b8e80941Smrg
65b8e80941Smrg      /**
66b8e80941Smrg       * Construct an fs_builder that inserts instructions into \p shader
67b8e80941Smrg       * before instruction \p inst in basic block \p block.  The default
68b8e80941Smrg       * execution controls and debug annotation are initialized from the
69b8e80941Smrg       * instruction passed as argument.
70b8e80941Smrg       */
71b8e80941Smrg      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
72b8e80941Smrg         shader(shader), block(block), cursor(inst),
73b8e80941Smrg         _dispatch_width(inst->exec_size),
74b8e80941Smrg         _group(inst->group),
75b8e80941Smrg         force_writemask_all(inst->force_writemask_all)
76b8e80941Smrg      {
77b8e80941Smrg         annotation.str = inst->annotation;
78b8e80941Smrg         annotation.ir = inst->ir;
79b8e80941Smrg      }
80b8e80941Smrg
81b8e80941Smrg      /**
82b8e80941Smrg       * Construct an fs_builder that inserts instructions before \p cursor in
83b8e80941Smrg       * basic block \p block, inheriting other code generation parameters
84b8e80941Smrg       * from this.
85b8e80941Smrg       */
86b8e80941Smrg      fs_builder
87b8e80941Smrg      at(bblock_t *block, exec_node *cursor) const
88b8e80941Smrg      {
89b8e80941Smrg         fs_builder bld = *this;
90b8e80941Smrg         bld.block = block;
91b8e80941Smrg         bld.cursor = cursor;
92b8e80941Smrg         return bld;
93b8e80941Smrg      }
94b8e80941Smrg
95b8e80941Smrg      /**
96b8e80941Smrg       * Construct an fs_builder appending instructions at the end of the
97b8e80941Smrg       * instruction list of the shader, inheriting other code generation
98b8e80941Smrg       * parameters from this.
99b8e80941Smrg       */
100b8e80941Smrg      fs_builder
101b8e80941Smrg      at_end() const
102b8e80941Smrg      {
103b8e80941Smrg         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
104b8e80941Smrg      }
105b8e80941Smrg
106b8e80941Smrg      /**
107b8e80941Smrg       * Construct a builder specifying the default SIMD width and group of
108b8e80941Smrg       * channel enable signals, inheriting other code generation parameters
109b8e80941Smrg       * from this.
110b8e80941Smrg       *
111b8e80941Smrg       * \p n gives the default SIMD width, \p i gives the slot group used for
112b8e80941Smrg       * predication and control flow masking in multiples of \p n channels.
113b8e80941Smrg       */
114b8e80941Smrg      fs_builder
115b8e80941Smrg      group(unsigned n, unsigned i) const
116b8e80941Smrg      {
117b8e80941Smrg         fs_builder bld = *this;
118b8e80941Smrg
119b8e80941Smrg         if (n <= dispatch_width() && i < dispatch_width() / n) {
120b8e80941Smrg            bld._group += i * n;
121b8e80941Smrg         } else {
122b8e80941Smrg            /* The requested channel group isn't a subset of the channel group
123b8e80941Smrg             * of this builder, which means that the resulting instructions
124b8e80941Smrg             * would use (potentially undefined) channel enable signals not
125b8e80941Smrg             * specified by the parent builder.  That's only valid if the
126b8e80941Smrg             * instruction doesn't have per-channel semantics, in which case
127b8e80941Smrg             * we should clear off the default group index in order to prevent
128b8e80941Smrg             * emitting instructions with channel group not aligned to their
129b8e80941Smrg             * own execution size.
130b8e80941Smrg             */
131b8e80941Smrg            assert(force_writemask_all);
132b8e80941Smrg            bld._group = 0;
133b8e80941Smrg         }
134b8e80941Smrg
135b8e80941Smrg         bld._dispatch_width = n;
136b8e80941Smrg         return bld;
137b8e80941Smrg      }
138b8e80941Smrg
139b8e80941Smrg      /**
140b8e80941Smrg       * Alias for group() with width equal to eight.
141b8e80941Smrg       */
142b8e80941Smrg      fs_builder
143b8e80941Smrg      half(unsigned i) const
144b8e80941Smrg      {
145b8e80941Smrg         return group(8, i);
146b8e80941Smrg      }
147b8e80941Smrg
148b8e80941Smrg      /**
149b8e80941Smrg       * Construct a builder with per-channel control flow execution masking
150b8e80941Smrg       * disabled if \p b is true.  If control flow execution masking is
151b8e80941Smrg       * already disabled this has no effect.
152b8e80941Smrg       */
153b8e80941Smrg      fs_builder
154b8e80941Smrg      exec_all(bool b = true) const
155b8e80941Smrg      {
156b8e80941Smrg         fs_builder bld = *this;
157b8e80941Smrg         if (b)
158b8e80941Smrg            bld.force_writemask_all = true;
159b8e80941Smrg         return bld;
160b8e80941Smrg      }
161b8e80941Smrg
162b8e80941Smrg      /**
163b8e80941Smrg       * Construct a builder with the given debug annotation info.
164b8e80941Smrg       */
165b8e80941Smrg      fs_builder
166b8e80941Smrg      annotate(const char *str, const void *ir = NULL) const
167b8e80941Smrg      {
168b8e80941Smrg         fs_builder bld = *this;
169b8e80941Smrg         bld.annotation.str = str;
170b8e80941Smrg         bld.annotation.ir = ir;
171b8e80941Smrg         return bld;
172b8e80941Smrg      }
173b8e80941Smrg
174b8e80941Smrg      /**
175b8e80941Smrg       * Get the SIMD width in use.
176b8e80941Smrg       */
177b8e80941Smrg      unsigned
178b8e80941Smrg      dispatch_width() const
179b8e80941Smrg      {
180b8e80941Smrg         return _dispatch_width;
181b8e80941Smrg      }
182b8e80941Smrg
183b8e80941Smrg      /**
184b8e80941Smrg       * Get the channel group in use.
185b8e80941Smrg       */
186b8e80941Smrg      unsigned
187b8e80941Smrg      group() const
188b8e80941Smrg      {
189b8e80941Smrg         return _group;
190b8e80941Smrg      }
191b8e80941Smrg
192b8e80941Smrg      /**
193b8e80941Smrg       * Allocate a virtual register of natural vector size (one for this IR)
194b8e80941Smrg       * and SIMD width.  \p n gives the amount of space to allocate in
195b8e80941Smrg       * dispatch_width units (which is just enough space for one logical
196b8e80941Smrg       * component in this IR).
197b8e80941Smrg       */
198b8e80941Smrg      dst_reg
199b8e80941Smrg      vgrf(enum brw_reg_type type, unsigned n = 1) const
200b8e80941Smrg      {
201b8e80941Smrg         assert(dispatch_width() <= 32);
202b8e80941Smrg
203b8e80941Smrg         if (n > 0)
204b8e80941Smrg            return dst_reg(VGRF, shader->alloc.allocate(
205b8e80941Smrg                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
206b8e80941Smrg                                           REG_SIZE)),
207b8e80941Smrg                           type);
208b8e80941Smrg         else
209b8e80941Smrg            return retype(null_reg_ud(), type);
210b8e80941Smrg      }
211b8e80941Smrg
212b8e80941Smrg      /**
213b8e80941Smrg       * Create a null register of floating type.
214b8e80941Smrg       */
215b8e80941Smrg      dst_reg
216b8e80941Smrg      null_reg_f() const
217b8e80941Smrg      {
218b8e80941Smrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
219b8e80941Smrg      }
220b8e80941Smrg
221b8e80941Smrg      dst_reg
222b8e80941Smrg      null_reg_df() const
223b8e80941Smrg      {
224b8e80941Smrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
225b8e80941Smrg      }
226b8e80941Smrg
227b8e80941Smrg      /**
228b8e80941Smrg       * Create a null register of signed integer type.
229b8e80941Smrg       */
230b8e80941Smrg      dst_reg
231b8e80941Smrg      null_reg_d() const
232b8e80941Smrg      {
233b8e80941Smrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
234b8e80941Smrg      }
235b8e80941Smrg
236b8e80941Smrg      /**
237b8e80941Smrg       * Create a null register of unsigned integer type.
238b8e80941Smrg       */
239b8e80941Smrg      dst_reg
240b8e80941Smrg      null_reg_ud() const
241b8e80941Smrg      {
242b8e80941Smrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
243b8e80941Smrg      }
244b8e80941Smrg
245b8e80941Smrg      /**
246b8e80941Smrg       * Get the mask of SIMD channels enabled by dispatch and not yet
247b8e80941Smrg       * disabled by discard.
248b8e80941Smrg       */
249b8e80941Smrg      src_reg
250b8e80941Smrg      sample_mask_reg() const
251b8e80941Smrg      {
252b8e80941Smrg         if (shader->stage != MESA_SHADER_FRAGMENT) {
253b8e80941Smrg            return brw_imm_d(0xffffffff);
254b8e80941Smrg         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
255b8e80941Smrg            return brw_flag_reg(0, 1);
256b8e80941Smrg         } else {
257b8e80941Smrg            assert(shader->devinfo->gen >= 6 && dispatch_width() <= 16);
258b8e80941Smrg            return retype(brw_vec1_grf((_group >= 16 ? 2 : 1), 7),
259b8e80941Smrg                          BRW_REGISTER_TYPE_UD);
260b8e80941Smrg         }
261b8e80941Smrg      }
262b8e80941Smrg
263b8e80941Smrg      /**
264b8e80941Smrg       * Insert an instruction into the program.
265b8e80941Smrg       */
266b8e80941Smrg      instruction *
267b8e80941Smrg      emit(const instruction &inst) const
268b8e80941Smrg      {
269b8e80941Smrg         return emit(new(shader->mem_ctx) instruction(inst));
270b8e80941Smrg      }
271b8e80941Smrg
272b8e80941Smrg      /**
273b8e80941Smrg       * Create and insert a nullary control instruction into the program.
274b8e80941Smrg       */
275b8e80941Smrg      instruction *
276b8e80941Smrg      emit(enum opcode opcode) const
277b8e80941Smrg      {
278b8e80941Smrg         return emit(instruction(opcode, dispatch_width()));
279b8e80941Smrg      }
280b8e80941Smrg
281b8e80941Smrg      /**
282b8e80941Smrg       * Create and insert a nullary instruction into the program.
283b8e80941Smrg       */
284b8e80941Smrg      instruction *
285b8e80941Smrg      emit(enum opcode opcode, const dst_reg &dst) const
286b8e80941Smrg      {
287b8e80941Smrg         return emit(instruction(opcode, dispatch_width(), dst));
288b8e80941Smrg      }
289b8e80941Smrg
290b8e80941Smrg      /**
291b8e80941Smrg       * Create and insert a unary instruction into the program.
292b8e80941Smrg       */
293b8e80941Smrg      instruction *
294b8e80941Smrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
295b8e80941Smrg      {
296b8e80941Smrg         switch (opcode) {
297b8e80941Smrg         case SHADER_OPCODE_RCP:
298b8e80941Smrg         case SHADER_OPCODE_RSQ:
299b8e80941Smrg         case SHADER_OPCODE_SQRT:
300b8e80941Smrg         case SHADER_OPCODE_EXP2:
301b8e80941Smrg         case SHADER_OPCODE_LOG2:
302b8e80941Smrg         case SHADER_OPCODE_SIN:
303b8e80941Smrg         case SHADER_OPCODE_COS:
304b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst,
305b8e80941Smrg                                    fix_math_operand(src0)));
306b8e80941Smrg
307b8e80941Smrg         default:
308b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst, src0));
309b8e80941Smrg         }
310b8e80941Smrg      }
311b8e80941Smrg
312b8e80941Smrg      /**
313b8e80941Smrg       * Create and insert a binary instruction into the program.
314b8e80941Smrg       */
315b8e80941Smrg      instruction *
316b8e80941Smrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
317b8e80941Smrg           const src_reg &src1) const
318b8e80941Smrg      {
319b8e80941Smrg         switch (opcode) {
320b8e80941Smrg         case SHADER_OPCODE_POW:
321b8e80941Smrg         case SHADER_OPCODE_INT_QUOTIENT:
322b8e80941Smrg         case SHADER_OPCODE_INT_REMAINDER:
323b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst,
324b8e80941Smrg                                    fix_math_operand(src0),
325b8e80941Smrg                                    fix_math_operand(fix_byte_src(src1))));
326b8e80941Smrg
327b8e80941Smrg         default:
328b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst,
329b8e80941Smrg                                    src0, fix_byte_src(src1)));
330b8e80941Smrg
331b8e80941Smrg         }
332b8e80941Smrg      }
333b8e80941Smrg
334b8e80941Smrg      /**
335b8e80941Smrg       * Create and insert a ternary instruction into the program.
336b8e80941Smrg       */
337b8e80941Smrg      instruction *
338b8e80941Smrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
339b8e80941Smrg           const src_reg &src1, const src_reg &src2) const
340b8e80941Smrg      {
341b8e80941Smrg         switch (opcode) {
342b8e80941Smrg         case BRW_OPCODE_BFE:
343b8e80941Smrg         case BRW_OPCODE_BFI2:
344b8e80941Smrg         case BRW_OPCODE_MAD:
345b8e80941Smrg         case BRW_OPCODE_LRP:
346b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst,
347b8e80941Smrg                                    fix_3src_operand(src0),
348b8e80941Smrg                                    fix_3src_operand(fix_byte_src(src1)),
349b8e80941Smrg                                    fix_3src_operand(fix_byte_src(src2))));
350b8e80941Smrg
351b8e80941Smrg         default:
352b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst,
353b8e80941Smrg                                    src0, fix_byte_src(src1), fix_byte_src(src2)));
354b8e80941Smrg         }
355b8e80941Smrg      }
356b8e80941Smrg
357b8e80941Smrg      /**
358b8e80941Smrg       * Create and insert an instruction with a variable number of sources
359b8e80941Smrg       * into the program.
360b8e80941Smrg       */
361b8e80941Smrg      instruction *
362b8e80941Smrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
363b8e80941Smrg           unsigned n) const
364b8e80941Smrg      {
365b8e80941Smrg         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
366b8e80941Smrg      }
367b8e80941Smrg
368b8e80941Smrg      /**
369b8e80941Smrg       * Insert a preallocated instruction into the program.
370b8e80941Smrg       */
371b8e80941Smrg      instruction *
372b8e80941Smrg      emit(instruction *inst) const
373b8e80941Smrg      {
374b8e80941Smrg         assert(inst->exec_size <= 32);
375b8e80941Smrg         assert(inst->exec_size == dispatch_width() ||
376b8e80941Smrg                force_writemask_all);
377b8e80941Smrg
378b8e80941Smrg         inst->group = _group;
379b8e80941Smrg         inst->force_writemask_all = force_writemask_all;
380b8e80941Smrg         inst->annotation = annotation.str;
381b8e80941Smrg         inst->ir = annotation.ir;
382b8e80941Smrg
383b8e80941Smrg         if (block)
384b8e80941Smrg            static_cast<instruction *>(cursor)->insert_before(block, inst);
385b8e80941Smrg         else
386b8e80941Smrg            cursor->insert_before(inst);
387b8e80941Smrg
388b8e80941Smrg         return inst;
389b8e80941Smrg      }
390b8e80941Smrg
391b8e80941Smrg      /**
392b8e80941Smrg       * Select \p src0 if the comparison of both sources with the given
393b8e80941Smrg       * conditional mod evaluates to true, otherwise select \p src1.
394b8e80941Smrg       *
395b8e80941Smrg       * Generally useful to get the minimum or maximum of two values.
396b8e80941Smrg       */
397b8e80941Smrg      instruction *
398b8e80941Smrg      emit_minmax(const dst_reg &dst, const src_reg &src0,
399b8e80941Smrg                  const src_reg &src1, brw_conditional_mod mod) const
400b8e80941Smrg      {
401b8e80941Smrg         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
402b8e80941Smrg
403b8e80941Smrg         /* In some cases we can't have bytes as operand for src1, so use the
404b8e80941Smrg          * same type for both operand.
405b8e80941Smrg          */
406b8e80941Smrg         return set_condmod(mod, SEL(dst, fix_unsigned_negate(fix_byte_src(src0)),
407b8e80941Smrg                                     fix_unsigned_negate(fix_byte_src(src1))));
408b8e80941Smrg      }
409b8e80941Smrg
410b8e80941Smrg      /**
411b8e80941Smrg       * Copy any live channel from \p src to the first channel of the result.
412b8e80941Smrg       */
413b8e80941Smrg      src_reg
414b8e80941Smrg      emit_uniformize(const src_reg &src) const
415b8e80941Smrg      {
416b8e80941Smrg         /* FIXME: We use a vector chan_index and dst to allow constant and
417b8e80941Smrg          * copy propagration to move result all the way into the consuming
418b8e80941Smrg          * instruction (typically a surface index or sampler index for a
419b8e80941Smrg          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
420b8e80941Smrg          * dispatch. Once we teach const/copy propagation about scalars we
421b8e80941Smrg          * should go back to scalar destinations here.
422b8e80941Smrg          */
423b8e80941Smrg         const fs_builder ubld = exec_all();
424b8e80941Smrg         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
425b8e80941Smrg         const dst_reg dst = vgrf(src.type);
426b8e80941Smrg
427b8e80941Smrg         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)->flag_subreg = 2;
428b8e80941Smrg         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
429b8e80941Smrg
430b8e80941Smrg         return src_reg(component(dst, 0));
431b8e80941Smrg      }
432b8e80941Smrg
433b8e80941Smrg      src_reg
434b8e80941Smrg      move_to_vgrf(const src_reg &src, unsigned num_components) const
435b8e80941Smrg      {
436b8e80941Smrg         src_reg *const src_comps = new src_reg[num_components];
437b8e80941Smrg         for (unsigned i = 0; i < num_components; i++)
438b8e80941Smrg            src_comps[i] = offset(src, dispatch_width(), i);
439b8e80941Smrg
440b8e80941Smrg         const dst_reg dst = vgrf(src.type, num_components);
441b8e80941Smrg         LOAD_PAYLOAD(dst, src_comps, num_components, 0);
442b8e80941Smrg
443b8e80941Smrg         delete[] src_comps;
444b8e80941Smrg
445b8e80941Smrg         return src_reg(dst);
446b8e80941Smrg      }
447b8e80941Smrg
      /**
       * Combine the channels of \p tmp in place using \p opcode under
       * conditional mod \p mod, within clusters of \p cluster_size channels.
       *
       * NOTE(review): the access pattern (each step folds the last element
       * of the preceding sub-cluster into the following elements) looks like
       * an inclusive scan building block — confirm against the callers.
       */
      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            /* Recurse on each half at half the width, then combine the last
             * channel of the left half into the whole right half if the
             * cluster spans both halves.
             */
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               src_reg left_comp = component(left, half_width - 1);
               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
            }
            return;
         }

         if (cluster_size > 1) {
            /* Fold each even channel into the following odd channel
             * (stride-2 regions select the two interleaved sets).
             */
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            const dst_reg left = horiz_stride(tmp, 2);
            const dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               /* Fold channel 1 of each 4-channel cluster into channels 2
                * and 3 of the same cluster.
                */
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);

               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               right = horiz_stride(horiz_offset(tmp, 3), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);

               for (unsigned i = 0; i < dispatch_width(); i += 4) {
                  src_reg left = component(tmp, i + 1);
                  dst_reg right = horiz_offset(tmp, i + 2);
                  set_condmod(mod, ubld.emit(opcode, right, left, right));
               }
            }
         }

         if (cluster_size > 4) {
            /* Fold channel 3 into channels 4-7, once per 8-channel group. */
            const fs_builder ubld = exec_all().group(4, 0);
            src_reg left = component(tmp, 3);
            dst_reg right = horiz_offset(tmp, 4);
            set_condmod(mod, ubld.emit(opcode, right, left, right));

            if (dispatch_width() > 8) {
               left = component(tmp, 8 + 3);
               right = horiz_offset(tmp, 8 + 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }
         }

         if (cluster_size > 8 && dispatch_width() > 8) {
            /* Fold channel 7 into channels 8-15. */
            const fs_builder ubld = exec_all().group(8, 0);
            src_reg left = component(tmp, 7);
            dst_reg right = horiz_offset(tmp, 8);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }
      }
526b8e80941Smrg
      /**
       * Assorted arithmetic ops.
       * @{
       */
      /* Define a member emitting the unary instruction BRW_OPCODE_<op>. */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

      /* Define a member emitting the binary instruction BRW_OPCODE_<op>. */
#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

      /* Like ALU2, but the emitted instruction also writes the accumulator. */
#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

      /* Define a member emitting the ternary instruction BRW_OPCODE_<op>. */
#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      /* One emitter member per hardware opcode. */
      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */
610b8e80941Smrg
611b8e80941Smrg      /**
612b8e80941Smrg       * CMP: Sets the low bit of the destination channels with the result
613b8e80941Smrg       * of the comparison, while the upper bits are undefined, and updates
614b8e80941Smrg       * the flag register with the packed 16 bits of the result.
615b8e80941Smrg       */
616b8e80941Smrg      instruction *
617b8e80941Smrg      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
618b8e80941Smrg          brw_conditional_mod condition) const
619b8e80941Smrg      {
620b8e80941Smrg         /* Take the instruction:
621b8e80941Smrg          *
622b8e80941Smrg          * CMP null<d> src0<f> src1<f>
623b8e80941Smrg          *
624b8e80941Smrg          * Original gen4 does type conversion to the destination type
625b8e80941Smrg          * before comparison, producing garbage results for floating
626b8e80941Smrg          * point comparisons.
627b8e80941Smrg          *
628b8e80941Smrg          * The destination type doesn't matter on newer generations,
629b8e80941Smrg          * so we set the type to match src0 so we can compact the
630b8e80941Smrg          * instruction.
631b8e80941Smrg          */
632b8e80941Smrg         return set_condmod(condition,
633b8e80941Smrg                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
634b8e80941Smrg                                 fix_unsigned_negate(src0),
635b8e80941Smrg                                 fix_unsigned_negate(src1)));
636b8e80941Smrg      }
637b8e80941Smrg
638b8e80941Smrg      /**
639b8e80941Smrg       * Gen4 predicated IF.
640b8e80941Smrg       */
641b8e80941Smrg      instruction *
642b8e80941Smrg      IF(brw_predicate predicate) const
643b8e80941Smrg      {
644b8e80941Smrg         return set_predicate(predicate, emit(BRW_OPCODE_IF));
645b8e80941Smrg      }
646b8e80941Smrg
647b8e80941Smrg      /**
648b8e80941Smrg       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
649b8e80941Smrg       */
650b8e80941Smrg      instruction *
651b8e80941Smrg      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
652b8e80941Smrg           const src_reg &src2, brw_conditional_mod condition) const
653b8e80941Smrg      {
654b8e80941Smrg         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
655b8e80941Smrg          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
656b8e80941Smrg          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
657b8e80941Smrg          */
658b8e80941Smrg         assert(src2.type == BRW_REGISTER_TYPE_F);
659b8e80941Smrg
660b8e80941Smrg         return set_condmod(condition,
661b8e80941Smrg                            emit(BRW_OPCODE_CSEL,
662b8e80941Smrg                                 retype(dst, BRW_REGISTER_TYPE_F),
663b8e80941Smrg                                 retype(src0, BRW_REGISTER_TYPE_F),
664b8e80941Smrg                                 retype(fix_byte_src(src1), BRW_REGISTER_TYPE_F),
665b8e80941Smrg                                 fix_byte_src(src2)));
666b8e80941Smrg      }
667b8e80941Smrg
668b8e80941Smrg      /**
669b8e80941Smrg       * Emit a linear interpolation instruction.
670b8e80941Smrg       */
671b8e80941Smrg      instruction *
672b8e80941Smrg      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
673b8e80941Smrg          const src_reg &a) const
674b8e80941Smrg      {
675b8e80941Smrg         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
676b8e80941Smrg            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
677b8e80941Smrg             * we need to reorder the operands.
678b8e80941Smrg             */
679b8e80941Smrg            return emit(BRW_OPCODE_LRP, dst, a, y, x);
680b8e80941Smrg
681b8e80941Smrg         } else {
682b8e80941Smrg            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
683b8e80941Smrg            const dst_reg y_times_a = vgrf(dst.type);
684b8e80941Smrg            const dst_reg one_minus_a = vgrf(dst.type);
685b8e80941Smrg            const dst_reg x_times_one_minus_a = vgrf(dst.type);
686b8e80941Smrg
687b8e80941Smrg            MUL(y_times_a, y, a);
688b8e80941Smrg            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
689b8e80941Smrg            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
690b8e80941Smrg            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
691b8e80941Smrg         }
692b8e80941Smrg      }
693b8e80941Smrg
694b8e80941Smrg      /**
695b8e80941Smrg       * Collect a number of registers in a contiguous range of registers.
696b8e80941Smrg       */
697b8e80941Smrg      instruction *
698b8e80941Smrg      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
699b8e80941Smrg                   unsigned sources, unsigned header_size) const
700b8e80941Smrg      {
701b8e80941Smrg         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
702b8e80941Smrg         inst->header_size = header_size;
703b8e80941Smrg         inst->size_written = header_size * REG_SIZE;
704b8e80941Smrg         for (unsigned i = header_size; i < sources; i++) {
705b8e80941Smrg            inst->size_written +=
706b8e80941Smrg               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
707b8e80941Smrg                     REG_SIZE);
708b8e80941Smrg         }
709b8e80941Smrg
710b8e80941Smrg         return inst;
711b8e80941Smrg      }
712b8e80941Smrg
713b8e80941Smrg      backend_shader *shader;
714b8e80941Smrg
715b8e80941Smrg      /**
716b8e80941Smrg       * Byte sized operands are not supported for src1 on Gen11+.
717b8e80941Smrg       */
718b8e80941Smrg      src_reg
719b8e80941Smrg      fix_byte_src(const src_reg &src) const
720b8e80941Smrg      {
721b8e80941Smrg         if ((shader->devinfo->gen < 11 && !shader->devinfo->is_geminilake) ||
722b8e80941Smrg             type_sz(src.type) != 1)
723b8e80941Smrg            return src;
724b8e80941Smrg
725b8e80941Smrg         dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ?
726b8e80941Smrg                             BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D);
727b8e80941Smrg         MOV(temp, src);
728b8e80941Smrg         return src_reg(temp);
729b8e80941Smrg      }
730b8e80941Smrg
731b8e80941Smrg   private:
732b8e80941Smrg      /**
733b8e80941Smrg       * Workaround for negation of UD registers.  See comment in
734b8e80941Smrg       * fs_generator::generate_code() for more details.
735b8e80941Smrg       */
736b8e80941Smrg      src_reg
737b8e80941Smrg      fix_unsigned_negate(const src_reg &src) const
738b8e80941Smrg      {
739b8e80941Smrg         if (src.type == BRW_REGISTER_TYPE_UD &&
740b8e80941Smrg             src.negate) {
741b8e80941Smrg            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
742b8e80941Smrg            MOV(temp, src);
743b8e80941Smrg            return src_reg(temp);
744b8e80941Smrg         } else {
745b8e80941Smrg            return src;
746b8e80941Smrg         }
747b8e80941Smrg      }
748b8e80941Smrg
749b8e80941Smrg      /**
750b8e80941Smrg       * Workaround for source register modes not supported by the ternary
751b8e80941Smrg       * instruction encoding.
752b8e80941Smrg       */
753b8e80941Smrg      src_reg
754b8e80941Smrg      fix_3src_operand(const src_reg &src) const
755b8e80941Smrg      {
756b8e80941Smrg         switch (src.file) {
757b8e80941Smrg         case FIXED_GRF:
758b8e80941Smrg            /* FINISHME: Could handle scalar region, other stride=1 regions */
759b8e80941Smrg            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
760b8e80941Smrg                src.width != BRW_WIDTH_8 ||
761b8e80941Smrg                src.hstride != BRW_HORIZONTAL_STRIDE_1)
762b8e80941Smrg               break;
763b8e80941Smrg            /* fallthrough */
764b8e80941Smrg         case ATTR:
765b8e80941Smrg         case VGRF:
766b8e80941Smrg         case UNIFORM:
767b8e80941Smrg         case IMM:
768b8e80941Smrg            return src;
769b8e80941Smrg         default:
770b8e80941Smrg            break;
771b8e80941Smrg         }
772b8e80941Smrg
773b8e80941Smrg         dst_reg expanded = vgrf(src.type);
774b8e80941Smrg         MOV(expanded, src);
775b8e80941Smrg         return expanded;
776b8e80941Smrg      }
777b8e80941Smrg
778b8e80941Smrg      /**
779b8e80941Smrg       * Workaround for source register modes not supported by the math
780b8e80941Smrg       * instruction.
781b8e80941Smrg       */
782b8e80941Smrg      src_reg
783b8e80941Smrg      fix_math_operand(const src_reg &src) const
784b8e80941Smrg      {
785b8e80941Smrg         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
786b8e80941Smrg          * might be able to do better by doing execsize = 1 math and then
787b8e80941Smrg          * expanding that result out, but we would need to be careful with
788b8e80941Smrg          * masking.
789b8e80941Smrg          *
790b8e80941Smrg          * Gen6 hardware ignores source modifiers (negate and abs) on math
791b8e80941Smrg          * instructions, so we also move to a temp to set those up.
792b8e80941Smrg          *
793b8e80941Smrg          * Gen7 relaxes most of the above restrictions, but still can't use IMM
794b8e80941Smrg          * operands to math
795b8e80941Smrg          */
796b8e80941Smrg         if ((shader->devinfo->gen == 6 &&
797b8e80941Smrg              (src.file == IMM || src.file == UNIFORM ||
798b8e80941Smrg               src.abs || src.negate)) ||
799b8e80941Smrg             (shader->devinfo->gen == 7 && src.file == IMM)) {
800b8e80941Smrg            const dst_reg tmp = vgrf(src.type);
801b8e80941Smrg            MOV(tmp, src);
802b8e80941Smrg            return tmp;
803b8e80941Smrg         } else {
804b8e80941Smrg            return src;
805b8e80941Smrg         }
806b8e80941Smrg      }
807b8e80941Smrg
808b8e80941Smrg      bblock_t *block;
809b8e80941Smrg      exec_node *cursor;
810b8e80941Smrg
811b8e80941Smrg      unsigned _dispatch_width;
812b8e80941Smrg      unsigned _group;
813b8e80941Smrg      bool force_writemask_all;
814b8e80941Smrg
815b8e80941Smrg      /** Debug annotation info. */
816b8e80941Smrg      struct {
817b8e80941Smrg         const char *str;
818b8e80941Smrg         const void *ir;
819b8e80941Smrg      } annotation;
820b8e80941Smrg   };
821b8e80941Smrg}
822b8e80941Smrg
823b8e80941Smrg#endif
824