/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
24b8e80941Smrg
25b8e80941Smrg#ifndef BRW_FS_BUILDER_H
26b8e80941Smrg#define BRW_FS_BUILDER_H
27b8e80941Smrg
28b8e80941Smrg#include "brw_ir_fs.h"
29b8e80941Smrg#include "brw_shader.h"
30b8e80941Smrg
31b8e80941Smrgnamespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
40b8e80941Smrg   class fs_builder {
41b8e80941Smrg   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;
50b8e80941Smrg
51b8e80941Smrg      /**
52b8e80941Smrg       * Construct an fs_builder that inserts instructions into \p shader.
53b8e80941Smrg       * \p dispatch_width gives the native execution width of the program.
54b8e80941Smrg       */
55b8e80941Smrg      fs_builder(backend_shader *shader,
56b8e80941Smrg                 unsigned dispatch_width) :
57b8e80941Smrg         shader(shader), block(NULL), cursor(NULL),
58b8e80941Smrg         _dispatch_width(dispatch_width),
59b8e80941Smrg         _group(0),
60b8e80941Smrg         force_writemask_all(false),
61b8e80941Smrg         annotation()
62b8e80941Smrg      {
63b8e80941Smrg      }
64b8e80941Smrg
65b8e80941Smrg      /**
66b8e80941Smrg       * Construct an fs_builder that inserts instructions into \p shader
67b8e80941Smrg       * before instruction \p inst in basic block \p block.  The default
68b8e80941Smrg       * execution controls and debug annotation are initialized from the
69b8e80941Smrg       * instruction passed as argument.
70b8e80941Smrg       */
71b8e80941Smrg      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
72b8e80941Smrg         shader(shader), block(block), cursor(inst),
73b8e80941Smrg         _dispatch_width(inst->exec_size),
74b8e80941Smrg         _group(inst->group),
75b8e80941Smrg         force_writemask_all(inst->force_writemask_all)
76b8e80941Smrg      {
77b8e80941Smrg         annotation.str = inst->annotation;
78b8e80941Smrg         annotation.ir = inst->ir;
79b8e80941Smrg      }
80b8e80941Smrg
81b8e80941Smrg      /**
82b8e80941Smrg       * Construct an fs_builder that inserts instructions before \p cursor in
83b8e80941Smrg       * basic block \p block, inheriting other code generation parameters
84b8e80941Smrg       * from this.
85b8e80941Smrg       */
86b8e80941Smrg      fs_builder
87b8e80941Smrg      at(bblock_t *block, exec_node *cursor) const
88b8e80941Smrg      {
89b8e80941Smrg         fs_builder bld = *this;
90b8e80941Smrg         bld.block = block;
91b8e80941Smrg         bld.cursor = cursor;
92b8e80941Smrg         return bld;
93b8e80941Smrg      }
94b8e80941Smrg
95b8e80941Smrg      /**
96b8e80941Smrg       * Construct an fs_builder appending instructions at the end of the
97b8e80941Smrg       * instruction list of the shader, inheriting other code generation
98b8e80941Smrg       * parameters from this.
99b8e80941Smrg       */
100b8e80941Smrg      fs_builder
101b8e80941Smrg      at_end() const
102b8e80941Smrg      {
103b8e80941Smrg         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
104b8e80941Smrg      }
105b8e80941Smrg
106b8e80941Smrg      /**
107b8e80941Smrg       * Construct a builder specifying the default SIMD width and group of
108b8e80941Smrg       * channel enable signals, inheriting other code generation parameters
109b8e80941Smrg       * from this.
110b8e80941Smrg       *
111b8e80941Smrg       * \p n gives the default SIMD width, \p i gives the slot group used for
112b8e80941Smrg       * predication and control flow masking in multiples of \p n channels.
113b8e80941Smrg       */
114b8e80941Smrg      fs_builder
115b8e80941Smrg      group(unsigned n, unsigned i) const
116b8e80941Smrg      {
117b8e80941Smrg         fs_builder bld = *this;
118b8e80941Smrg
119b8e80941Smrg         if (n <= dispatch_width() && i < dispatch_width() / n) {
120b8e80941Smrg            bld._group += i * n;
121b8e80941Smrg         } else {
122b8e80941Smrg            /* The requested channel group isn't a subset of the channel group
123b8e80941Smrg             * of this builder, which means that the resulting instructions
124b8e80941Smrg             * would use (potentially undefined) channel enable signals not
125b8e80941Smrg             * specified by the parent builder.  That's only valid if the
126b8e80941Smrg             * instruction doesn't have per-channel semantics, in which case
127b8e80941Smrg             * we should clear off the default group index in order to prevent
128b8e80941Smrg             * emitting instructions with channel group not aligned to their
129b8e80941Smrg             * own execution size.
130b8e80941Smrg             */
131b8e80941Smrg            assert(force_writemask_all);
132b8e80941Smrg            bld._group = 0;
133b8e80941Smrg         }
134b8e80941Smrg
135b8e80941Smrg         bld._dispatch_width = n;
136b8e80941Smrg         return bld;
137b8e80941Smrg      }
138b8e80941Smrg
139b8e80941Smrg      /**
140b8e80941Smrg       * Alias for group() with width equal to eight.
141b8e80941Smrg       */
142b8e80941Smrg      fs_builder
143b8e80941Smrg      half(unsigned i) const
144b8e80941Smrg      {
145b8e80941Smrg         return group(8, i);
146b8e80941Smrg      }
147b8e80941Smrg
148b8e80941Smrg      /**
149b8e80941Smrg       * Construct a builder with per-channel control flow execution masking
150b8e80941Smrg       * disabled if \p b is true.  If control flow execution masking is
151b8e80941Smrg       * already disabled this has no effect.
152b8e80941Smrg       */
153b8e80941Smrg      fs_builder
154b8e80941Smrg      exec_all(bool b = true) const
155b8e80941Smrg      {
156b8e80941Smrg         fs_builder bld = *this;
157b8e80941Smrg         if (b)
158b8e80941Smrg            bld.force_writemask_all = true;
159b8e80941Smrg         return bld;
160b8e80941Smrg      }
161b8e80941Smrg
162b8e80941Smrg      /**
163b8e80941Smrg       * Construct a builder with the given debug annotation info.
164b8e80941Smrg       */
165b8e80941Smrg      fs_builder
166b8e80941Smrg      annotate(const char *str, const void *ir = NULL) const
167b8e80941Smrg      {
168b8e80941Smrg         fs_builder bld = *this;
169b8e80941Smrg         bld.annotation.str = str;
170b8e80941Smrg         bld.annotation.ir = ir;
171b8e80941Smrg         return bld;
172b8e80941Smrg      }
173b8e80941Smrg
174b8e80941Smrg      /**
175b8e80941Smrg       * Get the SIMD width in use.
176b8e80941Smrg       */
177b8e80941Smrg      unsigned
178b8e80941Smrg      dispatch_width() const
179b8e80941Smrg      {
180b8e80941Smrg         return _dispatch_width;
181b8e80941Smrg      }
182b8e80941Smrg
183b8e80941Smrg      /**
184b8e80941Smrg       * Get the channel group in use.
185b8e80941Smrg       */
186b8e80941Smrg      unsigned
187b8e80941Smrg      group() const
188b8e80941Smrg      {
189b8e80941Smrg         return _group;
190b8e80941Smrg      }
191b8e80941Smrg
192b8e80941Smrg      /**
193b8e80941Smrg       * Allocate a virtual register of natural vector size (one for this IR)
194b8e80941Smrg       * and SIMD width.  \p n gives the amount of space to allocate in
195b8e80941Smrg       * dispatch_width units (which is just enough space for one logical
196b8e80941Smrg       * component in this IR).
197b8e80941Smrg       */
198b8e80941Smrg      dst_reg
199b8e80941Smrg      vgrf(enum brw_reg_type type, unsigned n = 1) const
200b8e80941Smrg      {
201b8e80941Smrg         assert(dispatch_width() <= 32);
202b8e80941Smrg
203b8e80941Smrg         if (n > 0)
204b8e80941Smrg            return dst_reg(VGRF, shader->alloc.allocate(
205b8e80941Smrg                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
206b8e80941Smrg                                           REG_SIZE)),
207b8e80941Smrg                           type);
208b8e80941Smrg         else
209b8e80941Smrg            return retype(null_reg_ud(), type);
210b8e80941Smrg      }
211b8e80941Smrg
212b8e80941Smrg      /**
213b8e80941Smrg       * Create a null register of floating type.
214b8e80941Smrg       */
215b8e80941Smrg      dst_reg
216b8e80941Smrg      null_reg_f() const
217b8e80941Smrg      {
218b8e80941Smrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
219b8e80941Smrg      }
220b8e80941Smrg
221b8e80941Smrg      dst_reg
222b8e80941Smrg      null_reg_df() const
223b8e80941Smrg      {
224b8e80941Smrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
225b8e80941Smrg      }
226b8e80941Smrg
227b8e80941Smrg      /**
228b8e80941Smrg       * Create a null register of signed integer type.
229b8e80941Smrg       */
230b8e80941Smrg      dst_reg
231b8e80941Smrg      null_reg_d() const
232b8e80941Smrg      {
233b8e80941Smrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
234b8e80941Smrg      }
235b8e80941Smrg
236b8e80941Smrg      /**
237b8e80941Smrg       * Create a null register of unsigned integer type.
238b8e80941Smrg       */
239b8e80941Smrg      dst_reg
240b8e80941Smrg      null_reg_ud() const
241b8e80941Smrg      {
242b8e80941Smrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
243b8e80941Smrg      }
244b8e80941Smrg
245b8e80941Smrg      /**
246b8e80941Smrg       * Get the mask of SIMD channels enabled by dispatch and not yet
247b8e80941Smrg       * disabled by discard.
248b8e80941Smrg       */
249b8e80941Smrg      src_reg
250b8e80941Smrg      sample_mask_reg() const
251b8e80941Smrg      {
252b8e80941Smrg         if (shader->stage != MESA_SHADER_FRAGMENT) {
253b8e80941Smrg            return brw_imm_d(0xffffffff);
254b8e80941Smrg         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
255b8e80941Smrg            return brw_flag_reg(0, 1);
256b8e80941Smrg         } else {
257b8e80941Smrg            assert(shader->devinfo->gen >= 6 && dispatch_width() <= 16);
258b8e80941Smrg            return retype(brw_vec1_grf((_group >= 16 ? 2 : 1), 7),
259b8e80941Smrg                          BRW_REGISTER_TYPE_UD);
260b8e80941Smrg         }
261b8e80941Smrg      }
262b8e80941Smrg
263b8e80941Smrg      /**
264b8e80941Smrg       * Insert an instruction into the program.
265b8e80941Smrg       */
266b8e80941Smrg      instruction *
267b8e80941Smrg      emit(const instruction &inst) const
268b8e80941Smrg      {
269b8e80941Smrg         return emit(new(shader->mem_ctx) instruction(inst));
270b8e80941Smrg      }
271b8e80941Smrg
272b8e80941Smrg      /**
273b8e80941Smrg       * Create and insert a nullary control instruction into the program.
274b8e80941Smrg       */
275b8e80941Smrg      instruction *
276b8e80941Smrg      emit(enum opcode opcode) const
277b8e80941Smrg      {
278b8e80941Smrg         return emit(instruction(opcode, dispatch_width()));
279b8e80941Smrg      }
280b8e80941Smrg
281b8e80941Smrg      /**
282b8e80941Smrg       * Create and insert a nullary instruction into the program.
283b8e80941Smrg       */
284b8e80941Smrg      instruction *
285b8e80941Smrg      emit(enum opcode opcode, const dst_reg &dst) const
286b8e80941Smrg      {
287b8e80941Smrg         return emit(instruction(opcode, dispatch_width(), dst));
288b8e80941Smrg      }
289b8e80941Smrg
290b8e80941Smrg      /**
291b8e80941Smrg       * Create and insert a unary instruction into the program.
292b8e80941Smrg       */
293b8e80941Smrg      instruction *
294b8e80941Smrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
295b8e80941Smrg      {
296b8e80941Smrg         switch (opcode) {
297b8e80941Smrg         case SHADER_OPCODE_RCP:
298b8e80941Smrg         case SHADER_OPCODE_RSQ:
299b8e80941Smrg         case SHADER_OPCODE_SQRT:
300b8e80941Smrg         case SHADER_OPCODE_EXP2:
301b8e80941Smrg         case SHADER_OPCODE_LOG2:
302b8e80941Smrg         case SHADER_OPCODE_SIN:
303b8e80941Smrg         case SHADER_OPCODE_COS:
304b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst,
305b8e80941Smrg                                    fix_math_operand(src0)));
306b8e80941Smrg
307b8e80941Smrg         default:
308b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst, src0));
309b8e80941Smrg         }
310b8e80941Smrg      }
311b8e80941Smrg
312b8e80941Smrg      /**
313b8e80941Smrg       * Create and insert a binary instruction into the program.
314b8e80941Smrg       */
315b8e80941Smrg      instruction *
316b8e80941Smrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
317b8e80941Smrg           const src_reg &src1) const
318b8e80941Smrg      {
319b8e80941Smrg         switch (opcode) {
320b8e80941Smrg         case SHADER_OPCODE_POW:
321b8e80941Smrg         case SHADER_OPCODE_INT_QUOTIENT:
322b8e80941Smrg         case SHADER_OPCODE_INT_REMAINDER:
323b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst,
324b8e80941Smrg                                    fix_math_operand(src0),
325b8e80941Smrg                                    fix_math_operand(fix_byte_src(src1))));
326b8e80941Smrg
327b8e80941Smrg         default:
328b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst,
329b8e80941Smrg                                    src0, fix_byte_src(src1)));
330b8e80941Smrg
331b8e80941Smrg         }
332b8e80941Smrg      }
333b8e80941Smrg
334b8e80941Smrg      /**
335b8e80941Smrg       * Create and insert a ternary instruction into the program.
336b8e80941Smrg       */
337b8e80941Smrg      instruction *
338b8e80941Smrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
339b8e80941Smrg           const src_reg &src1, const src_reg &src2) const
340b8e80941Smrg      {
341b8e80941Smrg         switch (opcode) {
342b8e80941Smrg         case BRW_OPCODE_BFE:
343b8e80941Smrg         case BRW_OPCODE_BFI2:
344b8e80941Smrg         case BRW_OPCODE_MAD:
345b8e80941Smrg         case BRW_OPCODE_LRP:
346b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst,
347b8e80941Smrg                                    fix_3src_operand(src0),
348b8e80941Smrg                                    fix_3src_operand(fix_byte_src(src1)),
349b8e80941Smrg                                    fix_3src_operand(fix_byte_src(src2))));
350b8e80941Smrg
351b8e80941Smrg         default:
352b8e80941Smrg            return emit(instruction(opcode, dispatch_width(), dst,
353b8e80941Smrg                                    src0, fix_byte_src(src1), fix_byte_src(src2)));
354b8e80941Smrg         }
355b8e80941Smrg      }
356b8e80941Smrg
357b8e80941Smrg      /**
358b8e80941Smrg       * Create and insert an instruction with a variable number of sources
359b8e80941Smrg       * into the program.
360b8e80941Smrg       */
361b8e80941Smrg      instruction *
362b8e80941Smrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
363b8e80941Smrg           unsigned n) const
364b8e80941Smrg      {
365b8e80941Smrg         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
366b8e80941Smrg      }
367b8e80941Smrg
368b8e80941Smrg      /**
369b8e80941Smrg       * Insert a preallocated instruction into the program.
370b8e80941Smrg       */
371b8e80941Smrg      instruction *
372b8e80941Smrg      emit(instruction *inst) const
373b8e80941Smrg      {
374b8e80941Smrg         assert(inst->exec_size <= 32);
375b8e80941Smrg         assert(inst->exec_size == dispatch_width() ||
376b8e80941Smrg                force_writemask_all);
377b8e80941Smrg
378b8e80941Smrg         inst->group = _group;
379b8e80941Smrg         inst->force_writemask_all = force_writemask_all;
380b8e80941Smrg         inst->annotation = annotation.str;
381b8e80941Smrg         inst->ir = annotation.ir;
382b8e80941Smrg
383b8e80941Smrg         if (block)
384b8e80941Smrg            static_cast<instruction *>(cursor)->insert_before(block, inst);
385b8e80941Smrg         else
386b8e80941Smrg            cursor->insert_before(inst);
387b8e80941Smrg
388b8e80941Smrg         return inst;
389b8e80941Smrg      }
390b8e80941Smrg
391b8e80941Smrg      /**
392b8e80941Smrg       * Select \p src0 if the comparison of both sources with the given
393b8e80941Smrg       * conditional mod evaluates to true, otherwise select \p src1.
394b8e80941Smrg       *
395b8e80941Smrg       * Generally useful to get the minimum or maximum of two values.
396b8e80941Smrg       */
397b8e80941Smrg      instruction *
398b8e80941Smrg      emit_minmax(const dst_reg &dst, const src_reg &src0,
399b8e80941Smrg                  const src_reg &src1, brw_conditional_mod mod) const
400b8e80941Smrg      {
401b8e80941Smrg         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
402b8e80941Smrg
403b8e80941Smrg         /* In some cases we can't have bytes as operand for src1, so use the
404b8e80941Smrg          * same type for both operand.
405b8e80941Smrg          */
406b8e80941Smrg         return set_condmod(mod, SEL(dst, fix_unsigned_negate(fix_byte_src(src0)),
407b8e80941Smrg                                     fix_unsigned_negate(fix_byte_src(src1))));
408b8e80941Smrg      }
409b8e80941Smrg
410b8e80941Smrg      /**
411b8e80941Smrg       * Copy any live channel from \p src to the first channel of the result.
412b8e80941Smrg       */
413b8e80941Smrg      src_reg
414b8e80941Smrg      emit_uniformize(const src_reg &src) const
415b8e80941Smrg      {
416b8e80941Smrg         /* FIXME: We use a vector chan_index and dst to allow constant and
417b8e80941Smrg          * copy propagration to move result all the way into the consuming
418b8e80941Smrg          * instruction (typically a surface index or sampler index for a
419b8e80941Smrg          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
420b8e80941Smrg          * dispatch. Once we teach const/copy propagation about scalars we
421b8e80941Smrg          * should go back to scalar destinations here.
422b8e80941Smrg          */
423b8e80941Smrg         const fs_builder ubld = exec_all();
424b8e80941Smrg         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
425b8e80941Smrg         const dst_reg dst = vgrf(src.type);
426b8e80941Smrg
427b8e80941Smrg         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)->flag_subreg = 2;
428b8e80941Smrg         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
429b8e80941Smrg
430b8e80941Smrg         return src_reg(component(dst, 0));
431b8e80941Smrg      }
432b8e80941Smrg
433b8e80941Smrg      src_reg
434b8e80941Smrg      move_to_vgrf(const src_reg &src, unsigned num_components) const
435b8e80941Smrg      {
436b8e80941Smrg         src_reg *const src_comps = new src_reg[num_components];
437b8e80941Smrg         for (unsigned i = 0; i < num_components; i++)
438b8e80941Smrg            src_comps[i] = offset(src, dispatch_width(), i);
439b8e80941Smrg
440b8e80941Smrg         const dst_reg dst = vgrf(src.type, num_components);
441b8e80941Smrg         LOAD_PAYLOAD(dst, src_comps, num_components, 0);
442b8e80941Smrg
443b8e80941Smrg         delete[] src_comps;
444b8e80941Smrg
445b8e80941Smrg         return src_reg(dst);
446b8e80941Smrg      }
447b8e80941Smrg
      /**
       * Combine the channels of \p tmp in place using \p opcode under
       * conditional mod \p mod, within clusters of \p cluster_size channels.
       *
       * NOTE(review): the access pattern (each step folds the last element
       * of the preceding sub-cluster into the following elements) looks like
       * an inclusive scan building block — confirm against the callers.
       */
      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            /* Recurse on each half at half the width, then combine the last
             * channel of the left half into the whole right half if the
             * cluster spans both halves.
             */
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               src_reg left_comp = component(left, half_width - 1);
               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
            }
            return;
         }

         if (cluster_size > 1) {
            /* Fold each even channel into the following odd channel
             * (stride-2 regions select the two interleaved sets).
             */
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            const dst_reg left = horiz_stride(tmp, 2);
            const dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               /* Fold channel 1 of each 4-channel cluster into channels 2
                * and 3 of the same cluster.
                */
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);

               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               right = horiz_stride(horiz_offset(tmp, 3), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);

               for (unsigned i = 0; i < dispatch_width(); i += 4) {
                  src_reg left = component(tmp, i + 1);
                  dst_reg right = horiz_offset(tmp, i + 2);
                  set_condmod(mod, ubld.emit(opcode, right, left, right));
               }
            }
         }

         if (cluster_size > 4) {
            /* Fold channel 3 into channels 4-7, once per 8-channel group. */
            const fs_builder ubld = exec_all().group(4, 0);
            src_reg left = component(tmp, 3);
            dst_reg right = horiz_offset(tmp, 4);
            set_condmod(mod, ubld.emit(opcode, right, left, right));

            if (dispatch_width() > 8) {
               left = component(tmp, 8 + 3);
               right = horiz_offset(tmp, 8 + 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }
         }

         if (cluster_size > 8 && dispatch_width() > 8) {
            /* Fold channel 7 into channels 8-15. */
            const fs_builder ubld = exec_all().group(8, 0);
            src_reg left = component(tmp, 7);
            dst_reg right = horiz_offset(tmp, 8);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }
      }
526b8e80941Smrg
      /**
       * Assorted arithmetic ops.
       * @{
       */
      /* Define a member emitting the unary instruction BRW_OPCODE_<op>. */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

      /* Define a member emitting the binary instruction BRW_OPCODE_<op>. */
#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

      /* Like ALU2, but the emitted instruction also writes the accumulator. */
#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

      /* Define a member emitting the ternary instruction BRW_OPCODE_<op>. */
#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      /* One emitter member per hardware opcode. */
      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */
610b8e80941Smrg
611b8e80941Smrg      /**
612b8e80941Smrg       * CMP: Sets the low bit of the destination channels with the result
613b8e80941Smrg       * of the comparison, while the upper bits are undefined, and updates
614b8e80941Smrg       * the flag register with the packed 16 bits of the result.
615b8e80941Smrg       */
616b8e80941Smrg      instruction *
617b8e80941Smrg      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
618b8e80941Smrg          brw_conditional_mod condition) const
619b8e80941Smrg      {
620b8e80941Smrg         /* Take the instruction:
621b8e80941Smrg          *
622b8e80941Smrg          * CMP null<d> src0<f> src1<f>
623b8e80941Smrg          *
624b8e80941Smrg          * Original gen4 does type conversion to the destination type
625b8e80941Smrg          * before comparison, producing garbage results for floating
626b8e80941Smrg          * point comparisons.
627b8e80941Smrg          *
628b8e80941Smrg          * The destination type doesn't matter on newer generations,
629b8e80941Smrg          * so we set the type to match src0 so we can compact the
630b8e80941Smrg          * instruction.
631b8e80941Smrg          */
632b8e80941Smrg         return set_condmod(condition,
633b8e80941Smrg                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
634b8e80941Smrg                                 fix_unsigned_negate(src0),
635b8e80941Smrg                                 fix_unsigned_negate(src1)));
636b8e80941Smrg      }
637b8e80941Smrg
638b8e80941Smrg      /**
639b8e80941Smrg       * Gen4 predicated IF.
640b8e80941Smrg       */
641b8e80941Smrg      instruction *
642b8e80941Smrg      IF(brw_predicate predicate) const
643b8e80941Smrg      {
644b8e80941Smrg         return set_predicate(predicate, emit(BRW_OPCODE_IF));
645b8e80941Smrg      }
646b8e80941Smrg
647b8e80941Smrg      /**
648b8e80941Smrg       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
649b8e80941Smrg       */
650b8e80941Smrg      instruction *
651b8e80941Smrg      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
652b8e80941Smrg           const src_reg &src2, brw_conditional_mod condition) const
653b8e80941Smrg      {
654b8e80941Smrg         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
655b8e80941Smrg          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
656b8e80941Smrg          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
657b8e80941Smrg          */
658b8e80941Smrg         assert(src2.type == BRW_REGISTER_TYPE_F);
659b8e80941Smrg
660b8e80941Smrg         return set_condmod(condition,
661b8e80941Smrg                            emit(BRW_OPCODE_CSEL,
662b8e80941Smrg                                 retype(dst, BRW_REGISTER_TYPE_F),
663b8e80941Smrg                                 retype(src0, BRW_REGISTER_TYPE_F),
664b8e80941Smrg                                 retype(fix_byte_src(src1), BRW_REGISTER_TYPE_F),
665b8e80941Smrg                                 fix_byte_src(src2)));
666b8e80941Smrg      }
667b8e80941Smrg
668b8e80941Smrg      /**
669b8e80941Smrg       * Emit a linear interpolation instruction.
670b8e80941Smrg       */
671b8e80941Smrg      instruction *
672b8e80941Smrg      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
673b8e80941Smrg          const src_reg &a) const
674b8e80941Smrg      {
675b8e80941Smrg         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
676b8e80941Smrg            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
677b8e80941Smrg             * we need to reorder the operands.
678b8e80941Smrg             */
679b8e80941Smrg            return emit(BRW_OPCODE_LRP, dst, a, y, x);
680b8e80941Smrg
681b8e80941Smrg         } else {
682b8e80941Smrg            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
683b8e80941Smrg            const dst_reg y_times_a = vgrf(dst.type);
684b8e80941Smrg            const dst_reg one_minus_a = vgrf(dst.type);
685b8e80941Smrg            const dst_reg x_times_one_minus_a = vgrf(dst.type);
686b8e80941Smrg
687b8e80941Smrg            MUL(y_times_a, y, a);
688b8e80941Smrg            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
689b8e80941Smrg            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
690b8e80941Smrg            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
691b8e80941Smrg         }
692b8e80941Smrg      }
693b8e80941Smrg
694b8e80941Smrg      /**
695b8e80941Smrg       * Collect a number of registers in a contiguous range of registers.
696b8e80941Smrg       */
697b8e80941Smrg      instruction *
698b8e80941Smrg      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
699b8e80941Smrg                   unsigned sources, unsigned header_size) const
700b8e80941Smrg      {
701b8e80941Smrg         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
702b8e80941Smrg         inst->header_size = header_size;
703b8e80941Smrg         inst->size_written = header_size * REG_SIZE;
704b8e80941Smrg         for (unsigned i = header_size; i < sources; i++) {
705b8e80941Smrg            inst->size_written +=
706b8e80941Smrg               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
707b8e80941Smrg                     REG_SIZE);
708b8e80941Smrg         }
709b8e80941Smrg
710b8e80941Smrg         return inst;
711b8e80941Smrg      }
712b8e80941Smrg
713b8e80941Smrg      backend_shader *shader;
714b8e80941Smrg
715b8e80941Smrg      /**
716b8e80941Smrg       * Byte sized operands are not supported for src1 on Gen11+.
717b8e80941Smrg       */
718b8e80941Smrg      src_reg
719b8e80941Smrg      fix_byte_src(const src_reg &src) const
720b8e80941Smrg      {
721b8e80941Smrg         if ((shader->devinfo->gen < 11 && !shader->devinfo->is_geminilake) ||
722b8e80941Smrg             type_sz(src.type) != 1)
723b8e80941Smrg            return src;
724b8e80941Smrg
725b8e80941Smrg         dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ?
726b8e80941Smrg                             BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D);
727b8e80941Smrg         MOV(temp, src);
728b8e80941Smrg         return src_reg(temp);
729b8e80941Smrg      }
730b8e80941Smrg
731b8e80941Smrg   private:
732b8e80941Smrg      /**
733b8e80941Smrg       * Workaround for negation of UD registers.  See comment in
734b8e80941Smrg       * fs_generator::generate_code() for more details.
735b8e80941Smrg       */
736b8e80941Smrg      src_reg
737b8e80941Smrg      fix_unsigned_negate(const src_reg &src) const
738b8e80941Smrg      {
739b8e80941Smrg         if (src.type == BRW_REGISTER_TYPE_UD &&
740b8e80941Smrg             src.negate) {
741b8e80941Smrg            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
742b8e80941Smrg            MOV(temp, src);
743b8e80941Smrg            return src_reg(temp);
744b8e80941Smrg         } else {
745b8e80941Smrg            return src;
746b8e80941Smrg         }
747b8e80941Smrg      }
748b8e80941Smrg
749b8e80941Smrg      /**
750b8e80941Smrg       * Workaround for source register modes not supported by the ternary
751b8e80941Smrg       * instruction encoding.
752b8e80941Smrg       */
753b8e80941Smrg      src_reg
754b8e80941Smrg      fix_3src_operand(const src_reg &src) const
755b8e80941Smrg      {
756b8e80941Smrg         switch (src.file) {
757b8e80941Smrg         case FIXED_GRF:
758b8e80941Smrg            /* FINISHME: Could handle scalar region, other stride=1 regions */
759b8e80941Smrg            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
760b8e80941Smrg                src.width != BRW_WIDTH_8 ||
761b8e80941Smrg                src.hstride != BRW_HORIZONTAL_STRIDE_1)
762b8e80941Smrg               break;
763b8e80941Smrg            /* fallthrough */
764b8e80941Smrg         case ATTR:
765b8e80941Smrg         case VGRF:
766b8e80941Smrg         case UNIFORM:
767b8e80941Smrg         case IMM:
768b8e80941Smrg            return src;
769b8e80941Smrg         default:
770b8e80941Smrg            break;
771b8e80941Smrg         }
772b8e80941Smrg
773b8e80941Smrg         dst_reg expanded = vgrf(src.type);
774b8e80941Smrg         MOV(expanded, src);
775b8e80941Smrg         return expanded;
776b8e80941Smrg      }
777b8e80941Smrg
778b8e80941Smrg      /**
779b8e80941Smrg       * Workaround for source register modes not supported by the math
780b8e80941Smrg       * instruction.
781b8e80941Smrg       */
782b8e80941Smrg      src_reg
783b8e80941Smrg      fix_math_operand(const src_reg &src) const
784b8e80941Smrg      {
785b8e80941Smrg         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
786b8e80941Smrg          * might be able to do better by doing execsize = 1 math and then
787b8e80941Smrg          * expanding that result out, but we would need to be careful with
788b8e80941Smrg          * masking.
789b8e80941Smrg          *
790b8e80941Smrg          * Gen6 hardware ignores source modifiers (negate and abs) on math
791b8e80941Smrg          * instructions, so we also move to a temp to set those up.
792b8e80941Smrg          *
793b8e80941Smrg          * Gen7 relaxes most of the above restrictions, but still can't use IMM
794b8e80941Smrg          * operands to math
795b8e80941Smrg          */
796b8e80941Smrg         if ((shader->devinfo->gen == 6 &&
797b8e80941Smrg              (src.file == IMM || src.file == UNIFORM ||
798b8e80941Smrg               src.abs || src.negate)) ||
799b8e80941Smrg             (shader->devinfo->gen == 7 && src.file == IMM)) {
800b8e80941Smrg            const dst_reg tmp = vgrf(src.type);
801b8e80941Smrg            MOV(tmp, src);
802b8e80941Smrg            return tmp;
803b8e80941Smrg         } else {
804b8e80941Smrg            return src;
805b8e80941Smrg         }
806b8e80941Smrg      }
807b8e80941Smrg
808b8e80941Smrg      bblock_t *block;
809b8e80941Smrg      exec_node *cursor;
810b8e80941Smrg
811b8e80941Smrg      unsigned _dispatch_width;
812b8e80941Smrg      unsigned _group;
813b8e80941Smrg      bool force_writemask_all;
814b8e80941Smrg
815b8e80941Smrg      /** Debug annotation info. */
816b8e80941Smrg      struct {
817b8e80941Smrg         const char *str;
818b8e80941Smrg         const void *ir;
819b8e80941Smrg      } annotation;
820b8e80941Smrg   };
821b8e80941Smrg}
822b8e80941Smrg
823b8e80941Smrg#endif
824