intel/compiler/brw_fs_builder.h

01e04c3fSmrg/* -*- c++ -*- */
01e04c3fSmrg/*
01e04c3fSmrg * Copyright © 2010-2015 Intel Corporation
01e04c3fSmrg *
01e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a
01e04c3fSmrg * copy of this software and associated documentation files (the "Software"),
01e04c3fSmrg * to deal in the Software without restriction, including without limitation
01e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
01e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the
01e04c3fSmrg * Software is furnished to do so, subject to the following conditions:
01e04c3fSmrg *
01e04c3fSmrg * The above copyright notice and this permission notice (including the next
01e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the
01e04c3fSmrg * Software.
01e04c3fSmrg *
01e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
01e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
01e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
01e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
01e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
01e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
01e04c3fSmrg * IN THE SOFTWARE.
01e04c3fSmrg */
01e04c3fSmrg
01e04c3fSmrg#ifndef BRW_FS_BUILDER_H
01e04c3fSmrg#define BRW_FS_BUILDER_H
01e04c3fSmrg
01e04c3fSmrg#include "brw_ir_fs.h"
01e04c3fSmrg#include "brw_shader.h"
01e04c3fSmrg
01e04c3fSmrgnamespace brw {
01e04c3fSmrg   /**
01e04c3fSmrg    * Toolbox to assemble an FS IR program out of individual instructions.
01e04c3fSmrg    *
01e04c3fSmrg    * This object is meant to have an interface consistent with
01e04c3fSmrg    * brw::vec4_builder.  They cannot be fully interchangeable because
01e04c3fSmrg    * brw::fs_builder generates scalar code while brw::vec4_builder generates
01e04c3fSmrg    * vector code.
01e04c3fSmrg    */
01e04c3fSmrg   class fs_builder {
01e04c3fSmrg   public:
01e04c3fSmrg      /** Type used in this IR to represent a source of an instruction. */
01e04c3fSmrg      typedef fs_reg src_reg;
01e04c3fSmrg
01e04c3fSmrg      /** Type used in this IR to represent the destination of an instruction. */
01e04c3fSmrg      typedef fs_reg dst_reg;
01e04c3fSmrg
01e04c3fSmrg      /** Type used in this IR to represent an instruction. */
01e04c3fSmrg      typedef fs_inst instruction;
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Construct an fs_builder that inserts instructions into \p shader.
01e04c3fSmrg       * \p dispatch_width gives the native execution width of the program.
01e04c3fSmrg       */
01e04c3fSmrg      fs_builder(backend_shader *shader,
01e04c3fSmrg                 unsigned dispatch_width) :
01e04c3fSmrg         shader(shader), block(NULL), cursor(NULL),
01e04c3fSmrg         _dispatch_width(dispatch_width),
01e04c3fSmrg         _group(0),
01e04c3fSmrg         force_writemask_all(false),
01e04c3fSmrg         annotation()
01e04c3fSmrg      {
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Construct an fs_builder that inserts instructions into \p shader
01e04c3fSmrg       * before instruction \p inst in basic block \p block.  The default
01e04c3fSmrg       * execution controls and debug annotation are initialized from the
01e04c3fSmrg       * instruction passed as argument.
01e04c3fSmrg       */
01e04c3fSmrg      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
01e04c3fSmrg         shader(shader), block(block), cursor(inst),
01e04c3fSmrg         _dispatch_width(inst->exec_size),
01e04c3fSmrg         _group(inst->group),
01e04c3fSmrg         force_writemask_all(inst->force_writemask_all)
01e04c3fSmrg      {
01e04c3fSmrg         annotation.str = inst->annotation;
01e04c3fSmrg         annotation.ir = inst->ir;
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Construct an fs_builder that inserts instructions before \p cursor in
01e04c3fSmrg       * basic block \p block, inheriting other code generation parameters
01e04c3fSmrg       * from this.
01e04c3fSmrg       */
01e04c3fSmrg      fs_builder
01e04c3fSmrg      at(bblock_t *block, exec_node *cursor) const
01e04c3fSmrg      {
01e04c3fSmrg         fs_builder bld = *this;
01e04c3fSmrg         bld.block = block;
01e04c3fSmrg         bld.cursor = cursor;
01e04c3fSmrg         return bld;
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Construct an fs_builder appending instructions at the end of the
01e04c3fSmrg       * instruction list of the shader, inheriting other code generation
01e04c3fSmrg       * parameters from this.
01e04c3fSmrg       */
01e04c3fSmrg      fs_builder
01e04c3fSmrg      at_end() const
01e04c3fSmrg      {
01e04c3fSmrg         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Construct a builder specifying the default SIMD width and group of
01e04c3fSmrg       * channel enable signals, inheriting other code generation parameters
01e04c3fSmrg       * from this.
01e04c3fSmrg       *
01e04c3fSmrg       * \p n gives the default SIMD width, \p i gives the slot group used for
01e04c3fSmrg       * predication and control flow masking in multiples of \p n channels.
01e04c3fSmrg       */
01e04c3fSmrg      fs_builder
01e04c3fSmrg      group(unsigned n, unsigned i) const
01e04c3fSmrg      {
01e04c3fSmrg         fs_builder bld = *this;
9f464c52Smaya
9f464c52Smaya         if (n <= dispatch_width() && i < dispatch_width() / n) {
9f464c52Smaya            bld._group += i * n;
9f464c52Smaya         } else {
9f464c52Smaya            /* The requested channel group isn't a subset of the channel group
9f464c52Smaya             * of this builder, which means that the resulting instructions
9f464c52Smaya             * would use (potentially undefined) channel enable signals not
9f464c52Smaya             * specified by the parent builder.  That's only valid if the
9f464c52Smaya             * instruction doesn't have per-channel semantics, in which case
9f464c52Smaya             * we should clear off the default group index in order to prevent
9f464c52Smaya             * emitting instructions with channel group not aligned to their
9f464c52Smaya             * own execution size.
9f464c52Smaya             */
9f464c52Smaya            assert(force_writemask_all);
9f464c52Smaya            bld._group = 0;
9f464c52Smaya         }
9f464c52Smaya
01e04c3fSmrg         bld._dispatch_width = n;
01e04c3fSmrg         return bld;
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Alias for group() with width equal to eight.
01e04c3fSmrg       */
01e04c3fSmrg      fs_builder
7ec681f3Smrg      quarter(unsigned i) const
01e04c3fSmrg      {
01e04c3fSmrg         return group(8, i);
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Construct a builder with per-channel control flow execution masking
01e04c3fSmrg       * disabled if \p b is true.  If control flow execution masking is
01e04c3fSmrg       * already disabled this has no effect.
01e04c3fSmrg       */
01e04c3fSmrg      fs_builder
01e04c3fSmrg      exec_all(bool b = true) const
01e04c3fSmrg      {
01e04c3fSmrg         fs_builder bld = *this;
01e04c3fSmrg         if (b)
01e04c3fSmrg            bld.force_writemask_all = true;
01e04c3fSmrg         return bld;
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Construct a builder with the given debug annotation info.
01e04c3fSmrg       */
01e04c3fSmrg      fs_builder
01e04c3fSmrg      annotate(const char *str, const void *ir = NULL) const
01e04c3fSmrg      {
01e04c3fSmrg         fs_builder bld = *this;
01e04c3fSmrg         bld.annotation.str = str;
01e04c3fSmrg         bld.annotation.ir = ir;
01e04c3fSmrg         return bld;
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Get the SIMD width in use.
01e04c3fSmrg       */
01e04c3fSmrg      unsigned
01e04c3fSmrg      dispatch_width() const
01e04c3fSmrg      {
01e04c3fSmrg         return _dispatch_width;
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Get the channel group in use.
01e04c3fSmrg       */
01e04c3fSmrg      unsigned
01e04c3fSmrg      group() const
01e04c3fSmrg      {
01e04c3fSmrg         return _group;
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Allocate a virtual register of natural vector size (one for this IR)
01e04c3fSmrg       * and SIMD width.  \p n gives the amount of space to allocate in
01e04c3fSmrg       * dispatch_width units (which is just enough space for one logical
01e04c3fSmrg       * component in this IR).
01e04c3fSmrg       */
01e04c3fSmrg      dst_reg
01e04c3fSmrg      vgrf(enum brw_reg_type type, unsigned n = 1) const
01e04c3fSmrg      {
01e04c3fSmrg         assert(dispatch_width() <= 32);
01e04c3fSmrg
01e04c3fSmrg         if (n > 0)
01e04c3fSmrg            return dst_reg(VGRF, shader->alloc.allocate(
01e04c3fSmrg                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
01e04c3fSmrg                                           REG_SIZE)),
01e04c3fSmrg                           type);
01e04c3fSmrg         else
01e04c3fSmrg            return retype(null_reg_ud(), type);
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Create a null register of floating type.
01e04c3fSmrg       */
01e04c3fSmrg      dst_reg
01e04c3fSmrg      null_reg_f() const
01e04c3fSmrg      {
01e04c3fSmrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      dst_reg
01e04c3fSmrg      null_reg_df() const
01e04c3fSmrg      {
01e04c3fSmrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Create a null register of signed integer type.
01e04c3fSmrg       */
01e04c3fSmrg      dst_reg
01e04c3fSmrg      null_reg_d() const
01e04c3fSmrg      {
01e04c3fSmrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Create a null register of unsigned integer type.
01e04c3fSmrg       */
01e04c3fSmrg      dst_reg
01e04c3fSmrg      null_reg_ud() const
01e04c3fSmrg      {
01e04c3fSmrg         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Insert an instruction into the program.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      emit(const instruction &inst) const
01e04c3fSmrg      {
01e04c3fSmrg         return emit(new(shader->mem_ctx) instruction(inst));
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Create and insert a nullary control instruction into the program.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      emit(enum opcode opcode) const
01e04c3fSmrg      {
01e04c3fSmrg         return emit(instruction(opcode, dispatch_width()));
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Create and insert a nullary instruction into the program.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      emit(enum opcode opcode, const dst_reg &dst) const
01e04c3fSmrg      {
01e04c3fSmrg         return emit(instruction(opcode, dispatch_width(), dst));
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Create and insert a unary instruction into the program.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
01e04c3fSmrg      {
01e04c3fSmrg         switch (opcode) {
01e04c3fSmrg         case SHADER_OPCODE_RCP:
01e04c3fSmrg         case SHADER_OPCODE_RSQ:
01e04c3fSmrg         case SHADER_OPCODE_SQRT:
01e04c3fSmrg         case SHADER_OPCODE_EXP2:
01e04c3fSmrg         case SHADER_OPCODE_LOG2:
01e04c3fSmrg         case SHADER_OPCODE_SIN:
01e04c3fSmrg         case SHADER_OPCODE_COS:
01e04c3fSmrg            return emit(instruction(opcode, dispatch_width(), dst,
01e04c3fSmrg                                    fix_math_operand(src0)));
01e04c3fSmrg
01e04c3fSmrg         default:
01e04c3fSmrg            return emit(instruction(opcode, dispatch_width(), dst, src0));
01e04c3fSmrg         }
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Create and insert a binary instruction into the program.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
01e04c3fSmrg           const src_reg &src1) const
01e04c3fSmrg      {
01e04c3fSmrg         switch (opcode) {
01e04c3fSmrg         case SHADER_OPCODE_POW:
01e04c3fSmrg         case SHADER_OPCODE_INT_QUOTIENT:
01e04c3fSmrg         case SHADER_OPCODE_INT_REMAINDER:
01e04c3fSmrg            return emit(instruction(opcode, dispatch_width(), dst,
01e04c3fSmrg                                    fix_math_operand(src0),
7ec681f3Smrg                                    fix_math_operand(src1)));
01e04c3fSmrg
01e04c3fSmrg         default:
9f464c52Smaya            return emit(instruction(opcode, dispatch_width(), dst,
7ec681f3Smrg                                    src0, src1));
01e04c3fSmrg
01e04c3fSmrg         }
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Create and insert a ternary instruction into the program.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
01e04c3fSmrg           const src_reg &src1, const src_reg &src2) const
01e04c3fSmrg      {
01e04c3fSmrg         switch (opcode) {
01e04c3fSmrg         case BRW_OPCODE_BFE:
01e04c3fSmrg         case BRW_OPCODE_BFI2:
01e04c3fSmrg         case BRW_OPCODE_MAD:
01e04c3fSmrg         case BRW_OPCODE_LRP:
01e04c3fSmrg            return emit(instruction(opcode, dispatch_width(), dst,
01e04c3fSmrg                                    fix_3src_operand(src0),
7ec681f3Smrg                                    fix_3src_operand(src1),
7ec681f3Smrg                                    fix_3src_operand(src2)));
01e04c3fSmrg
01e04c3fSmrg         default:
01e04c3fSmrg            return emit(instruction(opcode, dispatch_width(), dst,
7ec681f3Smrg                                    src0, src1, src2));
01e04c3fSmrg         }
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Create and insert an instruction with a variable number of sources
01e04c3fSmrg       * into the program.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
01e04c3fSmrg           unsigned n) const
01e04c3fSmrg      {
7ec681f3Smrg         /* Use the emit() methods for specific operand counts to ensure that
7ec681f3Smrg          * opcode-specific operand fixups occur.
7ec681f3Smrg          */
7ec681f3Smrg         if (n == 2) {
7ec681f3Smrg            return emit(opcode, dst, srcs[0], srcs[1]);
7ec681f3Smrg         } else if (n == 3) {
7ec681f3Smrg            return emit(opcode, dst, srcs[0], srcs[1], srcs[2]);
7ec681f3Smrg         } else {
7ec681f3Smrg            return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
7ec681f3Smrg         }
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Insert a preallocated instruction into the program.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      emit(instruction *inst) const
01e04c3fSmrg      {
01e04c3fSmrg         assert(inst->exec_size <= 32);
01e04c3fSmrg         assert(inst->exec_size == dispatch_width() ||
01e04c3fSmrg                force_writemask_all);
01e04c3fSmrg
01e04c3fSmrg         inst->group = _group;
01e04c3fSmrg         inst->force_writemask_all = force_writemask_all;
01e04c3fSmrg         inst->annotation = annotation.str;
01e04c3fSmrg         inst->ir = annotation.ir;
01e04c3fSmrg
01e04c3fSmrg         if (block)
01e04c3fSmrg            static_cast<instruction *>(cursor)->insert_before(block, inst);
01e04c3fSmrg         else
01e04c3fSmrg            cursor->insert_before(inst);
01e04c3fSmrg
01e04c3fSmrg         return inst;
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Select \p src0 if the comparison of both sources with the given
01e04c3fSmrg       * conditional mod evaluates to true, otherwise select \p src1.
01e04c3fSmrg       *
01e04c3fSmrg       * Generally useful to get the minimum or maximum of two values.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      emit_minmax(const dst_reg &dst, const src_reg &src0,
01e04c3fSmrg                  const src_reg &src1, brw_conditional_mod mod) const
01e04c3fSmrg      {
01e04c3fSmrg         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
01e04c3fSmrg
9f464c52Smaya         /* In some cases we can't have bytes as operand for src1, so use the
9f464c52Smaya          * same type for both operand.
9f464c52Smaya          */
7ec681f3Smrg         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
7ec681f3Smrg                                     fix_unsigned_negate(src1)));
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Copy any live channel from \p src to the first channel of the result.
01e04c3fSmrg       */
01e04c3fSmrg      src_reg
01e04c3fSmrg      emit_uniformize(const src_reg &src) const
01e04c3fSmrg      {
01e04c3fSmrg         /* FIXME: We use a vector chan_index and dst to allow constant and
01e04c3fSmrg          * copy propagration to move result all the way into the consuming
01e04c3fSmrg          * instruction (typically a surface index or sampler index for a
01e04c3fSmrg          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
01e04c3fSmrg          * dispatch. Once we teach const/copy propagation about scalars we
01e04c3fSmrg          * should go back to scalar destinations here.
01e04c3fSmrg          */
01e04c3fSmrg         const fs_builder ubld = exec_all();
01e04c3fSmrg         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
01e04c3fSmrg         const dst_reg dst = vgrf(src.type);
01e04c3fSmrg
7ec681f3Smrg         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
01e04c3fSmrg         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
01e04c3fSmrg
01e04c3fSmrg         return src_reg(component(dst, 0));
01e04c3fSmrg      }
01e04c3fSmrg
9f464c52Smaya      src_reg
9f464c52Smaya      move_to_vgrf(const src_reg &src, unsigned num_components) const
9f464c52Smaya      {
9f464c52Smaya         src_reg *const src_comps = new src_reg[num_components];
9f464c52Smaya         for (unsigned i = 0; i < num_components; i++)
9f464c52Smaya            src_comps[i] = offset(src, dispatch_width(), i);
9f464c52Smaya
9f464c52Smaya         const dst_reg dst = vgrf(src.type, num_components);
9f464c52Smaya         LOAD_PAYLOAD(dst, src_comps, num_components, 0);
9f464c52Smaya
9f464c52Smaya         delete[] src_comps;
9f464c52Smaya
9f464c52Smaya         return src_reg(dst);
9f464c52Smaya      }
9f464c52Smaya
7ec681f3Smrg      void
7ec681f3Smrg      emit_scan_step(enum opcode opcode, brw_conditional_mod mod,
7ec681f3Smrg                     const dst_reg &tmp,
7ec681f3Smrg                     unsigned left_offset, unsigned left_stride,
7ec681f3Smrg                     unsigned right_offset, unsigned right_stride) const
7ec681f3Smrg      {
7ec681f3Smrg         dst_reg left, right;
7ec681f3Smrg         left = horiz_stride(horiz_offset(tmp, left_offset), left_stride);
7ec681f3Smrg         right = horiz_stride(horiz_offset(tmp, right_offset), right_stride);
7ec681f3Smrg         if ((tmp.type == BRW_REGISTER_TYPE_Q ||
7ec681f3Smrg              tmp.type == BRW_REGISTER_TYPE_UQ) &&
7ec681f3Smrg             !shader->devinfo->has_64bit_int) {
7ec681f3Smrg            switch (opcode) {
7ec681f3Smrg            case BRW_OPCODE_MUL:
7ec681f3Smrg               /* This will get lowered by integer MUL lowering */
7ec681f3Smrg               set_condmod(mod, emit(opcode, right, left, right));
7ec681f3Smrg               break;
7ec681f3Smrg
7ec681f3Smrg            case BRW_OPCODE_SEL: {
7ec681f3Smrg               /* In order for the comparisons to work out right, we need our
7ec681f3Smrg                * comparisons to be strict.
7ec681f3Smrg                */
7ec681f3Smrg               assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE);
7ec681f3Smrg               if (mod == BRW_CONDITIONAL_GE)
7ec681f3Smrg                  mod = BRW_CONDITIONAL_G;
7ec681f3Smrg
7ec681f3Smrg               /* We treat the bottom 32 bits as unsigned regardless of
7ec681f3Smrg                * whether or not the integer as a whole is signed.
7ec681f3Smrg                */
7ec681f3Smrg               dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0);
7ec681f3Smrg               dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0);
7ec681f3Smrg
7ec681f3Smrg               /* The upper bits get the same sign as the 64-bit type */
7ec681f3Smrg               brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type);
7ec681f3Smrg               dst_reg right_high = subscript(right, type32, 1);
7ec681f3Smrg               dst_reg left_high = subscript(left, type32, 1);
7ec681f3Smrg
7ec681f3Smrg               /* Build up our comparison:
7ec681f3Smrg                *
7ec681f3Smrg                *   l_hi < r_hi || (l_hi == r_hi && l_low < r_low)
7ec681f3Smrg                */
7ec681f3Smrg               CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD),
7ec681f3Smrg                                  retype(right_low, BRW_REGISTER_TYPE_UD), mod);
7ec681f3Smrg               set_predicate(BRW_PREDICATE_NORMAL,
7ec681f3Smrg                             CMP(null_reg_ud(), left_high, right_high,
7ec681f3Smrg                                 BRW_CONDITIONAL_EQ));
7ec681f3Smrg               set_predicate_inv(BRW_PREDICATE_NORMAL, true,
7ec681f3Smrg                                 CMP(null_reg_ud(), left_high, right_high, mod));
7ec681f3Smrg
7ec681f3Smrg               /* We could use selects here or we could use predicated MOVs
7ec681f3Smrg                * because the destination and second source (if it were a SEL)
7ec681f3Smrg                * are the same.
7ec681f3Smrg                */
7ec681f3Smrg               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low));
7ec681f3Smrg               set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high));
7ec681f3Smrg               break;
7ec681f3Smrg            }
7ec681f3Smrg
7ec681f3Smrg            default:
7ec681f3Smrg               unreachable("Unsupported 64-bit scan op");
7ec681f3Smrg            }
7ec681f3Smrg         } else {
7ec681f3Smrg            set_condmod(mod, emit(opcode, right, left, right));
7ec681f3Smrg         }
7ec681f3Smrg      }
7ec681f3Smrg
01e04c3fSmrg      void
01e04c3fSmrg      emit_scan(enum opcode opcode, const dst_reg &tmp,
01e04c3fSmrg                unsigned cluster_size, brw_conditional_mod mod) const
01e04c3fSmrg      {
01e04c3fSmrg         assert(dispatch_width() >= 8);
01e04c3fSmrg
01e04c3fSmrg         /* The instruction splitting code isn't advanced enough to split
01e04c3fSmrg          * these so we need to handle that ourselves.
01e04c3fSmrg          */
01e04c3fSmrg         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
01e04c3fSmrg            const unsigned half_width = dispatch_width() / 2;
01e04c3fSmrg            const fs_builder ubld = exec_all().group(half_width, 0);
01e04c3fSmrg            dst_reg left = tmp;
01e04c3fSmrg            dst_reg right = horiz_offset(tmp, half_width);
01e04c3fSmrg            ubld.emit_scan(opcode, left, cluster_size, mod);
01e04c3fSmrg            ubld.emit_scan(opcode, right, cluster_size, mod);
01e04c3fSmrg            if (cluster_size > half_width) {
7ec681f3Smrg               ubld.emit_scan_step(opcode, mod, tmp,
7ec681f3Smrg                                   half_width - 1, 0, half_width, 1);
01e04c3fSmrg            }
01e04c3fSmrg            return;
01e04c3fSmrg         }
01e04c3fSmrg
01e04c3fSmrg         if (cluster_size > 1) {
01e04c3fSmrg            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
7ec681f3Smrg            ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2);
01e04c3fSmrg         }
01e04c3fSmrg
01e04c3fSmrg         if (cluster_size > 2) {
9f464c52Smaya            if (type_sz(tmp.type) <= 4) {
01e04c3fSmrg               const fs_builder ubld =
01e04c3fSmrg                  exec_all().group(dispatch_width() / 4, 0);
7ec681f3Smrg               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4);
7ec681f3Smrg               ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4);
01e04c3fSmrg            } else {
01e04c3fSmrg               /* For 64-bit types, we have to do things differently because
01e04c3fSmrg                * the code above would land us with destination strides that
01e04c3fSmrg                * the hardware can't handle.  Fortunately, we'll only be
01e04c3fSmrg                * 8-wide in that case and it's the same number of
01e04c3fSmrg                * instructions.
01e04c3fSmrg                */
01e04c3fSmrg               const fs_builder ubld = exec_all().group(2, 0);
7ec681f3Smrg               for (unsigned i = 0; i < dispatch_width(); i += 4)
7ec681f3Smrg                  ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1);
01e04c3fSmrg            }
01e04c3fSmrg         }
01e04c3fSmrg
7ec681f3Smrg         for (unsigned i = 4;
7ec681f3Smrg              i < MIN2(cluster_size, dispatch_width());
7ec681f3Smrg              i *= 2) {
7ec681f3Smrg            const fs_builder ubld = exec_all().group(i, 0);
7ec681f3Smrg            ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1);
01e04c3fSmrg
7ec681f3Smrg            if (dispatch_width() > i * 2)
7ec681f3Smrg               ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1);
01e04c3fSmrg
7ec681f3Smrg            if (dispatch_width() > i * 4) {
7ec681f3Smrg               ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1);
7ec681f3Smrg               ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1);
7ec681f3Smrg            }
01e04c3fSmrg         }
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Assorted arithmetic ops.
01e04c3fSmrg       * @{
01e04c3fSmrg       */
01e04c3fSmrg#define ALU1(op)                                        \
01e04c3fSmrg      instruction *                                     \
01e04c3fSmrg      op(const dst_reg &dst, const src_reg &src0) const \
01e04c3fSmrg      {                                                 \
01e04c3fSmrg         return emit(BRW_OPCODE_##op, dst, src0);       \
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg#define ALU2(op)                                                        \
01e04c3fSmrg      instruction *                                                     \
01e04c3fSmrg      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
01e04c3fSmrg      {                                                                 \
01e04c3fSmrg         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg#define ALU2_ACC(op)                                                    \
01e04c3fSmrg      instruction *                                                     \
01e04c3fSmrg      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
01e04c3fSmrg      {                                                                 \
01e04c3fSmrg         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
01e04c3fSmrg         inst->writes_accumulator = true;                               \
01e04c3fSmrg         return inst;                                                   \
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg#define ALU3(op)                                                        \
01e04c3fSmrg      instruction *                                                     \
01e04c3fSmrg      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
01e04c3fSmrg         const src_reg &src2) const                                     \
01e04c3fSmrg      {                                                                 \
01e04c3fSmrg         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      ALU2(ADD)
7ec681f3Smrg      ALU3(ADD3)
01e04c3fSmrg      ALU2_ACC(ADDC)
01e04c3fSmrg      ALU2(AND)
01e04c3fSmrg      ALU2(ASR)
01e04c3fSmrg      ALU2(AVG)
01e04c3fSmrg      ALU3(BFE)
01e04c3fSmrg      ALU2(BFI1)
01e04c3fSmrg      ALU3(BFI2)
01e04c3fSmrg      ALU1(BFREV)
01e04c3fSmrg      ALU1(CBIT)
01e04c3fSmrg      ALU1(DIM)
01e04c3fSmrg      ALU2(DP2)
01e04c3fSmrg      ALU2(DP3)
01e04c3fSmrg      ALU2(DP4)
01e04c3fSmrg      ALU2(DPH)
01e04c3fSmrg      ALU1(F16TO32)
01e04c3fSmrg      ALU1(F32TO16)
01e04c3fSmrg      ALU1(FBH)
01e04c3fSmrg      ALU1(FBL)
01e04c3fSmrg      ALU1(FRC)
7ec681f3Smrg      ALU3(DP4A)
01e04c3fSmrg      ALU2(LINE)
01e04c3fSmrg      ALU1(LZD)
01e04c3fSmrg      ALU2(MAC)
01e04c3fSmrg      ALU2_ACC(MACH)
01e04c3fSmrg      ALU3(MAD)
01e04c3fSmrg      ALU1(MOV)
01e04c3fSmrg      ALU2(MUL)
01e04c3fSmrg      ALU1(NOT)
01e04c3fSmrg      ALU2(OR)
01e04c3fSmrg      ALU2(PLN)
01e04c3fSmrg      ALU1(RNDD)
01e04c3fSmrg      ALU1(RNDE)
01e04c3fSmrg      ALU1(RNDU)
01e04c3fSmrg      ALU1(RNDZ)
7ec681f3Smrg      ALU2(ROL)
7ec681f3Smrg      ALU2(ROR)
01e04c3fSmrg      ALU2(SAD2)
01e04c3fSmrg      ALU2_ACC(SADA2)
01e04c3fSmrg      ALU2(SEL)
01e04c3fSmrg      ALU2(SHL)
01e04c3fSmrg      ALU2(SHR)
01e04c3fSmrg      ALU2_ACC(SUBB)
01e04c3fSmrg      ALU2(XOR)
01e04c3fSmrg
01e04c3fSmrg#undef ALU3
01e04c3fSmrg#undef ALU2_ACC
01e04c3fSmrg#undef ALU2
01e04c3fSmrg#undef ALU1
01e04c3fSmrg      /** @} */
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * CMP: Sets the low bit of the destination channels with the result
01e04c3fSmrg       * of the comparison, while the upper bits are undefined, and updates
01e04c3fSmrg       * the flag register with the packed 16 bits of the result.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
01e04c3fSmrg          brw_conditional_mod condition) const
01e04c3fSmrg      {
01e04c3fSmrg         /* Take the instruction:
01e04c3fSmrg          *
01e04c3fSmrg          * CMP null<d> src0<f> src1<f>
01e04c3fSmrg          *
7ec681f3Smrg          * Original gfx4 does type conversion to the destination type
01e04c3fSmrg          * before comparison, producing garbage results for floating
01e04c3fSmrg          * point comparisons.
01e04c3fSmrg          *
01e04c3fSmrg          * The destination type doesn't matter on newer generations,
01e04c3fSmrg          * so we set the type to match src0 so we can compact the
01e04c3fSmrg          * instruction.
01e04c3fSmrg          */
01e04c3fSmrg         return set_condmod(condition,
01e04c3fSmrg                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
01e04c3fSmrg                                 fix_unsigned_negate(src0),
01e04c3fSmrg                                 fix_unsigned_negate(src1)));
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
7ec681f3Smrg       * CMPN: Behaves like CMP, but produces true if src1 is NaN.
7ec681f3Smrg       */
7ec681f3Smrg      instruction *
7ec681f3Smrg      CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
7ec681f3Smrg           brw_conditional_mod condition) const
7ec681f3Smrg      {
7ec681f3Smrg         /* Take the instruction:
7ec681f3Smrg          *
7ec681f3Smrg          * CMP null<d> src0<f> src1<f>
7ec681f3Smrg          *
7ec681f3Smrg          * Original gfx4 does type conversion to the destination type
7ec681f3Smrg          * before comparison, producing garbage results for floating
7ec681f3Smrg          * point comparisons.
7ec681f3Smrg          *
7ec681f3Smrg          * The destination type doesn't matter on newer generations,
7ec681f3Smrg          * so we set the type to match src0 so we can compact the
7ec681f3Smrg          * instruction.
7ec681f3Smrg          */
7ec681f3Smrg         return set_condmod(condition,
7ec681f3Smrg                            emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
7ec681f3Smrg                                 fix_unsigned_negate(src0),
7ec681f3Smrg                                 fix_unsigned_negate(src1)));
7ec681f3Smrg      }
7ec681f3Smrg
7ec681f3Smrg      /**
7ec681f3Smrg       * Gfx4 predicated IF.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      IF(brw_predicate predicate) const
01e04c3fSmrg      {
01e04c3fSmrg         return set_predicate(predicate, emit(BRW_OPCODE_IF));
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
01e04c3fSmrg           const src_reg &src2, brw_conditional_mod condition) const
01e04c3fSmrg      {
01e04c3fSmrg         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
01e04c3fSmrg          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
01e04c3fSmrg          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
01e04c3fSmrg          */
01e04c3fSmrg         assert(src2.type == BRW_REGISTER_TYPE_F);
01e04c3fSmrg
01e04c3fSmrg         return set_condmod(condition,
01e04c3fSmrg                            emit(BRW_OPCODE_CSEL,
01e04c3fSmrg                                 retype(dst, BRW_REGISTER_TYPE_F),
01e04c3fSmrg                                 retype(src0, BRW_REGISTER_TYPE_F),
7ec681f3Smrg                                 retype(src1, BRW_REGISTER_TYPE_F),
7ec681f3Smrg                                 src2));
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Emit a linear interpolation instruction.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
01e04c3fSmrg          const src_reg &a) const
01e04c3fSmrg      {
7ec681f3Smrg         if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) {
01e04c3fSmrg            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
01e04c3fSmrg             * we need to reorder the operands.
01e04c3fSmrg             */
01e04c3fSmrg            return emit(BRW_OPCODE_LRP, dst, a, y, x);
01e04c3fSmrg
01e04c3fSmrg         } else {
01e04c3fSmrg            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
01e04c3fSmrg            const dst_reg y_times_a = vgrf(dst.type);
01e04c3fSmrg            const dst_reg one_minus_a = vgrf(dst.type);
01e04c3fSmrg            const dst_reg x_times_one_minus_a = vgrf(dst.type);
01e04c3fSmrg
01e04c3fSmrg            MUL(y_times_a, y, a);
01e04c3fSmrg            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
01e04c3fSmrg            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
01e04c3fSmrg            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
01e04c3fSmrg         }
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Collect a number of registers in a contiguous range of registers.
01e04c3fSmrg       */
01e04c3fSmrg      instruction *
01e04c3fSmrg      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
01e04c3fSmrg                   unsigned sources, unsigned header_size) const
01e04c3fSmrg      {
01e04c3fSmrg         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
01e04c3fSmrg         inst->header_size = header_size;
01e04c3fSmrg         inst->size_written = header_size * REG_SIZE;
01e04c3fSmrg         for (unsigned i = header_size; i < sources; i++) {
01e04c3fSmrg            inst->size_written +=
01e04c3fSmrg               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
01e04c3fSmrg                     REG_SIZE);
01e04c3fSmrg         }
01e04c3fSmrg
01e04c3fSmrg         return inst;
01e04c3fSmrg      }
01e04c3fSmrg
7ec681f3Smrg      instruction *
7ec681f3Smrg      UNDEF(const dst_reg &dst) const
9f464c52Smaya      {
7ec681f3Smrg         assert(dst.file == VGRF);
7ec681f3Smrg         instruction *inst = emit(SHADER_OPCODE_UNDEF,
7ec681f3Smrg                                  retype(dst, BRW_REGISTER_TYPE_UD));
7ec681f3Smrg         inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE;
9f464c52Smaya
7ec681f3Smrg         return inst;
9f464c52Smaya      }
9f464c52Smaya
7ec681f3Smrg      backend_shader *shader;
7ec681f3Smrg
01e04c3fSmrg   private:
01e04c3fSmrg      /**
01e04c3fSmrg       * Workaround for negation of UD registers.  See comment in
01e04c3fSmrg       * fs_generator::generate_code() for more details.
01e04c3fSmrg       */
01e04c3fSmrg      src_reg
01e04c3fSmrg      fix_unsigned_negate(const src_reg &src) const
01e04c3fSmrg      {
01e04c3fSmrg         if (src.type == BRW_REGISTER_TYPE_UD &&
01e04c3fSmrg             src.negate) {
01e04c3fSmrg            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
01e04c3fSmrg            MOV(temp, src);
01e04c3fSmrg            return src_reg(temp);
01e04c3fSmrg         } else {
01e04c3fSmrg            return src;
01e04c3fSmrg         }
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Workaround for source register modes not supported by the ternary
01e04c3fSmrg       * instruction encoding.
01e04c3fSmrg       */
01e04c3fSmrg      src_reg
01e04c3fSmrg      fix_3src_operand(const src_reg &src) const
01e04c3fSmrg      {
9f464c52Smaya         switch (src.file) {
9f464c52Smaya         case FIXED_GRF:
9f464c52Smaya            /* FINISHME: Could handle scalar region, other stride=1 regions */
9f464c52Smaya            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
9f464c52Smaya                src.width != BRW_WIDTH_8 ||
9f464c52Smaya                src.hstride != BRW_HORIZONTAL_STRIDE_1)
9f464c52Smaya               break;
7ec681f3Smrg            FALLTHROUGH;
9f464c52Smaya         case ATTR:
9f464c52Smaya         case VGRF:
9f464c52Smaya         case UNIFORM:
9f464c52Smaya         case IMM:
01e04c3fSmrg            return src;
9f464c52Smaya         default:
9f464c52Smaya            break;
01e04c3fSmrg         }
9f464c52Smaya
9f464c52Smaya         dst_reg expanded = vgrf(src.type);
9f464c52Smaya         MOV(expanded, src);
9f464c52Smaya         return expanded;
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      /**
01e04c3fSmrg       * Workaround for source register modes not supported by the math
01e04c3fSmrg       * instruction.
01e04c3fSmrg       */
01e04c3fSmrg      src_reg
01e04c3fSmrg      fix_math_operand(const src_reg &src) const
01e04c3fSmrg      {
7ec681f3Smrg         /* Can't do hstride == 0 args on gfx6 math, so expand it out. We
01e04c3fSmrg          * might be able to do better by doing execsize = 1 math and then
01e04c3fSmrg          * expanding that result out, but we would need to be careful with
01e04c3fSmrg          * masking.
01e04c3fSmrg          *
7ec681f3Smrg          * Gfx6 hardware ignores source modifiers (negate and abs) on math
01e04c3fSmrg          * instructions, so we also move to a temp to set those up.
01e04c3fSmrg          *
7ec681f3Smrg          * Gfx7 relaxes most of the above restrictions, but still can't use IMM
01e04c3fSmrg          * operands to math
01e04c3fSmrg          */
7ec681f3Smrg         if ((shader->devinfo->ver == 6 &&
01e04c3fSmrg              (src.file == IMM || src.file == UNIFORM ||
01e04c3fSmrg               src.abs || src.negate)) ||
7ec681f3Smrg             (shader->devinfo->ver == 7 && src.file == IMM)) {
01e04c3fSmrg            const dst_reg tmp = vgrf(src.type);
01e04c3fSmrg            MOV(tmp, src);
01e04c3fSmrg            return tmp;
01e04c3fSmrg         } else {
01e04c3fSmrg            return src;
01e04c3fSmrg         }
01e04c3fSmrg      }
01e04c3fSmrg
01e04c3fSmrg      bblock_t *block;
01e04c3fSmrg      exec_node *cursor;
01e04c3fSmrg
01e04c3fSmrg      unsigned _dispatch_width;
01e04c3fSmrg      unsigned _group;
01e04c3fSmrg      bool force_writemask_all;
01e04c3fSmrg
01e04c3fSmrg      /** Debug annotation info. */
01e04c3fSmrg      struct {
01e04c3fSmrg         const char *str;
01e04c3fSmrg         const void *ir;
01e04c3fSmrg      } annotation;
01e04c3fSmrg   };
01e04c3fSmrg}
01e04c3fSmrg
01e04c3fSmrg#endif