101e04c3fSmrg/* -*- c++ -*- */ 201e04c3fSmrg/* 301e04c3fSmrg * Copyright © 2010-2015 Intel Corporation 401e04c3fSmrg * 501e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 601e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 701e04c3fSmrg * to deal in the Software without restriction, including without limitation 801e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 901e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 1001e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1101e04c3fSmrg * 1201e04c3fSmrg * The above copyright notice and this permission notice (including the next 1301e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1401e04c3fSmrg * Software. 1501e04c3fSmrg * 1601e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1701e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1801e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1901e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 2001e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2101e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 2201e04c3fSmrg * IN THE SOFTWARE. 2301e04c3fSmrg */ 2401e04c3fSmrg 2501e04c3fSmrg#ifndef BRW_FS_BUILDER_H 2601e04c3fSmrg#define BRW_FS_BUILDER_H 2701e04c3fSmrg 2801e04c3fSmrg#include "brw_ir_fs.h" 2901e04c3fSmrg#include "brw_shader.h" 3001e04c3fSmrg 3101e04c3fSmrgnamespace brw { 3201e04c3fSmrg /** 3301e04c3fSmrg * Toolbox to assemble an FS IR program out of individual instructions. 3401e04c3fSmrg * 3501e04c3fSmrg * This object is meant to have an interface consistent with 3601e04c3fSmrg * brw::vec4_builder. They cannot be fully interchangeable because 3701e04c3fSmrg * brw::fs_builder generates scalar code while brw::vec4_builder generates 3801e04c3fSmrg * vector code. 3901e04c3fSmrg */ 4001e04c3fSmrg class fs_builder { 4101e04c3fSmrg public: 4201e04c3fSmrg /** Type used in this IR to represent a source of an instruction. */ 4301e04c3fSmrg typedef fs_reg src_reg; 4401e04c3fSmrg 4501e04c3fSmrg /** Type used in this IR to represent the destination of an instruction. */ 4601e04c3fSmrg typedef fs_reg dst_reg; 4701e04c3fSmrg 4801e04c3fSmrg /** Type used in this IR to represent an instruction. */ 4901e04c3fSmrg typedef fs_inst instruction; 5001e04c3fSmrg 5101e04c3fSmrg /** 5201e04c3fSmrg * Construct an fs_builder that inserts instructions into \p shader. 5301e04c3fSmrg * \p dispatch_width gives the native execution width of the program. 5401e04c3fSmrg */ 5501e04c3fSmrg fs_builder(backend_shader *shader, 5601e04c3fSmrg unsigned dispatch_width) : 5701e04c3fSmrg shader(shader), block(NULL), cursor(NULL), 5801e04c3fSmrg _dispatch_width(dispatch_width), 5901e04c3fSmrg _group(0), 6001e04c3fSmrg force_writemask_all(false), 6101e04c3fSmrg annotation() 6201e04c3fSmrg { 6301e04c3fSmrg } 6401e04c3fSmrg 6501e04c3fSmrg /** 6601e04c3fSmrg * Construct an fs_builder that inserts instructions into \p shader 6701e04c3fSmrg * before instruction \p inst in basic block \p block. The default 6801e04c3fSmrg * execution controls and debug annotation are initialized from the 6901e04c3fSmrg * instruction passed as argument. 7001e04c3fSmrg */ 7101e04c3fSmrg fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) : 7201e04c3fSmrg shader(shader), block(block), cursor(inst), 7301e04c3fSmrg _dispatch_width(inst->exec_size), 7401e04c3fSmrg _group(inst->group), 7501e04c3fSmrg force_writemask_all(inst->force_writemask_all) 7601e04c3fSmrg { 7701e04c3fSmrg annotation.str = inst->annotation; 7801e04c3fSmrg annotation.ir = inst->ir; 7901e04c3fSmrg } 8001e04c3fSmrg 8101e04c3fSmrg /** 8201e04c3fSmrg * Construct an fs_builder that inserts instructions before \p cursor in 8301e04c3fSmrg * basic block \p block, inheriting other code generation parameters 8401e04c3fSmrg * from this. 8501e04c3fSmrg */ 8601e04c3fSmrg fs_builder 8701e04c3fSmrg at(bblock_t *block, exec_node *cursor) const 8801e04c3fSmrg { 8901e04c3fSmrg fs_builder bld = *this; 9001e04c3fSmrg bld.block = block; 9101e04c3fSmrg bld.cursor = cursor; 9201e04c3fSmrg return bld; 9301e04c3fSmrg } 9401e04c3fSmrg 9501e04c3fSmrg /** 9601e04c3fSmrg * Construct an fs_builder appending instructions at the end of the 9701e04c3fSmrg * instruction list of the shader, inheriting other code generation 9801e04c3fSmrg * parameters from this. 9901e04c3fSmrg */ 10001e04c3fSmrg fs_builder 10101e04c3fSmrg at_end() const 10201e04c3fSmrg { 10301e04c3fSmrg return at(NULL, (exec_node *)&shader->instructions.tail_sentinel); 10401e04c3fSmrg } 10501e04c3fSmrg 10601e04c3fSmrg /** 10701e04c3fSmrg * Construct a builder specifying the default SIMD width and group of 10801e04c3fSmrg * channel enable signals, inheriting other code generation parameters 10901e04c3fSmrg * from this. 11001e04c3fSmrg * 11101e04c3fSmrg * \p n gives the default SIMD width, \p i gives the slot group used for 11201e04c3fSmrg * predication and control flow masking in multiples of \p n channels. 11301e04c3fSmrg */ 11401e04c3fSmrg fs_builder 11501e04c3fSmrg group(unsigned n, unsigned i) const 11601e04c3fSmrg { 11701e04c3fSmrg fs_builder bld = *this; 1189f464c52Smaya 1199f464c52Smaya if (n <= dispatch_width() && i < dispatch_width() / n) { 1209f464c52Smaya bld._group += i * n; 1219f464c52Smaya } else { 1229f464c52Smaya /* The requested channel group isn't a subset of the channel group 1239f464c52Smaya * of this builder, which means that the resulting instructions 1249f464c52Smaya * would use (potentially undefined) channel enable signals not 1259f464c52Smaya * specified by the parent builder. That's only valid if the 1269f464c52Smaya * instruction doesn't have per-channel semantics, in which case 1279f464c52Smaya * we should clear off the default group index in order to prevent 1289f464c52Smaya * emitting instructions with channel group not aligned to their 1299f464c52Smaya * own execution size. 1309f464c52Smaya */ 1319f464c52Smaya assert(force_writemask_all); 1329f464c52Smaya bld._group = 0; 1339f464c52Smaya } 1349f464c52Smaya 13501e04c3fSmrg bld._dispatch_width = n; 13601e04c3fSmrg return bld; 13701e04c3fSmrg } 13801e04c3fSmrg 13901e04c3fSmrg /** 14001e04c3fSmrg * Alias for group() with width equal to eight. 14101e04c3fSmrg */ 14201e04c3fSmrg fs_builder 1437ec681f3Smrg quarter(unsigned i) const 14401e04c3fSmrg { 14501e04c3fSmrg return group(8, i); 14601e04c3fSmrg } 14701e04c3fSmrg 14801e04c3fSmrg /** 14901e04c3fSmrg * Construct a builder with per-channel control flow execution masking 15001e04c3fSmrg * disabled if \p b is true. If control flow execution masking is 15101e04c3fSmrg * already disabled this has no effect. 15201e04c3fSmrg */ 15301e04c3fSmrg fs_builder 15401e04c3fSmrg exec_all(bool b = true) const 15501e04c3fSmrg { 15601e04c3fSmrg fs_builder bld = *this; 15701e04c3fSmrg if (b) 15801e04c3fSmrg bld.force_writemask_all = true; 15901e04c3fSmrg return bld; 16001e04c3fSmrg } 16101e04c3fSmrg 16201e04c3fSmrg /** 16301e04c3fSmrg * Construct a builder with the given debug annotation info. 16401e04c3fSmrg */ 16501e04c3fSmrg fs_builder 16601e04c3fSmrg annotate(const char *str, const void *ir = NULL) const 16701e04c3fSmrg { 16801e04c3fSmrg fs_builder bld = *this; 16901e04c3fSmrg bld.annotation.str = str; 17001e04c3fSmrg bld.annotation.ir = ir; 17101e04c3fSmrg return bld; 17201e04c3fSmrg } 17301e04c3fSmrg 17401e04c3fSmrg /** 17501e04c3fSmrg * Get the SIMD width in use. 17601e04c3fSmrg */ 17701e04c3fSmrg unsigned 17801e04c3fSmrg dispatch_width() const 17901e04c3fSmrg { 18001e04c3fSmrg return _dispatch_width; 18101e04c3fSmrg } 18201e04c3fSmrg 18301e04c3fSmrg /** 18401e04c3fSmrg * Get the channel group in use. 18501e04c3fSmrg */ 18601e04c3fSmrg unsigned 18701e04c3fSmrg group() const 18801e04c3fSmrg { 18901e04c3fSmrg return _group; 19001e04c3fSmrg } 19101e04c3fSmrg 19201e04c3fSmrg /** 19301e04c3fSmrg * Allocate a virtual register of natural vector size (one for this IR) 19401e04c3fSmrg * and SIMD width. \p n gives the amount of space to allocate in 19501e04c3fSmrg * dispatch_width units (which is just enough space for one logical 19601e04c3fSmrg * component in this IR). 19701e04c3fSmrg */ 19801e04c3fSmrg dst_reg 19901e04c3fSmrg vgrf(enum brw_reg_type type, unsigned n = 1) const 20001e04c3fSmrg { 20101e04c3fSmrg assert(dispatch_width() <= 32); 20201e04c3fSmrg 20301e04c3fSmrg if (n > 0) 20401e04c3fSmrg return dst_reg(VGRF, shader->alloc.allocate( 20501e04c3fSmrg DIV_ROUND_UP(n * type_sz(type) * dispatch_width(), 20601e04c3fSmrg REG_SIZE)), 20701e04c3fSmrg type); 20801e04c3fSmrg else 20901e04c3fSmrg return retype(null_reg_ud(), type); 21001e04c3fSmrg } 21101e04c3fSmrg 21201e04c3fSmrg /** 21301e04c3fSmrg * Create a null register of floating type. 21401e04c3fSmrg */ 21501e04c3fSmrg dst_reg 21601e04c3fSmrg null_reg_f() const 21701e04c3fSmrg { 21801e04c3fSmrg return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F)); 21901e04c3fSmrg } 22001e04c3fSmrg 22101e04c3fSmrg dst_reg 22201e04c3fSmrg null_reg_df() const 22301e04c3fSmrg { 22401e04c3fSmrg return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF)); 22501e04c3fSmrg } 22601e04c3fSmrg 22701e04c3fSmrg /** 22801e04c3fSmrg * Create a null register of signed integer type. 22901e04c3fSmrg */ 23001e04c3fSmrg dst_reg 23101e04c3fSmrg null_reg_d() const 23201e04c3fSmrg { 23301e04c3fSmrg return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); 23401e04c3fSmrg } 23501e04c3fSmrg 23601e04c3fSmrg /** 23701e04c3fSmrg * Create a null register of unsigned integer type. 23801e04c3fSmrg */ 23901e04c3fSmrg dst_reg 24001e04c3fSmrg null_reg_ud() const 24101e04c3fSmrg { 24201e04c3fSmrg return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); 24301e04c3fSmrg } 24401e04c3fSmrg 24501e04c3fSmrg /** 24601e04c3fSmrg * Insert an instruction into the program. 24701e04c3fSmrg */ 24801e04c3fSmrg instruction * 24901e04c3fSmrg emit(const instruction &inst) const 25001e04c3fSmrg { 25101e04c3fSmrg return emit(new(shader->mem_ctx) instruction(inst)); 25201e04c3fSmrg } 25301e04c3fSmrg 25401e04c3fSmrg /** 25501e04c3fSmrg * Create and insert a nullary control instruction into the program. 25601e04c3fSmrg */ 25701e04c3fSmrg instruction * 25801e04c3fSmrg emit(enum opcode opcode) const 25901e04c3fSmrg { 26001e04c3fSmrg return emit(instruction(opcode, dispatch_width())); 26101e04c3fSmrg } 26201e04c3fSmrg 26301e04c3fSmrg /** 26401e04c3fSmrg * Create and insert a nullary instruction into the program. 26501e04c3fSmrg */ 26601e04c3fSmrg instruction * 26701e04c3fSmrg emit(enum opcode opcode, const dst_reg &dst) const 26801e04c3fSmrg { 26901e04c3fSmrg return emit(instruction(opcode, dispatch_width(), dst)); 27001e04c3fSmrg } 27101e04c3fSmrg 27201e04c3fSmrg /** 27301e04c3fSmrg * Create and insert a unary instruction into the program. 27401e04c3fSmrg */ 27501e04c3fSmrg instruction * 27601e04c3fSmrg emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const 27701e04c3fSmrg { 27801e04c3fSmrg switch (opcode) { 27901e04c3fSmrg case SHADER_OPCODE_RCP: 28001e04c3fSmrg case SHADER_OPCODE_RSQ: 28101e04c3fSmrg case SHADER_OPCODE_SQRT: 28201e04c3fSmrg case SHADER_OPCODE_EXP2: 28301e04c3fSmrg case SHADER_OPCODE_LOG2: 28401e04c3fSmrg case SHADER_OPCODE_SIN: 28501e04c3fSmrg case SHADER_OPCODE_COS: 28601e04c3fSmrg return emit(instruction(opcode, dispatch_width(), dst, 28701e04c3fSmrg fix_math_operand(src0))); 28801e04c3fSmrg 28901e04c3fSmrg default: 29001e04c3fSmrg return emit(instruction(opcode, dispatch_width(), dst, src0)); 29101e04c3fSmrg } 29201e04c3fSmrg } 29301e04c3fSmrg 29401e04c3fSmrg /** 29501e04c3fSmrg * Create and insert a binary instruction into the program. 29601e04c3fSmrg */ 29701e04c3fSmrg instruction * 29801e04c3fSmrg emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, 29901e04c3fSmrg const src_reg &src1) const 30001e04c3fSmrg { 30101e04c3fSmrg switch (opcode) { 30201e04c3fSmrg case SHADER_OPCODE_POW: 30301e04c3fSmrg case SHADER_OPCODE_INT_QUOTIENT: 30401e04c3fSmrg case SHADER_OPCODE_INT_REMAINDER: 30501e04c3fSmrg return emit(instruction(opcode, dispatch_width(), dst, 30601e04c3fSmrg fix_math_operand(src0), 3077ec681f3Smrg fix_math_operand(src1))); 30801e04c3fSmrg 30901e04c3fSmrg default: 3109f464c52Smaya return emit(instruction(opcode, dispatch_width(), dst, 3117ec681f3Smrg src0, src1)); 31201e04c3fSmrg 31301e04c3fSmrg } 31401e04c3fSmrg } 31501e04c3fSmrg 31601e04c3fSmrg /** 31701e04c3fSmrg * Create and insert a ternary instruction into the program. 31801e04c3fSmrg */ 31901e04c3fSmrg instruction * 32001e04c3fSmrg emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, 32101e04c3fSmrg const src_reg &src1, const src_reg &src2) const 32201e04c3fSmrg { 32301e04c3fSmrg switch (opcode) { 32401e04c3fSmrg case BRW_OPCODE_BFE: 32501e04c3fSmrg case BRW_OPCODE_BFI2: 32601e04c3fSmrg case BRW_OPCODE_MAD: 32701e04c3fSmrg case BRW_OPCODE_LRP: 32801e04c3fSmrg return emit(instruction(opcode, dispatch_width(), dst, 32901e04c3fSmrg fix_3src_operand(src0), 3307ec681f3Smrg fix_3src_operand(src1), 3317ec681f3Smrg fix_3src_operand(src2))); 33201e04c3fSmrg 33301e04c3fSmrg default: 33401e04c3fSmrg return emit(instruction(opcode, dispatch_width(), dst, 3357ec681f3Smrg src0, src1, src2)); 33601e04c3fSmrg } 33701e04c3fSmrg } 33801e04c3fSmrg 33901e04c3fSmrg /** 34001e04c3fSmrg * Create and insert an instruction with a variable number of sources 34101e04c3fSmrg * into the program. 34201e04c3fSmrg */ 34301e04c3fSmrg instruction * 34401e04c3fSmrg emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[], 34501e04c3fSmrg unsigned n) const 34601e04c3fSmrg { 3477ec681f3Smrg /* Use the emit() methods for specific operand counts to ensure that 3487ec681f3Smrg * opcode-specific operand fixups occur. 3497ec681f3Smrg */ 3507ec681f3Smrg if (n == 2) { 3517ec681f3Smrg return emit(opcode, dst, srcs[0], srcs[1]); 3527ec681f3Smrg } else if (n == 3) { 3537ec681f3Smrg return emit(opcode, dst, srcs[0], srcs[1], srcs[2]); 3547ec681f3Smrg } else { 3557ec681f3Smrg return emit(instruction(opcode, dispatch_width(), dst, srcs, n)); 3567ec681f3Smrg } 35701e04c3fSmrg } 35801e04c3fSmrg 35901e04c3fSmrg /** 36001e04c3fSmrg * Insert a preallocated instruction into the program. 36101e04c3fSmrg */ 36201e04c3fSmrg instruction * 36301e04c3fSmrg emit(instruction *inst) const 36401e04c3fSmrg { 36501e04c3fSmrg assert(inst->exec_size <= 32); 36601e04c3fSmrg assert(inst->exec_size == dispatch_width() || 36701e04c3fSmrg force_writemask_all); 36801e04c3fSmrg 36901e04c3fSmrg inst->group = _group; 37001e04c3fSmrg inst->force_writemask_all = force_writemask_all; 37101e04c3fSmrg inst->annotation = annotation.str; 37201e04c3fSmrg inst->ir = annotation.ir; 37301e04c3fSmrg 37401e04c3fSmrg if (block) 37501e04c3fSmrg static_cast<instruction *>(cursor)->insert_before(block, inst); 37601e04c3fSmrg else 37701e04c3fSmrg cursor->insert_before(inst); 37801e04c3fSmrg 37901e04c3fSmrg return inst; 38001e04c3fSmrg } 38101e04c3fSmrg 38201e04c3fSmrg /** 38301e04c3fSmrg * Select \p src0 if the comparison of both sources with the given 38401e04c3fSmrg * conditional mod evaluates to true, otherwise select \p src1. 38501e04c3fSmrg * 38601e04c3fSmrg * Generally useful to get the minimum or maximum of two values. 38701e04c3fSmrg */ 38801e04c3fSmrg instruction * 38901e04c3fSmrg emit_minmax(const dst_reg &dst, const src_reg &src0, 39001e04c3fSmrg const src_reg &src1, brw_conditional_mod mod) const 39101e04c3fSmrg { 39201e04c3fSmrg assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L); 39301e04c3fSmrg 3949f464c52Smaya /* In some cases we can't have bytes as operand for src1, so use the 3959f464c52Smaya * same type for both operand. 3969f464c52Smaya */ 3977ec681f3Smrg return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0), 3987ec681f3Smrg fix_unsigned_negate(src1))); 39901e04c3fSmrg } 40001e04c3fSmrg 40101e04c3fSmrg /** 40201e04c3fSmrg * Copy any live channel from \p src to the first channel of the result. 40301e04c3fSmrg */ 40401e04c3fSmrg src_reg 40501e04c3fSmrg emit_uniformize(const src_reg &src) const 40601e04c3fSmrg { 40701e04c3fSmrg /* FIXME: We use a vector chan_index and dst to allow constant and 40801e04c3fSmrg * copy propagration to move result all the way into the consuming 40901e04c3fSmrg * instruction (typically a surface index or sampler index for a 41001e04c3fSmrg * send). This uses 1 or 3 extra hw registers in 16 or 32 wide 41101e04c3fSmrg * dispatch. Once we teach const/copy propagation about scalars we 41201e04c3fSmrg * should go back to scalar destinations here. 41301e04c3fSmrg */ 41401e04c3fSmrg const fs_builder ubld = exec_all(); 41501e04c3fSmrg const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD); 41601e04c3fSmrg const dst_reg dst = vgrf(src.type); 41701e04c3fSmrg 4187ec681f3Smrg ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index); 41901e04c3fSmrg ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0)); 42001e04c3fSmrg 42101e04c3fSmrg return src_reg(component(dst, 0)); 42201e04c3fSmrg } 42301e04c3fSmrg 4249f464c52Smaya src_reg 4259f464c52Smaya move_to_vgrf(const src_reg &src, unsigned num_components) const 4269f464c52Smaya { 4279f464c52Smaya src_reg *const src_comps = new src_reg[num_components]; 4289f464c52Smaya for (unsigned i = 0; i < num_components; i++) 4299f464c52Smaya src_comps[i] = offset(src, dispatch_width(), i); 4309f464c52Smaya 4319f464c52Smaya const dst_reg dst = vgrf(src.type, num_components); 4329f464c52Smaya LOAD_PAYLOAD(dst, src_comps, num_components, 0); 4339f464c52Smaya 4349f464c52Smaya delete[] src_comps; 4359f464c52Smaya 4369f464c52Smaya return src_reg(dst); 4379f464c52Smaya } 4389f464c52Smaya 4397ec681f3Smrg void 4407ec681f3Smrg emit_scan_step(enum opcode opcode, brw_conditional_mod mod, 4417ec681f3Smrg const dst_reg &tmp, 4427ec681f3Smrg unsigned left_offset, unsigned left_stride, 4437ec681f3Smrg unsigned right_offset, unsigned right_stride) const 4447ec681f3Smrg { 4457ec681f3Smrg dst_reg left, right; 4467ec681f3Smrg left = horiz_stride(horiz_offset(tmp, left_offset), left_stride); 4477ec681f3Smrg right = horiz_stride(horiz_offset(tmp, right_offset), right_stride); 4487ec681f3Smrg if ((tmp.type == BRW_REGISTER_TYPE_Q || 4497ec681f3Smrg tmp.type == BRW_REGISTER_TYPE_UQ) && 4507ec681f3Smrg !shader->devinfo->has_64bit_int) { 4517ec681f3Smrg switch (opcode) { 4527ec681f3Smrg case BRW_OPCODE_MUL: 4537ec681f3Smrg /* This will get lowered by integer MUL lowering */ 4547ec681f3Smrg set_condmod(mod, emit(opcode, right, left, right)); 4557ec681f3Smrg break; 4567ec681f3Smrg 4577ec681f3Smrg case BRW_OPCODE_SEL: { 4587ec681f3Smrg /* In order for the comparisons to work out right, we need our 4597ec681f3Smrg * comparisons to be strict. 4607ec681f3Smrg */ 4617ec681f3Smrg assert(mod == BRW_CONDITIONAL_L || mod == BRW_CONDITIONAL_GE); 4627ec681f3Smrg if (mod == BRW_CONDITIONAL_GE) 4637ec681f3Smrg mod = BRW_CONDITIONAL_G; 4647ec681f3Smrg 4657ec681f3Smrg /* We treat the bottom 32 bits as unsigned regardless of 4667ec681f3Smrg * whether or not the integer as a whole is signed. 4677ec681f3Smrg */ 4687ec681f3Smrg dst_reg right_low = subscript(right, BRW_REGISTER_TYPE_UD, 0); 4697ec681f3Smrg dst_reg left_low = subscript(left, BRW_REGISTER_TYPE_UD, 0); 4707ec681f3Smrg 4717ec681f3Smrg /* The upper bits get the same sign as the 64-bit type */ 4727ec681f3Smrg brw_reg_type type32 = brw_reg_type_from_bit_size(32, tmp.type); 4737ec681f3Smrg dst_reg right_high = subscript(right, type32, 1); 4747ec681f3Smrg dst_reg left_high = subscript(left, type32, 1); 4757ec681f3Smrg 4767ec681f3Smrg /* Build up our comparison: 4777ec681f3Smrg * 4787ec681f3Smrg * l_hi < r_hi || (l_hi == r_hi && l_low < r_low) 4797ec681f3Smrg */ 4807ec681f3Smrg CMP(null_reg_ud(), retype(left_low, BRW_REGISTER_TYPE_UD), 4817ec681f3Smrg retype(right_low, BRW_REGISTER_TYPE_UD), mod); 4827ec681f3Smrg set_predicate(BRW_PREDICATE_NORMAL, 4837ec681f3Smrg CMP(null_reg_ud(), left_high, right_high, 4847ec681f3Smrg BRW_CONDITIONAL_EQ)); 4857ec681f3Smrg set_predicate_inv(BRW_PREDICATE_NORMAL, true, 4867ec681f3Smrg CMP(null_reg_ud(), left_high, right_high, mod)); 4877ec681f3Smrg 4887ec681f3Smrg /* We could use selects here or we could use predicated MOVs 4897ec681f3Smrg * because the destination and second source (if it were a SEL) 4907ec681f3Smrg * are the same. 4917ec681f3Smrg */ 4927ec681f3Smrg set_predicate(BRW_PREDICATE_NORMAL, MOV(right_low, left_low)); 4937ec681f3Smrg set_predicate(BRW_PREDICATE_NORMAL, MOV(right_high, left_high)); 4947ec681f3Smrg break; 4957ec681f3Smrg } 4967ec681f3Smrg 4977ec681f3Smrg default: 4987ec681f3Smrg unreachable("Unsupported 64-bit scan op"); 4997ec681f3Smrg } 5007ec681f3Smrg } else { 5017ec681f3Smrg set_condmod(mod, emit(opcode, right, left, right)); 5027ec681f3Smrg } 5037ec681f3Smrg } 5047ec681f3Smrg 50501e04c3fSmrg void 50601e04c3fSmrg emit_scan(enum opcode opcode, const dst_reg &tmp, 50701e04c3fSmrg unsigned cluster_size, brw_conditional_mod mod) const 50801e04c3fSmrg { 50901e04c3fSmrg assert(dispatch_width() >= 8); 51001e04c3fSmrg 51101e04c3fSmrg /* The instruction splitting code isn't advanced enough to split 51201e04c3fSmrg * these so we need to handle that ourselves. 51301e04c3fSmrg */ 51401e04c3fSmrg if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) { 51501e04c3fSmrg const unsigned half_width = dispatch_width() / 2; 51601e04c3fSmrg const fs_builder ubld = exec_all().group(half_width, 0); 51701e04c3fSmrg dst_reg left = tmp; 51801e04c3fSmrg dst_reg right = horiz_offset(tmp, half_width); 51901e04c3fSmrg ubld.emit_scan(opcode, left, cluster_size, mod); 52001e04c3fSmrg ubld.emit_scan(opcode, right, cluster_size, mod); 52101e04c3fSmrg if (cluster_size > half_width) { 5227ec681f3Smrg ubld.emit_scan_step(opcode, mod, tmp, 5237ec681f3Smrg half_width - 1, 0, half_width, 1); 52401e04c3fSmrg } 52501e04c3fSmrg return; 52601e04c3fSmrg } 52701e04c3fSmrg 52801e04c3fSmrg if (cluster_size > 1) { 52901e04c3fSmrg const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0); 5307ec681f3Smrg ubld.emit_scan_step(opcode, mod, tmp, 0, 2, 1, 2); 53101e04c3fSmrg } 53201e04c3fSmrg 53301e04c3fSmrg if (cluster_size > 2) { 5349f464c52Smaya if (type_sz(tmp.type) <= 4) { 53501e04c3fSmrg const fs_builder ubld = 53601e04c3fSmrg exec_all().group(dispatch_width() / 4, 0); 5377ec681f3Smrg ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 2, 4); 5387ec681f3Smrg ubld.emit_scan_step(opcode, mod, tmp, 1, 4, 3, 4); 53901e04c3fSmrg } else { 54001e04c3fSmrg /* For 64-bit types, we have to do things differently because 54101e04c3fSmrg * the code above would land us with destination strides that 54201e04c3fSmrg * the hardware can't handle. Fortunately, we'll only be 54301e04c3fSmrg * 8-wide in that case and it's the same number of 54401e04c3fSmrg * instructions. 54501e04c3fSmrg */ 54601e04c3fSmrg const fs_builder ubld = exec_all().group(2, 0); 5477ec681f3Smrg for (unsigned i = 0; i < dispatch_width(); i += 4) 5487ec681f3Smrg ubld.emit_scan_step(opcode, mod, tmp, i + 1, 0, i + 2, 1); 54901e04c3fSmrg } 55001e04c3fSmrg } 55101e04c3fSmrg 5527ec681f3Smrg for (unsigned i = 4; 5537ec681f3Smrg i < MIN2(cluster_size, dispatch_width()); 5547ec681f3Smrg i *= 2) { 5557ec681f3Smrg const fs_builder ubld = exec_all().group(i, 0); 5567ec681f3Smrg ubld.emit_scan_step(opcode, mod, tmp, i - 1, 0, i, 1); 55701e04c3fSmrg 5587ec681f3Smrg if (dispatch_width() > i * 2) 5597ec681f3Smrg ubld.emit_scan_step(opcode, mod, tmp, i * 3 - 1, 0, i * 3, 1); 56001e04c3fSmrg 5617ec681f3Smrg if (dispatch_width() > i * 4) { 5627ec681f3Smrg ubld.emit_scan_step(opcode, mod, tmp, i * 5 - 1, 0, i * 5, 1); 5637ec681f3Smrg ubld.emit_scan_step(opcode, mod, tmp, i * 7 - 1, 0, i * 7, 1); 5647ec681f3Smrg } 56501e04c3fSmrg } 56601e04c3fSmrg } 56701e04c3fSmrg 56801e04c3fSmrg /** 56901e04c3fSmrg * Assorted arithmetic ops. 57001e04c3fSmrg * @{ 57101e04c3fSmrg */ 57201e04c3fSmrg#define ALU1(op) \ 57301e04c3fSmrg instruction * \ 57401e04c3fSmrg op(const dst_reg &dst, const src_reg &src0) const \ 57501e04c3fSmrg { \ 57601e04c3fSmrg return emit(BRW_OPCODE_##op, dst, src0); \ 57701e04c3fSmrg } 57801e04c3fSmrg 57901e04c3fSmrg#define ALU2(op) \ 58001e04c3fSmrg instruction * \ 58101e04c3fSmrg op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ 58201e04c3fSmrg { \ 58301e04c3fSmrg return emit(BRW_OPCODE_##op, dst, src0, src1); \ 58401e04c3fSmrg } 58501e04c3fSmrg 58601e04c3fSmrg#define ALU2_ACC(op) \ 58701e04c3fSmrg instruction * \ 58801e04c3fSmrg op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ 58901e04c3fSmrg { \ 59001e04c3fSmrg instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \ 59101e04c3fSmrg inst->writes_accumulator = true; \ 59201e04c3fSmrg return inst; \ 59301e04c3fSmrg } 59401e04c3fSmrg 59501e04c3fSmrg#define ALU3(op) \ 59601e04c3fSmrg instruction * \ 59701e04c3fSmrg op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \ 59801e04c3fSmrg const src_reg &src2) const \ 59901e04c3fSmrg { \ 60001e04c3fSmrg return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \ 60101e04c3fSmrg } 60201e04c3fSmrg 60301e04c3fSmrg ALU2(ADD) 6047ec681f3Smrg ALU3(ADD3) 60501e04c3fSmrg ALU2_ACC(ADDC) 60601e04c3fSmrg ALU2(AND) 60701e04c3fSmrg ALU2(ASR) 60801e04c3fSmrg ALU2(AVG) 60901e04c3fSmrg ALU3(BFE) 61001e04c3fSmrg ALU2(BFI1) 61101e04c3fSmrg ALU3(BFI2) 61201e04c3fSmrg ALU1(BFREV) 61301e04c3fSmrg ALU1(CBIT) 61401e04c3fSmrg ALU1(DIM) 61501e04c3fSmrg ALU2(DP2) 61601e04c3fSmrg ALU2(DP3) 61701e04c3fSmrg ALU2(DP4) 61801e04c3fSmrg ALU2(DPH) 61901e04c3fSmrg ALU1(F16TO32) 62001e04c3fSmrg ALU1(F32TO16) 62101e04c3fSmrg ALU1(FBH) 62201e04c3fSmrg ALU1(FBL) 62301e04c3fSmrg ALU1(FRC) 6247ec681f3Smrg ALU3(DP4A) 62501e04c3fSmrg ALU2(LINE) 62601e04c3fSmrg ALU1(LZD) 62701e04c3fSmrg ALU2(MAC) 62801e04c3fSmrg ALU2_ACC(MACH) 62901e04c3fSmrg ALU3(MAD) 63001e04c3fSmrg ALU1(MOV) 63101e04c3fSmrg ALU2(MUL) 63201e04c3fSmrg ALU1(NOT) 63301e04c3fSmrg ALU2(OR) 63401e04c3fSmrg ALU2(PLN) 63501e04c3fSmrg ALU1(RNDD) 63601e04c3fSmrg ALU1(RNDE) 63701e04c3fSmrg ALU1(RNDU) 63801e04c3fSmrg ALU1(RNDZ) 6397ec681f3Smrg ALU2(ROL) 6407ec681f3Smrg ALU2(ROR) 64101e04c3fSmrg ALU2(SAD2) 64201e04c3fSmrg ALU2_ACC(SADA2) 64301e04c3fSmrg ALU2(SEL) 64401e04c3fSmrg ALU2(SHL) 64501e04c3fSmrg ALU2(SHR) 64601e04c3fSmrg ALU2_ACC(SUBB) 64701e04c3fSmrg ALU2(XOR) 64801e04c3fSmrg 64901e04c3fSmrg#undef ALU3 65001e04c3fSmrg#undef ALU2_ACC 65101e04c3fSmrg#undef ALU2 65201e04c3fSmrg#undef ALU1 65301e04c3fSmrg /** @} */ 65401e04c3fSmrg 65501e04c3fSmrg /** 65601e04c3fSmrg * CMP: Sets the low bit of the destination channels with the result 65701e04c3fSmrg * of the comparison, while the upper bits are undefined, and updates 65801e04c3fSmrg * the flag register with the packed 16 bits of the result. 65901e04c3fSmrg */ 66001e04c3fSmrg instruction * 66101e04c3fSmrg CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1, 66201e04c3fSmrg brw_conditional_mod condition) const 66301e04c3fSmrg { 66401e04c3fSmrg /* Take the instruction: 66501e04c3fSmrg * 66601e04c3fSmrg * CMP null<d> src0<f> src1<f> 66701e04c3fSmrg * 6687ec681f3Smrg * Original gfx4 does type conversion to the destination type 66901e04c3fSmrg * before comparison, producing garbage results for floating 67001e04c3fSmrg * point comparisons. 67101e04c3fSmrg * 67201e04c3fSmrg * The destination type doesn't matter on newer generations, 67301e04c3fSmrg * so we set the type to match src0 so we can compact the 67401e04c3fSmrg * instruction. 67501e04c3fSmrg */ 67601e04c3fSmrg return set_condmod(condition, 67701e04c3fSmrg emit(BRW_OPCODE_CMP, retype(dst, src0.type), 67801e04c3fSmrg fix_unsigned_negate(src0), 67901e04c3fSmrg fix_unsigned_negate(src1))); 68001e04c3fSmrg } 68101e04c3fSmrg 68201e04c3fSmrg /** 6837ec681f3Smrg * CMPN: Behaves like CMP, but produces true if src1 is NaN. 6847ec681f3Smrg */ 6857ec681f3Smrg instruction * 6867ec681f3Smrg CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1, 6877ec681f3Smrg brw_conditional_mod condition) const 6887ec681f3Smrg { 6897ec681f3Smrg /* Take the instruction: 6907ec681f3Smrg * 6917ec681f3Smrg * CMP null<d> src0<f> src1<f> 6927ec681f3Smrg * 6937ec681f3Smrg * Original gfx4 does type conversion to the destination type 6947ec681f3Smrg * before comparison, producing garbage results for floating 6957ec681f3Smrg * point comparisons. 6967ec681f3Smrg * 6977ec681f3Smrg * The destination type doesn't matter on newer generations, 6987ec681f3Smrg * so we set the type to match src0 so we can compact the 6997ec681f3Smrg * instruction. 7007ec681f3Smrg */ 7017ec681f3Smrg return set_condmod(condition, 7027ec681f3Smrg emit(BRW_OPCODE_CMPN, retype(dst, src0.type), 7037ec681f3Smrg fix_unsigned_negate(src0), 7047ec681f3Smrg fix_unsigned_negate(src1))); 7057ec681f3Smrg } 7067ec681f3Smrg 7077ec681f3Smrg /** 7087ec681f3Smrg * Gfx4 predicated IF. 70901e04c3fSmrg */ 71001e04c3fSmrg instruction * 71101e04c3fSmrg IF(brw_predicate predicate) const 71201e04c3fSmrg { 71301e04c3fSmrg return set_predicate(predicate, emit(BRW_OPCODE_IF)); 71401e04c3fSmrg } 71501e04c3fSmrg 71601e04c3fSmrg /** 71701e04c3fSmrg * CSEL: dst = src2 <op> 0.0f ? src0 : src1 71801e04c3fSmrg */ 71901e04c3fSmrg instruction * 72001e04c3fSmrg CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1, 72101e04c3fSmrg const src_reg &src2, brw_conditional_mod condition) const 72201e04c3fSmrg { 72301e04c3fSmrg /* CSEL only operates on floats, so we can't do integer </<=/>=/> 72401e04c3fSmrg * comparisons. Zero/non-zero (== and !=) comparisons almost work. 72501e04c3fSmrg * 0x80000000 fails because it is -0.0, and -0.0 == 0.0. 72601e04c3fSmrg */ 72701e04c3fSmrg assert(src2.type == BRW_REGISTER_TYPE_F); 72801e04c3fSmrg 72901e04c3fSmrg return set_condmod(condition, 73001e04c3fSmrg emit(BRW_OPCODE_CSEL, 73101e04c3fSmrg retype(dst, BRW_REGISTER_TYPE_F), 73201e04c3fSmrg retype(src0, BRW_REGISTER_TYPE_F), 7337ec681f3Smrg retype(src1, BRW_REGISTER_TYPE_F), 7347ec681f3Smrg src2)); 73501e04c3fSmrg } 73601e04c3fSmrg 73701e04c3fSmrg /** 73801e04c3fSmrg * Emit a linear interpolation instruction. 73901e04c3fSmrg */ 74001e04c3fSmrg instruction * 74101e04c3fSmrg LRP(const dst_reg &dst, const src_reg &x, const src_reg &y, 74201e04c3fSmrg const src_reg &a) const 74301e04c3fSmrg { 7447ec681f3Smrg if (shader->devinfo->ver >= 6 && shader->devinfo->ver <= 10) { 74501e04c3fSmrg /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so 74601e04c3fSmrg * we need to reorder the operands. 74701e04c3fSmrg */ 74801e04c3fSmrg return emit(BRW_OPCODE_LRP, dst, a, y, x); 74901e04c3fSmrg 75001e04c3fSmrg } else { 75101e04c3fSmrg /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */ 75201e04c3fSmrg const dst_reg y_times_a = vgrf(dst.type); 75301e04c3fSmrg const dst_reg one_minus_a = vgrf(dst.type); 75401e04c3fSmrg const dst_reg x_times_one_minus_a = vgrf(dst.type); 75501e04c3fSmrg 75601e04c3fSmrg MUL(y_times_a, y, a); 75701e04c3fSmrg ADD(one_minus_a, negate(a), brw_imm_f(1.0f)); 75801e04c3fSmrg MUL(x_times_one_minus_a, x, src_reg(one_minus_a)); 75901e04c3fSmrg return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)); 76001e04c3fSmrg } 76101e04c3fSmrg } 76201e04c3fSmrg 76301e04c3fSmrg /** 76401e04c3fSmrg * Collect a number of registers in a contiguous range of registers. 76501e04c3fSmrg */ 76601e04c3fSmrg instruction * 76701e04c3fSmrg LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src, 76801e04c3fSmrg unsigned sources, unsigned header_size) const 76901e04c3fSmrg { 77001e04c3fSmrg instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources); 77101e04c3fSmrg inst->header_size = header_size; 77201e04c3fSmrg inst->size_written = header_size * REG_SIZE; 77301e04c3fSmrg for (unsigned i = header_size; i < sources; i++) { 77401e04c3fSmrg inst->size_written += 77501e04c3fSmrg ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride, 77601e04c3fSmrg REG_SIZE); 77701e04c3fSmrg } 77801e04c3fSmrg 77901e04c3fSmrg return inst; 78001e04c3fSmrg } 78101e04c3fSmrg 7827ec681f3Smrg instruction * 7837ec681f3Smrg UNDEF(const dst_reg &dst) const 7849f464c52Smaya { 7857ec681f3Smrg assert(dst.file == VGRF); 7867ec681f3Smrg instruction *inst = emit(SHADER_OPCODE_UNDEF, 7877ec681f3Smrg retype(dst, BRW_REGISTER_TYPE_UD)); 7887ec681f3Smrg inst->size_written = shader->alloc.sizes[dst.nr] * REG_SIZE; 7899f464c52Smaya 7907ec681f3Smrg return inst; 7919f464c52Smaya } 7929f464c52Smaya 7937ec681f3Smrg backend_shader *shader; 7947ec681f3Smrg 79501e04c3fSmrg private: 79601e04c3fSmrg /** 79701e04c3fSmrg * Workaround for negation of UD registers. See comment in 79801e04c3fSmrg * fs_generator::generate_code() for more details. 79901e04c3fSmrg */ 80001e04c3fSmrg src_reg 80101e04c3fSmrg fix_unsigned_negate(const src_reg &src) const 80201e04c3fSmrg { 80301e04c3fSmrg if (src.type == BRW_REGISTER_TYPE_UD && 80401e04c3fSmrg src.negate) { 80501e04c3fSmrg dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD); 80601e04c3fSmrg MOV(temp, src); 80701e04c3fSmrg return src_reg(temp); 80801e04c3fSmrg } else { 80901e04c3fSmrg return src; 81001e04c3fSmrg } 81101e04c3fSmrg } 81201e04c3fSmrg 81301e04c3fSmrg /** 81401e04c3fSmrg * Workaround for source register modes not supported by the ternary 81501e04c3fSmrg * instruction encoding. 81601e04c3fSmrg */ 81701e04c3fSmrg src_reg 81801e04c3fSmrg fix_3src_operand(const src_reg &src) const 81901e04c3fSmrg { 8209f464c52Smaya switch (src.file) { 8219f464c52Smaya case FIXED_GRF: 8229f464c52Smaya /* FINISHME: Could handle scalar region, other stride=1 regions */ 8239f464c52Smaya if (src.vstride != BRW_VERTICAL_STRIDE_8 || 8249f464c52Smaya src.width != BRW_WIDTH_8 || 8259f464c52Smaya src.hstride != BRW_HORIZONTAL_STRIDE_1) 8269f464c52Smaya break; 8277ec681f3Smrg FALLTHROUGH; 8289f464c52Smaya case ATTR: 8299f464c52Smaya case VGRF: 8309f464c52Smaya case UNIFORM: 8319f464c52Smaya case IMM: 83201e04c3fSmrg return src; 8339f464c52Smaya default: 8349f464c52Smaya break; 83501e04c3fSmrg } 8369f464c52Smaya 8379f464c52Smaya dst_reg expanded = vgrf(src.type); 8389f464c52Smaya MOV(expanded, src); 8399f464c52Smaya return expanded; 84001e04c3fSmrg } 84101e04c3fSmrg 84201e04c3fSmrg /** 84301e04c3fSmrg * Workaround for source register modes not supported by the math 84401e04c3fSmrg * instruction. 84501e04c3fSmrg */ 84601e04c3fSmrg src_reg 84701e04c3fSmrg fix_math_operand(const src_reg &src) const 84801e04c3fSmrg { 8497ec681f3Smrg /* Can't do hstride == 0 args on gfx6 math, so expand it out. We 85001e04c3fSmrg * might be able to do better by doing execsize = 1 math and then 85101e04c3fSmrg * expanding that result out, but we would need to be careful with 85201e04c3fSmrg * masking. 85301e04c3fSmrg * 8547ec681f3Smrg * Gfx6 hardware ignores source modifiers (negate and abs) on math 85501e04c3fSmrg * instructions, so we also move to a temp to set those up. 85601e04c3fSmrg * 8577ec681f3Smrg * Gfx7 relaxes most of the above restrictions, but still can't use IMM 85801e04c3fSmrg * operands to math 85901e04c3fSmrg */ 8607ec681f3Smrg if ((shader->devinfo->ver == 6 && 86101e04c3fSmrg (src.file == IMM || src.file == UNIFORM || 86201e04c3fSmrg src.abs || src.negate)) || 8637ec681f3Smrg (shader->devinfo->ver == 7 && src.file == IMM)) { 86401e04c3fSmrg const dst_reg tmp = vgrf(src.type); 86501e04c3fSmrg MOV(tmp, src); 86601e04c3fSmrg return tmp; 86701e04c3fSmrg } else { 86801e04c3fSmrg return src; 86901e04c3fSmrg } 87001e04c3fSmrg } 87101e04c3fSmrg 87201e04c3fSmrg bblock_t *block; 87301e04c3fSmrg exec_node *cursor; 87401e04c3fSmrg 87501e04c3fSmrg unsigned _dispatch_width; 87601e04c3fSmrg unsigned _group; 87701e04c3fSmrg bool force_writemask_all; 87801e04c3fSmrg 87901e04c3fSmrg /** Debug annotation info. */ 88001e04c3fSmrg struct { 88101e04c3fSmrg const char *str; 88201e04c3fSmrg const void *ir; 88301e04c3fSmrg } annotation; 88401e04c3fSmrg }; 88501e04c3fSmrg} 88601e04c3fSmrg 88701e04c3fSmrg#endif 888