/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    *
    * Builders are cheap value objects: most methods return a modified copy
    * rather than mutating *this, so derived builders (e.g. exec_all(),
    * group()) can be created freely at emission sites.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       *
       * The builder starts with no insertion point (block/cursor are NULL);
       * use at() or at_end() to obtain an insertable builder.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         /* block is NULL here, which emit(instruction *) interprets as
          * "insert directly on the shader's instruction list".
          */
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       *
       * With \p n == 0 no register is allocated and a null register of the
       * requested type is returned instead.
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of double-float type.
       */
      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            /* Non-FS stages have no discard: all channels are enabled. */
            return brw_imm_d(0xffffffff);
         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
            /* When discard is in use the live mask is maintained in f0.1. */
            return brw_flag_reg(0, 1);
         } else {
            assert(shader->devinfo->gen >= 6 && dispatch_width() <= 16);
            /* Dispatch mask lives in the payload; the second half of a
             * SIMD32 program reads it from g2 rather than g1.
             */
            return retype(brw_vec1_grf((_group >= 16 ? 2 : 1), 7),
                          BRW_REGISTER_TYPE_UD);
         }
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         /* Math instructions have stricter operand restrictions on some
          * generations, so legalize their source first.
          */
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(fix_byte_src(src1))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, fix_byte_src(src1)));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         /* True hardware 3-src instructions only support a restricted set of
          * source regioning modes; legalize the sources for those opcodes.
          */
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(fix_byte_src(src1)),
                                    fix_3src_operand(fix_byte_src(src2))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, fix_byte_src(src1), fix_byte_src(src2)));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       *
       * Stamps the builder's execution controls and debug annotation onto
       * \p inst before inserting it at the current cursor position.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as operand for src1, so use the
          * same type for both operands.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(fix_byte_src(src0)),
                                     fix_unsigned_negate(fix_byte_src(src1))));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         /* NOTE(review): flag_subreg 2 presumably steers FIND_LIVE_CHANNEL
          * away from the flag subregisters used for predication/discard —
          * confirm against the generator.
          */
         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)->flag_subreg = 2;
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }

      /**
       * Copy \p num_components dispatch_width-wide components of \p src into
       * a freshly allocated contiguous VGRF via LOAD_PAYLOAD and return it.
       */
      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      /**
       * Combine neighbouring channels of \p tmp in place using \p opcode
       * with conditional mod \p mod, within clusters of \p cluster_size
       * channels.  NOTE(review): the log-step strided combines (stride 2,
       * then 4, then 8) implement an in-register inclusive scan — confirm
       * the exact scan semantics against the callers in the FS backend.
       */
      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            /* Recurse on each half, then merge the last element of the left
             * half into the right half when clusters span the boundary.
             */
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               src_reg left_comp = component(left, half_width - 1);
               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
            }
            return;
         }

         if (cluster_size > 1) {
            /* Combine each even channel into its odd neighbour. */
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            const dst_reg left = horiz_stride(tmp, 2);
            const dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               /* Propagate channel 1 of each 4-group into channels 2 and 3. */
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);

               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               right = horiz_stride(horiz_offset(tmp, 3), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);

               for (unsigned i = 0; i < dispatch_width(); i += 4) {
                  src_reg left = component(tmp, i + 1);
                  dst_reg right = horiz_offset(tmp, i + 2);
                  set_condmod(mod, ubld.emit(opcode, right, left, right));
               }
            }
         }

         if (cluster_size > 4) {
            /* Fold channel 3 of each 8-group into channels 4..7. */
            const fs_builder ubld = exec_all().group(4, 0);
            src_reg left = component(tmp, 3);
            dst_reg right = horiz_offset(tmp, 4);
            set_condmod(mod, ubld.emit(opcode, right, left, right));

            if (dispatch_width() > 8) {
               left = component(tmp, 8 + 3);
               right = horiz_offset(tmp, 8 + 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }
         }

         if (cluster_size > 8 && dispatch_width() > 8) {
            /* Fold channel 7 into channels 8..15. */
            const fs_builder ubld = exec_all().group(8, 0);
            src_reg left = component(tmp, 7);
            dst_reg right = horiz_offset(tmp, 8);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(fix_byte_src(src1), BRW_REGISTER_TYPE_F),
                                 fix_byte_src(src2)));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }

      backend_shader *shader;

      /**
       * Byte sized operands are not supported for src1 on Gen11+.
       *
       * Copies a byte-typed source through a word-sized temporary of
       * matching signedness so it is legal as src1; other sources are
       * returned unchanged.
       */
      src_reg
      fix_byte_src(const src_reg &src) const
      {
         if ((shader->devinfo->gen < 11 && !shader->devinfo->is_geminilake) ||
             type_sz(src.type) != 1)
            return src;

         dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ?
                             BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D);
         MOV(temp, src);
         return src_reg(temp);
      }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            /* fallthrough */
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         /* Unsupported regioning: copy through a temporary VGRF. */
         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      /** Insertion point: basic block (NULL means raw shader list). */
      bblock_t *block;
      /** Instructions are inserted before this node. */
      exec_node *cursor;

      /** Default SIMD width for emitted instructions. */
      unsigned _dispatch_width;
      /** Default channel-enable group offset for emitted instructions. */
      unsigned _group;
      /** If true, emit instructions with execution masking disabled. */
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif