1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2010 Intel Corporation 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21b8e80941Smrg * IN THE SOFTWARE. 22b8e80941Smrg */ 23b8e80941Smrg 24b8e80941Smrg/** @file brw_fs.cpp 25b8e80941Smrg * 26b8e80941Smrg * This file drives the GLSL IR -> LIR translation, contains the 27b8e80941Smrg * optimizations on the LIR, and drives the generation of native code 28b8e80941Smrg * from the LIR. 
29b8e80941Smrg */ 30b8e80941Smrg 31b8e80941Smrg#include "main/macros.h" 32b8e80941Smrg#include "brw_eu.h" 33b8e80941Smrg#include "brw_fs.h" 34b8e80941Smrg#include "brw_nir.h" 35b8e80941Smrg#include "brw_vec4_gs_visitor.h" 36b8e80941Smrg#include "brw_cfg.h" 37b8e80941Smrg#include "brw_dead_control_flow.h" 38b8e80941Smrg#include "dev/gen_debug.h" 39b8e80941Smrg#include "compiler/glsl_types.h" 40b8e80941Smrg#include "compiler/nir/nir_builder.h" 41b8e80941Smrg#include "program/prog_parameter.h" 42b8e80941Smrg#include "util/u_math.h" 43b8e80941Smrg 44b8e80941Smrgusing namespace brw; 45b8e80941Smrg 46b8e80941Smrgstatic unsigned get_lowered_simd_width(const struct gen_device_info *devinfo, 47b8e80941Smrg const fs_inst *inst); 48b8e80941Smrg 49b8e80941Smrgvoid 50b8e80941Smrgfs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, 51b8e80941Smrg const fs_reg *src, unsigned sources) 52b8e80941Smrg{ 53b8e80941Smrg memset((void*)this, 0, sizeof(*this)); 54b8e80941Smrg 55b8e80941Smrg this->src = new fs_reg[MAX2(sources, 3)]; 56b8e80941Smrg for (unsigned i = 0; i < sources; i++) 57b8e80941Smrg this->src[i] = src[i]; 58b8e80941Smrg 59b8e80941Smrg this->opcode = opcode; 60b8e80941Smrg this->dst = dst; 61b8e80941Smrg this->sources = sources; 62b8e80941Smrg this->exec_size = exec_size; 63b8e80941Smrg this->base_mrf = -1; 64b8e80941Smrg 65b8e80941Smrg assert(dst.file != IMM && dst.file != UNIFORM); 66b8e80941Smrg 67b8e80941Smrg assert(this->exec_size != 0); 68b8e80941Smrg 69b8e80941Smrg this->conditional_mod = BRW_CONDITIONAL_NONE; 70b8e80941Smrg 71b8e80941Smrg /* This will be the case for almost all instructions. 
*/ 72b8e80941Smrg switch (dst.file) { 73b8e80941Smrg case VGRF: 74b8e80941Smrg case ARF: 75b8e80941Smrg case FIXED_GRF: 76b8e80941Smrg case MRF: 77b8e80941Smrg case ATTR: 78b8e80941Smrg this->size_written = dst.component_size(exec_size); 79b8e80941Smrg break; 80b8e80941Smrg case BAD_FILE: 81b8e80941Smrg this->size_written = 0; 82b8e80941Smrg break; 83b8e80941Smrg case IMM: 84b8e80941Smrg case UNIFORM: 85b8e80941Smrg unreachable("Invalid destination register file"); 86b8e80941Smrg } 87b8e80941Smrg 88b8e80941Smrg this->writes_accumulator = false; 89b8e80941Smrg} 90b8e80941Smrg 91b8e80941Smrgfs_inst::fs_inst() 92b8e80941Smrg{ 93b8e80941Smrg init(BRW_OPCODE_NOP, 8, dst, NULL, 0); 94b8e80941Smrg} 95b8e80941Smrg 96b8e80941Smrgfs_inst::fs_inst(enum opcode opcode, uint8_t exec_size) 97b8e80941Smrg{ 98b8e80941Smrg init(opcode, exec_size, reg_undef, NULL, 0); 99b8e80941Smrg} 100b8e80941Smrg 101b8e80941Smrgfs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst) 102b8e80941Smrg{ 103b8e80941Smrg init(opcode, exec_size, dst, NULL, 0); 104b8e80941Smrg} 105b8e80941Smrg 106b8e80941Smrgfs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, 107b8e80941Smrg const fs_reg &src0) 108b8e80941Smrg{ 109b8e80941Smrg const fs_reg src[1] = { src0 }; 110b8e80941Smrg init(opcode, exec_size, dst, src, 1); 111b8e80941Smrg} 112b8e80941Smrg 113b8e80941Smrgfs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, 114b8e80941Smrg const fs_reg &src0, const fs_reg &src1) 115b8e80941Smrg{ 116b8e80941Smrg const fs_reg src[2] = { src0, src1 }; 117b8e80941Smrg init(opcode, exec_size, dst, src, 2); 118b8e80941Smrg} 119b8e80941Smrg 120b8e80941Smrgfs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, 121b8e80941Smrg const fs_reg &src0, const fs_reg &src1, const fs_reg &src2) 122b8e80941Smrg{ 123b8e80941Smrg const fs_reg src[3] = { src0, src1, src2 }; 124b8e80941Smrg init(opcode, exec_size, dst, src, 3); 125b8e80941Smrg} 
126b8e80941Smrg 127b8e80941Smrgfs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst, 128b8e80941Smrg const fs_reg src[], unsigned sources) 129b8e80941Smrg{ 130b8e80941Smrg init(opcode, exec_width, dst, src, sources); 131b8e80941Smrg} 132b8e80941Smrg 133b8e80941Smrgfs_inst::fs_inst(const fs_inst &that) 134b8e80941Smrg{ 135b8e80941Smrg memcpy((void*)this, &that, sizeof(that)); 136b8e80941Smrg 137b8e80941Smrg this->src = new fs_reg[MAX2(that.sources, 3)]; 138b8e80941Smrg 139b8e80941Smrg for (unsigned i = 0; i < that.sources; i++) 140b8e80941Smrg this->src[i] = that.src[i]; 141b8e80941Smrg} 142b8e80941Smrg 143b8e80941Smrgfs_inst::~fs_inst() 144b8e80941Smrg{ 145b8e80941Smrg delete[] this->src; 146b8e80941Smrg} 147b8e80941Smrg 148b8e80941Smrgvoid 149b8e80941Smrgfs_inst::resize_sources(uint8_t num_sources) 150b8e80941Smrg{ 151b8e80941Smrg if (this->sources != num_sources) { 152b8e80941Smrg fs_reg *src = new fs_reg[MAX2(num_sources, 3)]; 153b8e80941Smrg 154b8e80941Smrg for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i) 155b8e80941Smrg src[i] = this->src[i]; 156b8e80941Smrg 157b8e80941Smrg delete[] this->src; 158b8e80941Smrg this->src = src; 159b8e80941Smrg this->sources = num_sources; 160b8e80941Smrg } 161b8e80941Smrg} 162b8e80941Smrg 163b8e80941Smrgvoid 164b8e80941Smrgfs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld, 165b8e80941Smrg const fs_reg &dst, 166b8e80941Smrg const fs_reg &surf_index, 167b8e80941Smrg const fs_reg &varying_offset, 168b8e80941Smrg uint32_t const_offset) 169b8e80941Smrg{ 170b8e80941Smrg /* We have our constant surface use a pitch of 4 bytes, so our index can 171b8e80941Smrg * be any component of a vector, and then we load 4 contiguous 172b8e80941Smrg * components starting from that. 
173b8e80941Smrg * 174b8e80941Smrg * We break down the const_offset to a portion added to the variable offset 175b8e80941Smrg * and a portion done using fs_reg::offset, which means that if you have 176b8e80941Smrg * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]", 177b8e80941Smrg * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can 178b8e80941Smrg * later notice that those loads are all the same and eliminate the 179b8e80941Smrg * redundant ones. 180b8e80941Smrg */ 181b8e80941Smrg fs_reg vec4_offset = vgrf(glsl_type::uint_type); 182b8e80941Smrg bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf)); 183b8e80941Smrg 184b8e80941Smrg /* The pull load message will load a vec4 (16 bytes). If we are loading 185b8e80941Smrg * a double this means we are only loading 2 elements worth of data. 186b8e80941Smrg * We also want to use a 32-bit data type for the dst of the load operation 187b8e80941Smrg * so other parts of the driver don't get confused about the size of the 188b8e80941Smrg * result. 189b8e80941Smrg */ 190b8e80941Smrg fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4); 191b8e80941Smrg fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL, 192b8e80941Smrg vec4_result, surf_index, vec4_offset); 193b8e80941Smrg inst->size_written = 4 * vec4_result.component_size(inst->exec_size); 194b8e80941Smrg 195b8e80941Smrg shuffle_from_32bit_read(bld, dst, vec4_result, 196b8e80941Smrg (const_offset & 0xf) / type_sz(dst.type), 1); 197b8e80941Smrg} 198b8e80941Smrg 199b8e80941Smrg/** 200b8e80941Smrg * A helper for MOV generation for fixing up broken hardware SEND dependency 201b8e80941Smrg * handling. 202b8e80941Smrg */ 203b8e80941Smrgvoid 204b8e80941Smrgfs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf) 205b8e80941Smrg{ 206b8e80941Smrg /* The caller always wants uncompressed to emit the minimal extra 207b8e80941Smrg * dependencies, and to avoid having to deal with aligning its regs to 2. 
208b8e80941Smrg */ 209b8e80941Smrg const fs_builder ubld = bld.annotate("send dependency resolve") 210b8e80941Smrg .half(0); 211b8e80941Smrg 212b8e80941Smrg ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F)); 213b8e80941Smrg} 214b8e80941Smrg 215b8e80941Smrgbool 216b8e80941Smrgfs_inst::is_send_from_grf() const 217b8e80941Smrg{ 218b8e80941Smrg switch (opcode) { 219b8e80941Smrg case SHADER_OPCODE_SEND: 220b8e80941Smrg case SHADER_OPCODE_SHADER_TIME_ADD: 221b8e80941Smrg case FS_OPCODE_INTERPOLATE_AT_SAMPLE: 222b8e80941Smrg case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: 223b8e80941Smrg case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: 224b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8: 225b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: 226b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: 227b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: 228b8e80941Smrg case SHADER_OPCODE_URB_READ_SIMD8: 229b8e80941Smrg case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: 230b8e80941Smrg return true; 231b8e80941Smrg case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 232b8e80941Smrg return src[1].file == VGRF; 233b8e80941Smrg case FS_OPCODE_FB_WRITE: 234b8e80941Smrg case FS_OPCODE_FB_READ: 235b8e80941Smrg return src[0].file == VGRF; 236b8e80941Smrg default: 237b8e80941Smrg if (is_tex()) 238b8e80941Smrg return src[0].file == VGRF; 239b8e80941Smrg 240b8e80941Smrg return false; 241b8e80941Smrg } 242b8e80941Smrg} 243b8e80941Smrg 244b8e80941Smrgbool 245b8e80941Smrgfs_inst::is_control_source(unsigned arg) const 246b8e80941Smrg{ 247b8e80941Smrg switch (opcode) { 248b8e80941Smrg case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 249b8e80941Smrg case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: 250b8e80941Smrg case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: 251b8e80941Smrg return arg == 0; 252b8e80941Smrg 253b8e80941Smrg case SHADER_OPCODE_BROADCAST: 254b8e80941Smrg case SHADER_OPCODE_SHUFFLE: 255b8e80941Smrg case SHADER_OPCODE_QUAD_SWIZZLE: 256b8e80941Smrg case 
FS_OPCODE_INTERPOLATE_AT_SAMPLE: 257b8e80941Smrg case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: 258b8e80941Smrg case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: 259b8e80941Smrg case SHADER_OPCODE_GET_BUFFER_SIZE: 260b8e80941Smrg return arg == 1; 261b8e80941Smrg 262b8e80941Smrg case SHADER_OPCODE_MOV_INDIRECT: 263b8e80941Smrg case SHADER_OPCODE_CLUSTER_BROADCAST: 264b8e80941Smrg case SHADER_OPCODE_TEX: 265b8e80941Smrg case FS_OPCODE_TXB: 266b8e80941Smrg case SHADER_OPCODE_TXD: 267b8e80941Smrg case SHADER_OPCODE_TXF: 268b8e80941Smrg case SHADER_OPCODE_TXF_LZ: 269b8e80941Smrg case SHADER_OPCODE_TXF_CMS: 270b8e80941Smrg case SHADER_OPCODE_TXF_CMS_W: 271b8e80941Smrg case SHADER_OPCODE_TXF_UMS: 272b8e80941Smrg case SHADER_OPCODE_TXF_MCS: 273b8e80941Smrg case SHADER_OPCODE_TXL: 274b8e80941Smrg case SHADER_OPCODE_TXL_LZ: 275b8e80941Smrg case SHADER_OPCODE_TXS: 276b8e80941Smrg case SHADER_OPCODE_LOD: 277b8e80941Smrg case SHADER_OPCODE_TG4: 278b8e80941Smrg case SHADER_OPCODE_TG4_OFFSET: 279b8e80941Smrg case SHADER_OPCODE_SAMPLEINFO: 280b8e80941Smrg return arg == 1 || arg == 2; 281b8e80941Smrg 282b8e80941Smrg case SHADER_OPCODE_SEND: 283b8e80941Smrg return arg == 0 || arg == 1; 284b8e80941Smrg 285b8e80941Smrg default: 286b8e80941Smrg return false; 287b8e80941Smrg } 288b8e80941Smrg} 289b8e80941Smrg 290b8e80941Smrg/** 291b8e80941Smrg * Returns true if this instruction's sources and destinations cannot 292b8e80941Smrg * safely be the same register. 293b8e80941Smrg * 294b8e80941Smrg * In most cases, a register can be written over safely by the same 295b8e80941Smrg * instruction that is its last use. For a single instruction, the 296b8e80941Smrg * sources are dereferenced before writing of the destination starts 297b8e80941Smrg * (naturally). 
298b8e80941Smrg * 299b8e80941Smrg * However, there are a few cases where this can be problematic: 300b8e80941Smrg * 301b8e80941Smrg * - Virtual opcodes that translate to multiple instructions in the 302b8e80941Smrg * code generator: if src == dst and one instruction writes the 303b8e80941Smrg * destination before a later instruction reads the source, then 304b8e80941Smrg * src will have been clobbered. 305b8e80941Smrg * 306b8e80941Smrg * - SIMD16 compressed instructions with certain regioning (see below). 307b8e80941Smrg * 308b8e80941Smrg * The register allocator uses this information to set up conflicts between 309b8e80941Smrg * GRF sources and the destination. 310b8e80941Smrg */ 311b8e80941Smrgbool 312b8e80941Smrgfs_inst::has_source_and_destination_hazard() const 313b8e80941Smrg{ 314b8e80941Smrg switch (opcode) { 315b8e80941Smrg case FS_OPCODE_PACK_HALF_2x16_SPLIT: 316b8e80941Smrg /* Multiple partial writes to the destination */ 317b8e80941Smrg return true; 318b8e80941Smrg case SHADER_OPCODE_SHUFFLE: 319b8e80941Smrg /* This instruction returns an arbitrary channel from the source and 320b8e80941Smrg * gets split into smaller instructions in the generator. It's possible 321b8e80941Smrg * that one of the instructions will read from a channel corresponding 322b8e80941Smrg * to an earlier instruction. 323b8e80941Smrg */ 324b8e80941Smrg case SHADER_OPCODE_SEL_EXEC: 325b8e80941Smrg /* This is implemented as 326b8e80941Smrg * 327b8e80941Smrg * mov(16) g4<1>D 0D { align1 WE_all 1H }; 328b8e80941Smrg * mov(16) g4<1>D g5<8,8,1>D { align1 1H } 329b8e80941Smrg * 330b8e80941Smrg * Because the source is only read in the second instruction, the first 331b8e80941Smrg * may stomp all over it. 
332b8e80941Smrg */ 333b8e80941Smrg return true; 334b8e80941Smrg case SHADER_OPCODE_QUAD_SWIZZLE: 335b8e80941Smrg switch (src[1].ud) { 336b8e80941Smrg case BRW_SWIZZLE_XXXX: 337b8e80941Smrg case BRW_SWIZZLE_YYYY: 338b8e80941Smrg case BRW_SWIZZLE_ZZZZ: 339b8e80941Smrg case BRW_SWIZZLE_WWWW: 340b8e80941Smrg case BRW_SWIZZLE_XXZZ: 341b8e80941Smrg case BRW_SWIZZLE_YYWW: 342b8e80941Smrg case BRW_SWIZZLE_XYXY: 343b8e80941Smrg case BRW_SWIZZLE_ZWZW: 344b8e80941Smrg /* These can be implemented as a single Align1 region on all 345b8e80941Smrg * platforms, so there's never a hazard between source and 346b8e80941Smrg * destination. C.f. fs_generator::generate_quad_swizzle(). 347b8e80941Smrg */ 348b8e80941Smrg return false; 349b8e80941Smrg default: 350b8e80941Smrg return !is_uniform(src[0]); 351b8e80941Smrg } 352b8e80941Smrg default: 353b8e80941Smrg /* The SIMD16 compressed instruction 354b8e80941Smrg * 355b8e80941Smrg * add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F 356b8e80941Smrg * 357b8e80941Smrg * is actually decoded in hardware as: 358b8e80941Smrg * 359b8e80941Smrg * add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F 360b8e80941Smrg * add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F 361b8e80941Smrg * 362b8e80941Smrg * Which is safe. However, if we have uniform accesses 363b8e80941Smrg * happening, we get into trouble: 364b8e80941Smrg * 365b8e80941Smrg * add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F 366b8e80941Smrg * add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F 367b8e80941Smrg * 368b8e80941Smrg * Now our destination for the first instruction overwrote the 369b8e80941Smrg * second instruction's src0, and we get garbage for those 8 370b8e80941Smrg * pixels. There's a similar issue for the pre-gen6 371b8e80941Smrg * pixel_x/pixel_y, which are registers of 16-bit values and thus 372b8e80941Smrg * would get stomped by the first decode as well. 
373b8e80941Smrg */ 374b8e80941Smrg if (exec_size == 16) { 375b8e80941Smrg for (int i = 0; i < sources; i++) { 376b8e80941Smrg if (src[i].file == VGRF && (src[i].stride == 0 || 377b8e80941Smrg src[i].type == BRW_REGISTER_TYPE_UW || 378b8e80941Smrg src[i].type == BRW_REGISTER_TYPE_W || 379b8e80941Smrg src[i].type == BRW_REGISTER_TYPE_UB || 380b8e80941Smrg src[i].type == BRW_REGISTER_TYPE_B)) { 381b8e80941Smrg return true; 382b8e80941Smrg } 383b8e80941Smrg } 384b8e80941Smrg } 385b8e80941Smrg return false; 386b8e80941Smrg } 387b8e80941Smrg} 388b8e80941Smrg 389b8e80941Smrgbool 390b8e80941Smrgfs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const 391b8e80941Smrg{ 392b8e80941Smrg if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD) 393b8e80941Smrg return false; 394b8e80941Smrg 395b8e80941Smrg fs_reg reg = this->src[0]; 396b8e80941Smrg if (reg.file != VGRF || reg.offset != 0 || reg.stride != 1) 397b8e80941Smrg return false; 398b8e80941Smrg 399b8e80941Smrg if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written) 400b8e80941Smrg return false; 401b8e80941Smrg 402b8e80941Smrg for (int i = 0; i < this->sources; i++) { 403b8e80941Smrg reg.type = this->src[i].type; 404b8e80941Smrg if (!this->src[i].equals(reg)) 405b8e80941Smrg return false; 406b8e80941Smrg 407b8e80941Smrg if (i < this->header_size) { 408b8e80941Smrg reg.offset += REG_SIZE; 409b8e80941Smrg } else { 410b8e80941Smrg reg = horiz_offset(reg, this->exec_size); 411b8e80941Smrg } 412b8e80941Smrg } 413b8e80941Smrg 414b8e80941Smrg return true; 415b8e80941Smrg} 416b8e80941Smrg 417b8e80941Smrgbool 418b8e80941Smrgfs_inst::can_do_source_mods(const struct gen_device_info *devinfo) const 419b8e80941Smrg{ 420b8e80941Smrg if (devinfo->gen == 6 && is_math()) 421b8e80941Smrg return false; 422b8e80941Smrg 423b8e80941Smrg if (is_send_from_grf()) 424b8e80941Smrg return false; 425b8e80941Smrg 426b8e80941Smrg if (!backend_instruction::can_do_source_mods()) 427b8e80941Smrg return false; 428b8e80941Smrg 429b8e80941Smrg 
return true; 430b8e80941Smrg} 431b8e80941Smrg 432b8e80941Smrgbool 433b8e80941Smrgfs_inst::can_do_cmod() 434b8e80941Smrg{ 435b8e80941Smrg if (!backend_instruction::can_do_cmod()) 436b8e80941Smrg return false; 437b8e80941Smrg 438b8e80941Smrg /* The accumulator result appears to get used for the conditional modifier 439b8e80941Smrg * generation. When negating a UD value, there is a 33rd bit generated for 440b8e80941Smrg * the sign in the accumulator value, so now you can't check, for example, 441b8e80941Smrg * equality with a 32-bit value. See piglit fs-op-neg-uvec4. 442b8e80941Smrg */ 443b8e80941Smrg for (unsigned i = 0; i < sources; i++) { 444b8e80941Smrg if (type_is_unsigned_int(src[i].type) && src[i].negate) 445b8e80941Smrg return false; 446b8e80941Smrg } 447b8e80941Smrg 448b8e80941Smrg return true; 449b8e80941Smrg} 450b8e80941Smrg 451b8e80941Smrgbool 452b8e80941Smrgfs_inst::can_change_types() const 453b8e80941Smrg{ 454b8e80941Smrg return dst.type == src[0].type && 455b8e80941Smrg !src[0].abs && !src[0].negate && !saturate && 456b8e80941Smrg (opcode == BRW_OPCODE_MOV || 457b8e80941Smrg (opcode == BRW_OPCODE_SEL && 458b8e80941Smrg dst.type == src[1].type && 459b8e80941Smrg predicate != BRW_PREDICATE_NONE && 460b8e80941Smrg !src[1].abs && !src[1].negate)); 461b8e80941Smrg} 462b8e80941Smrg 463b8e80941Smrgvoid 464b8e80941Smrgfs_reg::init() 465b8e80941Smrg{ 466b8e80941Smrg memset((void*)this, 0, sizeof(*this)); 467b8e80941Smrg type = BRW_REGISTER_TYPE_UD; 468b8e80941Smrg stride = 1; 469b8e80941Smrg} 470b8e80941Smrg 471b8e80941Smrg/** Generic unset register constructor. 
*/ 472b8e80941Smrgfs_reg::fs_reg() 473b8e80941Smrg{ 474b8e80941Smrg init(); 475b8e80941Smrg this->file = BAD_FILE; 476b8e80941Smrg} 477b8e80941Smrg 478b8e80941Smrgfs_reg::fs_reg(struct ::brw_reg reg) : 479b8e80941Smrg backend_reg(reg) 480b8e80941Smrg{ 481b8e80941Smrg this->offset = 0; 482b8e80941Smrg this->stride = 1; 483b8e80941Smrg if (this->file == IMM && 484b8e80941Smrg (this->type != BRW_REGISTER_TYPE_V && 485b8e80941Smrg this->type != BRW_REGISTER_TYPE_UV && 486b8e80941Smrg this->type != BRW_REGISTER_TYPE_VF)) { 487b8e80941Smrg this->stride = 0; 488b8e80941Smrg } 489b8e80941Smrg} 490b8e80941Smrg 491b8e80941Smrgbool 492b8e80941Smrgfs_reg::equals(const fs_reg &r) const 493b8e80941Smrg{ 494b8e80941Smrg return (this->backend_reg::equals(r) && 495b8e80941Smrg stride == r.stride); 496b8e80941Smrg} 497b8e80941Smrg 498b8e80941Smrgbool 499b8e80941Smrgfs_reg::negative_equals(const fs_reg &r) const 500b8e80941Smrg{ 501b8e80941Smrg return (this->backend_reg::negative_equals(r) && 502b8e80941Smrg stride == r.stride); 503b8e80941Smrg} 504b8e80941Smrg 505b8e80941Smrgbool 506b8e80941Smrgfs_reg::is_contiguous() const 507b8e80941Smrg{ 508b8e80941Smrg return stride == 1; 509b8e80941Smrg} 510b8e80941Smrg 511b8e80941Smrgunsigned 512b8e80941Smrgfs_reg::component_size(unsigned width) const 513b8e80941Smrg{ 514b8e80941Smrg const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride : 515b8e80941Smrg hstride == 0 ? 
0 : 516b8e80941Smrg 1 << (hstride - 1)); 517b8e80941Smrg return MAX2(width * stride, 1) * type_sz(type); 518b8e80941Smrg} 519b8e80941Smrg 520b8e80941Smrgextern "C" int 521b8e80941Smrgtype_size_scalar(const struct glsl_type *type, bool bindless) 522b8e80941Smrg{ 523b8e80941Smrg unsigned int size, i; 524b8e80941Smrg 525b8e80941Smrg switch (type->base_type) { 526b8e80941Smrg case GLSL_TYPE_UINT: 527b8e80941Smrg case GLSL_TYPE_INT: 528b8e80941Smrg case GLSL_TYPE_FLOAT: 529b8e80941Smrg case GLSL_TYPE_BOOL: 530b8e80941Smrg return type->components(); 531b8e80941Smrg case GLSL_TYPE_UINT16: 532b8e80941Smrg case GLSL_TYPE_INT16: 533b8e80941Smrg case GLSL_TYPE_FLOAT16: 534b8e80941Smrg return DIV_ROUND_UP(type->components(), 2); 535b8e80941Smrg case GLSL_TYPE_UINT8: 536b8e80941Smrg case GLSL_TYPE_INT8: 537b8e80941Smrg return DIV_ROUND_UP(type->components(), 4); 538b8e80941Smrg case GLSL_TYPE_DOUBLE: 539b8e80941Smrg case GLSL_TYPE_UINT64: 540b8e80941Smrg case GLSL_TYPE_INT64: 541b8e80941Smrg return type->components() * 2; 542b8e80941Smrg case GLSL_TYPE_ARRAY: 543b8e80941Smrg return type_size_scalar(type->fields.array, bindless) * type->length; 544b8e80941Smrg case GLSL_TYPE_STRUCT: 545b8e80941Smrg case GLSL_TYPE_INTERFACE: 546b8e80941Smrg size = 0; 547b8e80941Smrg for (i = 0; i < type->length; i++) { 548b8e80941Smrg size += type_size_scalar(type->fields.structure[i].type, bindless); 549b8e80941Smrg } 550b8e80941Smrg return size; 551b8e80941Smrg case GLSL_TYPE_SAMPLER: 552b8e80941Smrg case GLSL_TYPE_IMAGE: 553b8e80941Smrg if (bindless) 554b8e80941Smrg return type->components() * 2; 555b8e80941Smrg case GLSL_TYPE_ATOMIC_UINT: 556b8e80941Smrg /* Samplers, atomics, and images take up no register space, since 557b8e80941Smrg * they're baked in at link time. 
558b8e80941Smrg */ 559b8e80941Smrg return 0; 560b8e80941Smrg case GLSL_TYPE_SUBROUTINE: 561b8e80941Smrg return 1; 562b8e80941Smrg case GLSL_TYPE_VOID: 563b8e80941Smrg case GLSL_TYPE_ERROR: 564b8e80941Smrg case GLSL_TYPE_FUNCTION: 565b8e80941Smrg unreachable("not reached"); 566b8e80941Smrg } 567b8e80941Smrg 568b8e80941Smrg return 0; 569b8e80941Smrg} 570b8e80941Smrg 571b8e80941Smrg/** 572b8e80941Smrg * Create a MOV to read the timestamp register. 573b8e80941Smrg * 574b8e80941Smrg * The caller is responsible for emitting the MOV. The return value is 575b8e80941Smrg * the destination of the MOV, with extra parameters set. 576b8e80941Smrg */ 577b8e80941Smrgfs_reg 578b8e80941Smrgfs_visitor::get_timestamp(const fs_builder &bld) 579b8e80941Smrg{ 580b8e80941Smrg assert(devinfo->gen >= 7); 581b8e80941Smrg 582b8e80941Smrg fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE, 583b8e80941Smrg BRW_ARF_TIMESTAMP, 584b8e80941Smrg 0), 585b8e80941Smrg BRW_REGISTER_TYPE_UD)); 586b8e80941Smrg 587b8e80941Smrg fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); 588b8e80941Smrg 589b8e80941Smrg /* We want to read the 3 fields we care about even if it's not enabled in 590b8e80941Smrg * the dispatch. 591b8e80941Smrg */ 592b8e80941Smrg bld.group(4, 0).exec_all().MOV(dst, ts); 593b8e80941Smrg 594b8e80941Smrg return dst; 595b8e80941Smrg} 596b8e80941Smrg 597b8e80941Smrgvoid 598b8e80941Smrgfs_visitor::emit_shader_time_begin() 599b8e80941Smrg{ 600b8e80941Smrg /* We want only the low 32 bits of the timestamp. Since it's running 601b8e80941Smrg * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, 602b8e80941Smrg * which is plenty of time for our purposes. It is identical across the 603b8e80941Smrg * EUs, but since it's tracking GPU core speed it will increment at a 604b8e80941Smrg * varying rate as render P-states change. 
605b8e80941Smrg */ 606b8e80941Smrg shader_start_time = component( 607b8e80941Smrg get_timestamp(bld.annotate("shader time start")), 0); 608b8e80941Smrg} 609b8e80941Smrg 610b8e80941Smrgvoid 611b8e80941Smrgfs_visitor::emit_shader_time_end() 612b8e80941Smrg{ 613b8e80941Smrg /* Insert our code just before the final SEND with EOT. */ 614b8e80941Smrg exec_node *end = this->instructions.get_tail(); 615b8e80941Smrg assert(end && ((fs_inst *) end)->eot); 616b8e80941Smrg const fs_builder ibld = bld.annotate("shader time end") 617b8e80941Smrg .exec_all().at(NULL, end); 618b8e80941Smrg const fs_reg timestamp = get_timestamp(ibld); 619b8e80941Smrg 620b8e80941Smrg /* We only use the low 32 bits of the timestamp - see 621b8e80941Smrg * emit_shader_time_begin()). 622b8e80941Smrg * 623b8e80941Smrg * We could also check if render P-states have changed (or anything 624b8e80941Smrg * else that might disrupt timing) by setting smear to 2 and checking if 625b8e80941Smrg * that field is != 0. 626b8e80941Smrg */ 627b8e80941Smrg const fs_reg shader_end_time = component(timestamp, 0); 628b8e80941Smrg 629b8e80941Smrg /* Check that there weren't any timestamp reset events (assuming these 630b8e80941Smrg * were the only two timestamp reads that happened). 631b8e80941Smrg */ 632b8e80941Smrg const fs_reg reset = component(timestamp, 2); 633b8e80941Smrg set_condmod(BRW_CONDITIONAL_Z, 634b8e80941Smrg ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u))); 635b8e80941Smrg ibld.IF(BRW_PREDICATE_NORMAL); 636b8e80941Smrg 637b8e80941Smrg fs_reg start = shader_start_time; 638b8e80941Smrg start.negate = true; 639b8e80941Smrg const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1), 640b8e80941Smrg BRW_REGISTER_TYPE_UD), 641b8e80941Smrg 0); 642b8e80941Smrg const fs_builder cbld = ibld.group(1, 0); 643b8e80941Smrg cbld.group(1, 0).ADD(diff, start, shader_end_time); 644b8e80941Smrg 645b8e80941Smrg /* If there were no instructions between the two timestamp gets, the diff 646b8e80941Smrg * is 2 cycles. 
Remove that overhead, so I can forget about that when 647b8e80941Smrg * trying to determine the time taken for single instructions. 648b8e80941Smrg */ 649b8e80941Smrg cbld.ADD(diff, diff, brw_imm_ud(-2u)); 650b8e80941Smrg SHADER_TIME_ADD(cbld, 0, diff); 651b8e80941Smrg SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u)); 652b8e80941Smrg ibld.emit(BRW_OPCODE_ELSE); 653b8e80941Smrg SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u)); 654b8e80941Smrg ibld.emit(BRW_OPCODE_ENDIF); 655b8e80941Smrg} 656b8e80941Smrg 657b8e80941Smrgvoid 658b8e80941Smrgfs_visitor::SHADER_TIME_ADD(const fs_builder &bld, 659b8e80941Smrg int shader_time_subindex, 660b8e80941Smrg fs_reg value) 661b8e80941Smrg{ 662b8e80941Smrg int index = shader_time_index * 3 + shader_time_subindex; 663b8e80941Smrg struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE); 664b8e80941Smrg 665b8e80941Smrg fs_reg payload; 666b8e80941Smrg if (dispatch_width == 8) 667b8e80941Smrg payload = vgrf(glsl_type::uvec2_type); 668b8e80941Smrg else 669b8e80941Smrg payload = vgrf(glsl_type::uint_type); 670b8e80941Smrg 671b8e80941Smrg bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value); 672b8e80941Smrg} 673b8e80941Smrg 674b8e80941Smrgvoid 675b8e80941Smrgfs_visitor::vfail(const char *format, va_list va) 676b8e80941Smrg{ 677b8e80941Smrg char *msg; 678b8e80941Smrg 679b8e80941Smrg if (failed) 680b8e80941Smrg return; 681b8e80941Smrg 682b8e80941Smrg failed = true; 683b8e80941Smrg 684b8e80941Smrg msg = ralloc_vasprintf(mem_ctx, format, va); 685b8e80941Smrg msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg); 686b8e80941Smrg 687b8e80941Smrg this->fail_msg = msg; 688b8e80941Smrg 689b8e80941Smrg if (debug_enabled) { 690b8e80941Smrg fprintf(stderr, "%s", msg); 691b8e80941Smrg } 692b8e80941Smrg} 693b8e80941Smrg 694b8e80941Smrgvoid 695b8e80941Smrgfs_visitor::fail(const char *format, ...) 
696b8e80941Smrg{ 697b8e80941Smrg va_list va; 698b8e80941Smrg 699b8e80941Smrg va_start(va, format); 700b8e80941Smrg vfail(format, va); 701b8e80941Smrg va_end(va); 702b8e80941Smrg} 703b8e80941Smrg 704b8e80941Smrg/** 705b8e80941Smrg * Mark this program as impossible to compile with dispatch width greater 706b8e80941Smrg * than n. 707b8e80941Smrg * 708b8e80941Smrg * During the SIMD8 compile (which happens first), we can detect and flag 709b8e80941Smrg * things that are unsupported in SIMD16+ mode, so the compiler can skip the 710b8e80941Smrg * SIMD16+ compile altogether. 711b8e80941Smrg * 712b8e80941Smrg * During a compile of dispatch width greater than n (if one happens anyway), 713b8e80941Smrg * this just calls fail(). 714b8e80941Smrg */ 715b8e80941Smrgvoid 716b8e80941Smrgfs_visitor::limit_dispatch_width(unsigned n, const char *msg) 717b8e80941Smrg{ 718b8e80941Smrg if (dispatch_width > n) { 719b8e80941Smrg fail("%s", msg); 720b8e80941Smrg } else { 721b8e80941Smrg max_dispatch_width = n; 722b8e80941Smrg compiler->shader_perf_log(log_data, 723b8e80941Smrg "Shader dispatch width limited to SIMD%d: %s", 724b8e80941Smrg n, msg); 725b8e80941Smrg } 726b8e80941Smrg} 727b8e80941Smrg 728b8e80941Smrg/** 729b8e80941Smrg * Returns true if the instruction has a flag that means it won't 730b8e80941Smrg * update an entire destination register. 731b8e80941Smrg * 732b8e80941Smrg * For example, dead code elimination and live variable analysis want to know 733b8e80941Smrg * when a write to a variable screens off any preceding values that were in 734b8e80941Smrg * it. 
735b8e80941Smrg */ 736b8e80941Smrgbool 737b8e80941Smrgfs_inst::is_partial_write() const 738b8e80941Smrg{ 739b8e80941Smrg return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || 740b8e80941Smrg (this->exec_size * type_sz(this->dst.type)) < 32 || 741b8e80941Smrg !this->dst.is_contiguous() || 742b8e80941Smrg this->dst.offset % REG_SIZE != 0); 743b8e80941Smrg} 744b8e80941Smrg 745b8e80941Smrgunsigned 746b8e80941Smrgfs_inst::components_read(unsigned i) const 747b8e80941Smrg{ 748b8e80941Smrg /* Return zero if the source is not present. */ 749b8e80941Smrg if (src[i].file == BAD_FILE) 750b8e80941Smrg return 0; 751b8e80941Smrg 752b8e80941Smrg switch (opcode) { 753b8e80941Smrg case FS_OPCODE_LINTERP: 754b8e80941Smrg if (i == 0) 755b8e80941Smrg return 2; 756b8e80941Smrg else 757b8e80941Smrg return 1; 758b8e80941Smrg 759b8e80941Smrg case FS_OPCODE_PIXEL_X: 760b8e80941Smrg case FS_OPCODE_PIXEL_Y: 761b8e80941Smrg assert(i == 0); 762b8e80941Smrg return 2; 763b8e80941Smrg 764b8e80941Smrg case FS_OPCODE_FB_WRITE_LOGICAL: 765b8e80941Smrg assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); 766b8e80941Smrg /* First/second FB write color. 
*/ 767b8e80941Smrg if (i < 2) 768b8e80941Smrg return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; 769b8e80941Smrg else 770b8e80941Smrg return 1; 771b8e80941Smrg 772b8e80941Smrg case SHADER_OPCODE_TEX_LOGICAL: 773b8e80941Smrg case SHADER_OPCODE_TXD_LOGICAL: 774b8e80941Smrg case SHADER_OPCODE_TXF_LOGICAL: 775b8e80941Smrg case SHADER_OPCODE_TXL_LOGICAL: 776b8e80941Smrg case SHADER_OPCODE_TXS_LOGICAL: 777b8e80941Smrg case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: 778b8e80941Smrg case FS_OPCODE_TXB_LOGICAL: 779b8e80941Smrg case SHADER_OPCODE_TXF_CMS_LOGICAL: 780b8e80941Smrg case SHADER_OPCODE_TXF_CMS_W_LOGICAL: 781b8e80941Smrg case SHADER_OPCODE_TXF_UMS_LOGICAL: 782b8e80941Smrg case SHADER_OPCODE_TXF_MCS_LOGICAL: 783b8e80941Smrg case SHADER_OPCODE_LOD_LOGICAL: 784b8e80941Smrg case SHADER_OPCODE_TG4_LOGICAL: 785b8e80941Smrg case SHADER_OPCODE_TG4_OFFSET_LOGICAL: 786b8e80941Smrg case SHADER_OPCODE_SAMPLEINFO_LOGICAL: 787b8e80941Smrg assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM && 788b8e80941Smrg src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM); 789b8e80941Smrg /* Texture coordinates. */ 790b8e80941Smrg if (i == TEX_LOGICAL_SRC_COORDINATE) 791b8e80941Smrg return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud; 792b8e80941Smrg /* Texture derivatives. */ 793b8e80941Smrg else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) && 794b8e80941Smrg opcode == SHADER_OPCODE_TXD_LOGICAL) 795b8e80941Smrg return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud; 796b8e80941Smrg /* Texture offset. 
*/ 797b8e80941Smrg else if (i == TEX_LOGICAL_SRC_TG4_OFFSET) 798b8e80941Smrg return 2; 799b8e80941Smrg /* MCS */ 800b8e80941Smrg else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL) 801b8e80941Smrg return 2; 802b8e80941Smrg else 803b8e80941Smrg return 1; 804b8e80941Smrg 805b8e80941Smrg case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: 806b8e80941Smrg case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: 807b8e80941Smrg assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM); 808b8e80941Smrg /* Surface coordinates. */ 809b8e80941Smrg if (i == SURFACE_LOGICAL_SRC_ADDRESS) 810b8e80941Smrg return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; 811b8e80941Smrg /* Surface operation source (ignored for reads). */ 812b8e80941Smrg else if (i == SURFACE_LOGICAL_SRC_DATA) 813b8e80941Smrg return 0; 814b8e80941Smrg else 815b8e80941Smrg return 1; 816b8e80941Smrg 817b8e80941Smrg case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: 818b8e80941Smrg case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: 819b8e80941Smrg assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && 820b8e80941Smrg src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); 821b8e80941Smrg /* Surface coordinates. */ 822b8e80941Smrg if (i == SURFACE_LOGICAL_SRC_ADDRESS) 823b8e80941Smrg return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; 824b8e80941Smrg /* Surface operation source. */ 825b8e80941Smrg else if (i == SURFACE_LOGICAL_SRC_DATA) 826b8e80941Smrg return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud; 827b8e80941Smrg else 828b8e80941Smrg return 1; 829b8e80941Smrg 830b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: 831b8e80941Smrg assert(src[2].file == IMM); 832b8e80941Smrg return 1; 833b8e80941Smrg 834b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: 835b8e80941Smrg assert(src[2].file == IMM); 836b8e80941Smrg return i == 1 ? 
src[2].ud : 1; 837b8e80941Smrg 838b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: 839b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: 840b8e80941Smrg assert(src[2].file == IMM); 841b8e80941Smrg if (i == 1) { 842b8e80941Smrg /* Data source */ 843b8e80941Smrg const unsigned op = src[2].ud; 844b8e80941Smrg switch (op) { 845b8e80941Smrg case BRW_AOP_INC: 846b8e80941Smrg case BRW_AOP_DEC: 847b8e80941Smrg case BRW_AOP_PREDEC: 848b8e80941Smrg return 0; 849b8e80941Smrg case BRW_AOP_CMPWR: 850b8e80941Smrg return 2; 851b8e80941Smrg default: 852b8e80941Smrg return 1; 853b8e80941Smrg } 854b8e80941Smrg } else { 855b8e80941Smrg return 1; 856b8e80941Smrg } 857b8e80941Smrg 858b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL: 859b8e80941Smrg assert(src[2].file == IMM); 860b8e80941Smrg if (i == 1) { 861b8e80941Smrg /* Data source */ 862b8e80941Smrg const unsigned op = src[2].ud; 863b8e80941Smrg return op == BRW_AOP_FCMPWR ? 2 : 1; 864b8e80941Smrg } else { 865b8e80941Smrg return 1; 866b8e80941Smrg } 867b8e80941Smrg 868b8e80941Smrg case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: 869b8e80941Smrg /* Scattered logical opcodes use the following params: 870b8e80941Smrg * src[0] Surface coordinates 871b8e80941Smrg * src[1] Surface operation source (ignored for reads) 872b8e80941Smrg * src[2] Surface 873b8e80941Smrg * src[3] IMM with always 1 dimension. 874b8e80941Smrg * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32 875b8e80941Smrg */ 876b8e80941Smrg assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && 877b8e80941Smrg src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); 878b8e80941Smrg return i == SURFACE_LOGICAL_SRC_DATA ? 
0 : 1; 879b8e80941Smrg 880b8e80941Smrg case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: 881b8e80941Smrg assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && 882b8e80941Smrg src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); 883b8e80941Smrg return 1; 884b8e80941Smrg 885b8e80941Smrg case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: 886b8e80941Smrg case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: { 887b8e80941Smrg assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && 888b8e80941Smrg src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); 889b8e80941Smrg const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud; 890b8e80941Smrg /* Surface coordinates. */ 891b8e80941Smrg if (i == SURFACE_LOGICAL_SRC_ADDRESS) 892b8e80941Smrg return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; 893b8e80941Smrg /* Surface operation source. */ 894b8e80941Smrg else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_CMPWR) 895b8e80941Smrg return 2; 896b8e80941Smrg else if (i == SURFACE_LOGICAL_SRC_DATA && 897b8e80941Smrg (op == BRW_AOP_INC || op == BRW_AOP_DEC || op == BRW_AOP_PREDEC)) 898b8e80941Smrg return 0; 899b8e80941Smrg else 900b8e80941Smrg return 1; 901b8e80941Smrg } 902b8e80941Smrg case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: 903b8e80941Smrg return (i == 0 ? 2 : 1); 904b8e80941Smrg 905b8e80941Smrg case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: { 906b8e80941Smrg assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM && 907b8e80941Smrg src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM); 908b8e80941Smrg const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud; 909b8e80941Smrg /* Surface coordinates. */ 910b8e80941Smrg if (i == SURFACE_LOGICAL_SRC_ADDRESS) 911b8e80941Smrg return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud; 912b8e80941Smrg /* Surface operation source. 
*/ 913b8e80941Smrg else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_FCMPWR) 914b8e80941Smrg return 2; 915b8e80941Smrg else 916b8e80941Smrg return 1; 917b8e80941Smrg } 918b8e80941Smrg 919b8e80941Smrg default: 920b8e80941Smrg return 1; 921b8e80941Smrg } 922b8e80941Smrg} 923b8e80941Smrg 924b8e80941Smrgunsigned 925b8e80941Smrgfs_inst::size_read(int arg) const 926b8e80941Smrg{ 927b8e80941Smrg switch (opcode) { 928b8e80941Smrg case SHADER_OPCODE_SEND: 929b8e80941Smrg if (arg == 2) { 930b8e80941Smrg return mlen * REG_SIZE; 931b8e80941Smrg } else if (arg == 3) { 932b8e80941Smrg return ex_mlen * REG_SIZE; 933b8e80941Smrg } 934b8e80941Smrg break; 935b8e80941Smrg 936b8e80941Smrg case FS_OPCODE_FB_WRITE: 937b8e80941Smrg case FS_OPCODE_REP_FB_WRITE: 938b8e80941Smrg if (arg == 0) { 939b8e80941Smrg if (base_mrf >= 0) 940b8e80941Smrg return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE; 941b8e80941Smrg else 942b8e80941Smrg return mlen * REG_SIZE; 943b8e80941Smrg } 944b8e80941Smrg break; 945b8e80941Smrg 946b8e80941Smrg case FS_OPCODE_FB_READ: 947b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8: 948b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: 949b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: 950b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: 951b8e80941Smrg case SHADER_OPCODE_URB_READ_SIMD8: 952b8e80941Smrg case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: 953b8e80941Smrg case FS_OPCODE_INTERPOLATE_AT_SAMPLE: 954b8e80941Smrg case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: 955b8e80941Smrg if (arg == 0) 956b8e80941Smrg return mlen * REG_SIZE; 957b8e80941Smrg break; 958b8e80941Smrg 959b8e80941Smrg case FS_OPCODE_SET_SAMPLE_ID: 960b8e80941Smrg if (arg == 1) 961b8e80941Smrg return 1; 962b8e80941Smrg break; 963b8e80941Smrg 964b8e80941Smrg case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7: 965b8e80941Smrg /* The payload is actually stored in src1 */ 966b8e80941Smrg if (arg == 1) 967b8e80941Smrg return mlen * REG_SIZE; 968b8e80941Smrg break; 969b8e80941Smrg 
970b8e80941Smrg case FS_OPCODE_LINTERP: 971b8e80941Smrg if (arg == 1) 972b8e80941Smrg return 16; 973b8e80941Smrg break; 974b8e80941Smrg 975b8e80941Smrg case SHADER_OPCODE_LOAD_PAYLOAD: 976b8e80941Smrg if (arg < this->header_size) 977b8e80941Smrg return REG_SIZE; 978b8e80941Smrg break; 979b8e80941Smrg 980b8e80941Smrg case CS_OPCODE_CS_TERMINATE: 981b8e80941Smrg case SHADER_OPCODE_BARRIER: 982b8e80941Smrg return REG_SIZE; 983b8e80941Smrg 984b8e80941Smrg case SHADER_OPCODE_MOV_INDIRECT: 985b8e80941Smrg if (arg == 0) { 986b8e80941Smrg assert(src[2].file == IMM); 987b8e80941Smrg return src[2].ud; 988b8e80941Smrg } 989b8e80941Smrg break; 990b8e80941Smrg 991b8e80941Smrg default: 992b8e80941Smrg if (is_tex() && arg == 0 && src[0].file == VGRF) 993b8e80941Smrg return mlen * REG_SIZE; 994b8e80941Smrg break; 995b8e80941Smrg } 996b8e80941Smrg 997b8e80941Smrg switch (src[arg].file) { 998b8e80941Smrg case UNIFORM: 999b8e80941Smrg case IMM: 1000b8e80941Smrg return components_read(arg) * type_sz(src[arg].type); 1001b8e80941Smrg case BAD_FILE: 1002b8e80941Smrg case ARF: 1003b8e80941Smrg case FIXED_GRF: 1004b8e80941Smrg case VGRF: 1005b8e80941Smrg case ATTR: 1006b8e80941Smrg return components_read(arg) * src[arg].component_size(exec_size); 1007b8e80941Smrg case MRF: 1008b8e80941Smrg unreachable("MRF registers are not allowed as sources"); 1009b8e80941Smrg } 1010b8e80941Smrg return 0; 1011b8e80941Smrg} 1012b8e80941Smrg 1013b8e80941Smrgnamespace { 1014b8e80941Smrg /* Return the subset of flag registers that an instruction could 1015b8e80941Smrg * potentially read or write based on the execution controls and flag 1016b8e80941Smrg * subregister number of the instruction. 
1017b8e80941Smrg */ 1018b8e80941Smrg unsigned 1019b8e80941Smrg flag_mask(const fs_inst *inst) 1020b8e80941Smrg { 1021b8e80941Smrg const unsigned start = inst->flag_subreg * 16 + inst->group; 1022b8e80941Smrg const unsigned end = start + inst->exec_size; 1023b8e80941Smrg return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1); 1024b8e80941Smrg } 1025b8e80941Smrg 1026b8e80941Smrg unsigned 1027b8e80941Smrg bit_mask(unsigned n) 1028b8e80941Smrg { 1029b8e80941Smrg return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1); 1030b8e80941Smrg } 1031b8e80941Smrg 1032b8e80941Smrg unsigned 1033b8e80941Smrg flag_mask(const fs_reg &r, unsigned sz) 1034b8e80941Smrg { 1035b8e80941Smrg if (r.file == ARF) { 1036b8e80941Smrg const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr; 1037b8e80941Smrg const unsigned end = start + sz; 1038b8e80941Smrg return bit_mask(end) & ~bit_mask(start); 1039b8e80941Smrg } else { 1040b8e80941Smrg return 0; 1041b8e80941Smrg } 1042b8e80941Smrg } 1043b8e80941Smrg} 1044b8e80941Smrg 1045b8e80941Smrgunsigned 1046b8e80941Smrgfs_inst::flags_read(const gen_device_info *devinfo) const 1047b8e80941Smrg{ 1048b8e80941Smrg if (predicate == BRW_PREDICATE_ALIGN1_ANYV || 1049b8e80941Smrg predicate == BRW_PREDICATE_ALIGN1_ALLV) { 1050b8e80941Smrg /* The vertical predication modes combine corresponding bits from 1051b8e80941Smrg * f0.0 and f1.0 on Gen7+, and f0.0 and f0.1 on older hardware. 1052b8e80941Smrg */ 1053b8e80941Smrg const unsigned shift = devinfo->gen >= 7 ? 
4 : 2; 1054b8e80941Smrg return flag_mask(this) << shift | flag_mask(this); 1055b8e80941Smrg } else if (predicate) { 1056b8e80941Smrg return flag_mask(this); 1057b8e80941Smrg } else { 1058b8e80941Smrg unsigned mask = 0; 1059b8e80941Smrg for (int i = 0; i < sources; i++) { 1060b8e80941Smrg mask |= flag_mask(src[i], size_read(i)); 1061b8e80941Smrg } 1062b8e80941Smrg return mask; 1063b8e80941Smrg } 1064b8e80941Smrg} 1065b8e80941Smrg 1066b8e80941Smrgunsigned 1067b8e80941Smrgfs_inst::flags_written() const 1068b8e80941Smrg{ 1069b8e80941Smrg if ((conditional_mod && (opcode != BRW_OPCODE_SEL && 1070b8e80941Smrg opcode != BRW_OPCODE_CSEL && 1071b8e80941Smrg opcode != BRW_OPCODE_IF && 1072b8e80941Smrg opcode != BRW_OPCODE_WHILE)) || 1073b8e80941Smrg opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL || 1074b8e80941Smrg opcode == FS_OPCODE_FB_WRITE) { 1075b8e80941Smrg return flag_mask(this); 1076b8e80941Smrg } else { 1077b8e80941Smrg return flag_mask(dst, size_written); 1078b8e80941Smrg } 1079b8e80941Smrg} 1080b8e80941Smrg 1081b8e80941Smrg/** 1082b8e80941Smrg * Returns how many MRFs an FS opcode will write over. 1083b8e80941Smrg * 1084b8e80941Smrg * Note that this is not the 0 or 1 implied writes in an actual gen 1085b8e80941Smrg * instruction -- the FS opcodes often generate MOVs in addition. 
1086b8e80941Smrg */ 1087b8e80941Smrgint 1088b8e80941Smrgfs_visitor::implied_mrf_writes(fs_inst *inst) const 1089b8e80941Smrg{ 1090b8e80941Smrg if (inst->mlen == 0) 1091b8e80941Smrg return 0; 1092b8e80941Smrg 1093b8e80941Smrg if (inst->base_mrf == -1) 1094b8e80941Smrg return 0; 1095b8e80941Smrg 1096b8e80941Smrg switch (inst->opcode) { 1097b8e80941Smrg case SHADER_OPCODE_RCP: 1098b8e80941Smrg case SHADER_OPCODE_RSQ: 1099b8e80941Smrg case SHADER_OPCODE_SQRT: 1100b8e80941Smrg case SHADER_OPCODE_EXP2: 1101b8e80941Smrg case SHADER_OPCODE_LOG2: 1102b8e80941Smrg case SHADER_OPCODE_SIN: 1103b8e80941Smrg case SHADER_OPCODE_COS: 1104b8e80941Smrg return 1 * dispatch_width / 8; 1105b8e80941Smrg case SHADER_OPCODE_POW: 1106b8e80941Smrg case SHADER_OPCODE_INT_QUOTIENT: 1107b8e80941Smrg case SHADER_OPCODE_INT_REMAINDER: 1108b8e80941Smrg return 2 * dispatch_width / 8; 1109b8e80941Smrg case SHADER_OPCODE_TEX: 1110b8e80941Smrg case FS_OPCODE_TXB: 1111b8e80941Smrg case SHADER_OPCODE_TXD: 1112b8e80941Smrg case SHADER_OPCODE_TXF: 1113b8e80941Smrg case SHADER_OPCODE_TXF_CMS: 1114b8e80941Smrg case SHADER_OPCODE_TXF_MCS: 1115b8e80941Smrg case SHADER_OPCODE_TG4: 1116b8e80941Smrg case SHADER_OPCODE_TG4_OFFSET: 1117b8e80941Smrg case SHADER_OPCODE_TXL: 1118b8e80941Smrg case SHADER_OPCODE_TXS: 1119b8e80941Smrg case SHADER_OPCODE_LOD: 1120b8e80941Smrg case SHADER_OPCODE_SAMPLEINFO: 1121b8e80941Smrg return 1; 1122b8e80941Smrg case FS_OPCODE_FB_WRITE: 1123b8e80941Smrg case FS_OPCODE_REP_FB_WRITE: 1124b8e80941Smrg return inst->src[0].file == BAD_FILE ? 
0 : 2; 1125b8e80941Smrg case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 1126b8e80941Smrg case SHADER_OPCODE_GEN4_SCRATCH_READ: 1127b8e80941Smrg return 1; 1128b8e80941Smrg case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4: 1129b8e80941Smrg return inst->mlen; 1130b8e80941Smrg case SHADER_OPCODE_GEN4_SCRATCH_WRITE: 1131b8e80941Smrg return inst->mlen; 1132b8e80941Smrg default: 1133b8e80941Smrg unreachable("not reached"); 1134b8e80941Smrg } 1135b8e80941Smrg} 1136b8e80941Smrg 1137b8e80941Smrgfs_reg 1138b8e80941Smrgfs_visitor::vgrf(const glsl_type *const type) 1139b8e80941Smrg{ 1140b8e80941Smrg int reg_width = dispatch_width / 8; 1141b8e80941Smrg return fs_reg(VGRF, 1142b8e80941Smrg alloc.allocate(type_size_scalar(type, false) * reg_width), 1143b8e80941Smrg brw_type_for_base_type(type)); 1144b8e80941Smrg} 1145b8e80941Smrg 1146b8e80941Smrgfs_reg::fs_reg(enum brw_reg_file file, int nr) 1147b8e80941Smrg{ 1148b8e80941Smrg init(); 1149b8e80941Smrg this->file = file; 1150b8e80941Smrg this->nr = nr; 1151b8e80941Smrg this->type = BRW_REGISTER_TYPE_F; 1152b8e80941Smrg this->stride = (file == UNIFORM ? 0 : 1); 1153b8e80941Smrg} 1154b8e80941Smrg 1155b8e80941Smrgfs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type) 1156b8e80941Smrg{ 1157b8e80941Smrg init(); 1158b8e80941Smrg this->file = file; 1159b8e80941Smrg this->nr = nr; 1160b8e80941Smrg this->type = type; 1161b8e80941Smrg this->stride = (file == UNIFORM ? 0 : 1); 1162b8e80941Smrg} 1163b8e80941Smrg 1164b8e80941Smrg/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. 
1165b8e80941Smrg * This brings in those uniform definitions 1166b8e80941Smrg */ 1167b8e80941Smrgvoid 1168b8e80941Smrgfs_visitor::import_uniforms(fs_visitor *v) 1169b8e80941Smrg{ 1170b8e80941Smrg this->push_constant_loc = v->push_constant_loc; 1171b8e80941Smrg this->pull_constant_loc = v->pull_constant_loc; 1172b8e80941Smrg this->uniforms = v->uniforms; 1173b8e80941Smrg this->subgroup_id = v->subgroup_id; 1174b8e80941Smrg} 1175b8e80941Smrg 1176b8e80941Smrgvoid 1177b8e80941Smrgfs_visitor::emit_fragcoord_interpolation(fs_reg wpos) 1178b8e80941Smrg{ 1179b8e80941Smrg assert(stage == MESA_SHADER_FRAGMENT); 1180b8e80941Smrg 1181b8e80941Smrg /* gl_FragCoord.x */ 1182b8e80941Smrg bld.MOV(wpos, this->pixel_x); 1183b8e80941Smrg wpos = offset(wpos, bld, 1); 1184b8e80941Smrg 1185b8e80941Smrg /* gl_FragCoord.y */ 1186b8e80941Smrg bld.MOV(wpos, this->pixel_y); 1187b8e80941Smrg wpos = offset(wpos, bld, 1); 1188b8e80941Smrg 1189b8e80941Smrg /* gl_FragCoord.z */ 1190b8e80941Smrg if (devinfo->gen >= 6) { 1191b8e80941Smrg bld.MOV(wpos, fetch_payload_reg(bld, payload.source_depth_reg)); 1192b8e80941Smrg } else { 1193b8e80941Smrg bld.emit(FS_OPCODE_LINTERP, wpos, 1194b8e80941Smrg this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL], 1195b8e80941Smrg component(interp_reg(VARYING_SLOT_POS, 2), 0)); 1196b8e80941Smrg } 1197b8e80941Smrg wpos = offset(wpos, bld, 1); 1198b8e80941Smrg 1199b8e80941Smrg /* gl_FragCoord.w: Already set up in emit_interpolation */ 1200b8e80941Smrg bld.MOV(wpos, this->wpos_w); 1201b8e80941Smrg} 1202b8e80941Smrg 1203b8e80941Smrgenum brw_barycentric_mode 1204b8e80941Smrgbrw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op) 1205b8e80941Smrg{ 1206b8e80941Smrg /* Barycentric modes don't make sense for flat inputs. 
*/ 1207b8e80941Smrg assert(mode != INTERP_MODE_FLAT); 1208b8e80941Smrg 1209b8e80941Smrg unsigned bary; 1210b8e80941Smrg switch (op) { 1211b8e80941Smrg case nir_intrinsic_load_barycentric_pixel: 1212b8e80941Smrg case nir_intrinsic_load_barycentric_at_offset: 1213b8e80941Smrg bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL; 1214b8e80941Smrg break; 1215b8e80941Smrg case nir_intrinsic_load_barycentric_centroid: 1216b8e80941Smrg bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID; 1217b8e80941Smrg break; 1218b8e80941Smrg case nir_intrinsic_load_barycentric_sample: 1219b8e80941Smrg case nir_intrinsic_load_barycentric_at_sample: 1220b8e80941Smrg bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE; 1221b8e80941Smrg break; 1222b8e80941Smrg default: 1223b8e80941Smrg unreachable("invalid intrinsic"); 1224b8e80941Smrg } 1225b8e80941Smrg 1226b8e80941Smrg if (mode == INTERP_MODE_NOPERSPECTIVE) 1227b8e80941Smrg bary += 3; 1228b8e80941Smrg 1229b8e80941Smrg return (enum brw_barycentric_mode) bary; 1230b8e80941Smrg} 1231b8e80941Smrg 1232b8e80941Smrg/** 1233b8e80941Smrg * Turn one of the two CENTROID barycentric modes into PIXEL mode. 1234b8e80941Smrg */ 1235b8e80941Smrgstatic enum brw_barycentric_mode 1236b8e80941Smrgcentroid_to_pixel(enum brw_barycentric_mode bary) 1237b8e80941Smrg{ 1238b8e80941Smrg assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID || 1239b8e80941Smrg bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID); 1240b8e80941Smrg return (enum brw_barycentric_mode) ((unsigned) bary - 1); 1241b8e80941Smrg} 1242b8e80941Smrg 1243b8e80941Smrgfs_reg * 1244b8e80941Smrgfs_visitor::emit_frontfacing_interpolation() 1245b8e80941Smrg{ 1246b8e80941Smrg fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type)); 1247b8e80941Smrg 1248b8e80941Smrg if (devinfo->gen >= 6) { 1249b8e80941Smrg /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create 1250b8e80941Smrg * a boolean result from this (~0/true or 0/false). 
1251b8e80941Smrg * 1252b8e80941Smrg * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish 1253b8e80941Smrg * this task in only one instruction: 1254b8e80941Smrg * - a negation source modifier will flip the bit; and 1255b8e80941Smrg * - a W -> D type conversion will sign extend the bit into the high 1256b8e80941Smrg * word of the destination. 1257b8e80941Smrg * 1258b8e80941Smrg * An ASR 15 fills the low word of the destination. 1259b8e80941Smrg */ 1260b8e80941Smrg fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); 1261b8e80941Smrg g0.negate = true; 1262b8e80941Smrg 1263b8e80941Smrg bld.ASR(*reg, g0, brw_imm_d(15)); 1264b8e80941Smrg } else { 1265b8e80941Smrg /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create 1266b8e80941Smrg * a boolean result from this (1/true or 0/false). 1267b8e80941Smrg * 1268b8e80941Smrg * Like in the above case, since the bit is the MSB of g1.6:UD we can use 1269b8e80941Smrg * the negation source modifier to flip it. Unfortunately the SHR 1270b8e80941Smrg * instruction only operates on UD (or D with an abs source modifier) 1271b8e80941Smrg * sources without negation. 1272b8e80941Smrg * 1273b8e80941Smrg * Instead, use ASR (which will give ~0/true or 0/false). 
1274b8e80941Smrg */ 1275b8e80941Smrg fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D)); 1276b8e80941Smrg g1_6.negate = true; 1277b8e80941Smrg 1278b8e80941Smrg bld.ASR(*reg, g1_6, brw_imm_d(31)); 1279b8e80941Smrg } 1280b8e80941Smrg 1281b8e80941Smrg return reg; 1282b8e80941Smrg} 1283b8e80941Smrg 1284b8e80941Smrgvoid 1285b8e80941Smrgfs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos) 1286b8e80941Smrg{ 1287b8e80941Smrg assert(stage == MESA_SHADER_FRAGMENT); 1288b8e80941Smrg struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); 1289b8e80941Smrg assert(dst.type == BRW_REGISTER_TYPE_F); 1290b8e80941Smrg 1291b8e80941Smrg if (wm_prog_data->persample_dispatch) { 1292b8e80941Smrg /* Convert int_sample_pos to floating point */ 1293b8e80941Smrg bld.MOV(dst, int_sample_pos); 1294b8e80941Smrg /* Scale to the range [0, 1] */ 1295b8e80941Smrg bld.MUL(dst, dst, brw_imm_f(1 / 16.0f)); 1296b8e80941Smrg } 1297b8e80941Smrg else { 1298b8e80941Smrg /* From ARB_sample_shading specification: 1299b8e80941Smrg * "When rendering to a non-multisample buffer, or if multisample 1300b8e80941Smrg * rasterization is disabled, gl_SamplePosition will always be 1301b8e80941Smrg * (0.5, 0.5). 1302b8e80941Smrg */ 1303b8e80941Smrg bld.MOV(dst, brw_imm_f(0.5f)); 1304b8e80941Smrg } 1305b8e80941Smrg} 1306b8e80941Smrg 1307b8e80941Smrgfs_reg * 1308b8e80941Smrgfs_visitor::emit_samplepos_setup() 1309b8e80941Smrg{ 1310b8e80941Smrg assert(devinfo->gen >= 6); 1311b8e80941Smrg 1312b8e80941Smrg const fs_builder abld = bld.annotate("compute sample position"); 1313b8e80941Smrg fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type)); 1314b8e80941Smrg fs_reg pos = *reg; 1315b8e80941Smrg fs_reg int_sample_x = vgrf(glsl_type::int_type); 1316b8e80941Smrg fs_reg int_sample_y = vgrf(glsl_type::int_type); 1317b8e80941Smrg 1318b8e80941Smrg /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16 1319b8e80941Smrg * mode will be enabled. 
1320b8e80941Smrg * 1321b8e80941Smrg * From the Ivy Bridge PRM, volume 2 part 1, page 344: 1322b8e80941Smrg * R31.1:0 Position Offset X/Y for Slot[3:0] 1323b8e80941Smrg * R31.3:2 Position Offset X/Y for Slot[7:4] 1324b8e80941Smrg * ..... 1325b8e80941Smrg * 1326b8e80941Smrg * The X, Y sample positions come in as bytes in thread payload. So, read 1327b8e80941Smrg * the positions using vstride=16, width=8, hstride=2. 1328b8e80941Smrg */ 1329b8e80941Smrg const fs_reg sample_pos_reg = 1330b8e80941Smrg fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W); 1331b8e80941Smrg 1332b8e80941Smrg /* Compute gl_SamplePosition.x */ 1333b8e80941Smrg abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0)); 1334b8e80941Smrg compute_sample_position(offset(pos, abld, 0), int_sample_x); 1335b8e80941Smrg 1336b8e80941Smrg /* Compute gl_SamplePosition.y */ 1337b8e80941Smrg abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1)); 1338b8e80941Smrg compute_sample_position(offset(pos, abld, 1), int_sample_y); 1339b8e80941Smrg return reg; 1340b8e80941Smrg} 1341b8e80941Smrg 1342b8e80941Smrgfs_reg * 1343b8e80941Smrgfs_visitor::emit_sampleid_setup() 1344b8e80941Smrg{ 1345b8e80941Smrg assert(stage == MESA_SHADER_FRAGMENT); 1346b8e80941Smrg brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; 1347b8e80941Smrg assert(devinfo->gen >= 6); 1348b8e80941Smrg 1349b8e80941Smrg const fs_builder abld = bld.annotate("compute sample id"); 1350b8e80941Smrg fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type)); 1351b8e80941Smrg 1352b8e80941Smrg if (!key->multisample_fbo) { 1353b8e80941Smrg /* As per GL_ARB_sample_shading specification: 1354b8e80941Smrg * "When rendering to a non-multisample buffer, or if multisample 1355b8e80941Smrg * rasterization is disabled, gl_SampleID will always be zero." 
1356b8e80941Smrg */ 1357b8e80941Smrg abld.MOV(*reg, brw_imm_d(0)); 1358b8e80941Smrg } else if (devinfo->gen >= 8) { 1359b8e80941Smrg /* Sample ID comes in as 4-bit numbers in g1.0: 1360b8e80941Smrg * 1361b8e80941Smrg * 15:12 Slot 3 SampleID (only used in SIMD16) 1362b8e80941Smrg * 11:8 Slot 2 SampleID (only used in SIMD16) 1363b8e80941Smrg * 7:4 Slot 1 SampleID 1364b8e80941Smrg * 3:0 Slot 0 SampleID 1365b8e80941Smrg * 1366b8e80941Smrg * Each slot corresponds to four channels, so we want to replicate each 1367b8e80941Smrg * half-byte value to 4 channels in a row: 1368b8e80941Smrg * 1369b8e80941Smrg * dst+0: .7 .6 .5 .4 .3 .2 .1 .0 1370b8e80941Smrg * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0 1371b8e80941Smrg * 1372b8e80941Smrg * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 (if SIMD16) 1373b8e80941Smrg * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8 1374b8e80941Smrg * 1375b8e80941Smrg * First, we read g1.0 with a <1,8,0>UB region, causing the first 8 1376b8e80941Smrg * channels to read the first byte (7:0), and the second group of 8 1377b8e80941Smrg * channels to read the second byte (15:8). Then, we shift right by 1378b8e80941Smrg * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3 1379b8e80941Smrg * values into place. Finally, we AND with 0xf to keep the low nibble. 1380b8e80941Smrg * 1381b8e80941Smrg * shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V 1382b8e80941Smrg * and(16) dst<1>D tmp<8,8,1>W 0xf:W 1383b8e80941Smrg * 1384b8e80941Smrg * TODO: These payload bits exist on Gen7 too, but they appear to always 1385b8e80941Smrg * be zero, so this code fails to work. We should find out why. 
1386b8e80941Smrg */ 1387b8e80941Smrg const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW); 1388b8e80941Smrg 1389b8e80941Smrg for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) { 1390b8e80941Smrg const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i); 1391b8e80941Smrg hbld.SHR(offset(tmp, hbld, i), 1392b8e80941Smrg stride(retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UB), 1393b8e80941Smrg 1, 8, 0), 1394b8e80941Smrg brw_imm_v(0x44440000)); 1395b8e80941Smrg } 1396b8e80941Smrg 1397b8e80941Smrg abld.AND(*reg, tmp, brw_imm_w(0xf)); 1398b8e80941Smrg } else { 1399b8e80941Smrg const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0); 1400b8e80941Smrg const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW); 1401b8e80941Smrg 1402b8e80941Smrg /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with 1403b8e80941Smrg * 8x multisampling, subspan 0 will represent sample N (where N 1404b8e80941Smrg * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or 1405b8e80941Smrg * 7. We can find the value of N by looking at R0.0 bits 7:6 1406b8e80941Smrg * ("Starting Sample Pair Index (SSPI)") and multiplying by two 1407b8e80941Smrg * (since samples are always delivered in pairs). That is, we 1408b8e80941Smrg * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then 1409b8e80941Smrg * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in 1410b8e80941Smrg * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 1411b8e80941Smrg * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by 1412b8e80941Smrg * populating a temporary variable with the sequence (0, 1, 2, 3), 1413b8e80941Smrg * and then reading from it using vstride=1, width=4, hstride=0. 1414b8e80941Smrg * These computations hold good for 4x multisampling as well. 
1415b8e80941Smrg * 1416b8e80941Smrg * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1): 1417b8e80941Smrg * the first four slots are sample 0 of subspan 0; the next four 1418b8e80941Smrg * are sample 1 of subspan 0; the third group is sample 0 of 1419b8e80941Smrg * subspan 1, and finally sample 1 of subspan 1. 1420b8e80941Smrg */ 1421b8e80941Smrg 1422b8e80941Smrg /* SKL+ has an extra bit for the Starting Sample Pair Index to 1423b8e80941Smrg * accomodate 16x MSAA. 1424b8e80941Smrg */ 1425b8e80941Smrg abld.exec_all().group(1, 0) 1426b8e80941Smrg .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)), 1427b8e80941Smrg brw_imm_ud(0xc0)); 1428b8e80941Smrg abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5)); 1429b8e80941Smrg 1430b8e80941Smrg /* This works for SIMD8-SIMD16. It also works for SIMD32 but only if we 1431b8e80941Smrg * can assume 4x MSAA. Disallow it on IVB+ 1432b8e80941Smrg * 1433b8e80941Smrg * FINISHME: One day, we could come up with a way to do this that 1434b8e80941Smrg * actually works on gen7. 1435b8e80941Smrg */ 1436b8e80941Smrg if (devinfo->gen >= 7) 1437b8e80941Smrg limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gen7"); 1438b8e80941Smrg abld.exec_all().group(8, 0).MOV(t2, brw_imm_v(0x32103210)); 1439b8e80941Smrg 1440b8e80941Smrg /* This special instruction takes care of setting vstride=1, 1441b8e80941Smrg * width=4, hstride=0 of t2 during an ADD instruction. 
1442b8e80941Smrg */ 1443b8e80941Smrg abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2); 1444b8e80941Smrg } 1445b8e80941Smrg 1446b8e80941Smrg return reg; 1447b8e80941Smrg} 1448b8e80941Smrg 1449b8e80941Smrgfs_reg * 1450b8e80941Smrgfs_visitor::emit_samplemaskin_setup() 1451b8e80941Smrg{ 1452b8e80941Smrg assert(stage == MESA_SHADER_FRAGMENT); 1453b8e80941Smrg struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); 1454b8e80941Smrg assert(devinfo->gen >= 6); 1455b8e80941Smrg 1456b8e80941Smrg fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type)); 1457b8e80941Smrg 1458b8e80941Smrg fs_reg coverage_mask = 1459b8e80941Smrg fetch_payload_reg(bld, payload.sample_mask_in_reg, BRW_REGISTER_TYPE_D); 1460b8e80941Smrg 1461b8e80941Smrg if (wm_prog_data->persample_dispatch) { 1462b8e80941Smrg /* gl_SampleMaskIn[] comes from two sources: the input coverage mask, 1463b8e80941Smrg * and a mask representing which sample is being processed by the 1464b8e80941Smrg * current shader invocation. 1465b8e80941Smrg * 1466b8e80941Smrg * From the OES_sample_variables specification: 1467b8e80941Smrg * "When per-sample shading is active due to the use of a fragment input 1468b8e80941Smrg * qualified by "sample" or due to the use of the gl_SampleID or 1469b8e80941Smrg * gl_SamplePosition variables, only the bit for the current sample is 1470b8e80941Smrg * set in gl_SampleMaskIn." 
1471b8e80941Smrg */ 1472b8e80941Smrg const fs_builder abld = bld.annotate("compute gl_SampleMaskIn"); 1473b8e80941Smrg 1474b8e80941Smrg if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) 1475b8e80941Smrg nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup(); 1476b8e80941Smrg 1477b8e80941Smrg fs_reg one = vgrf(glsl_type::int_type); 1478b8e80941Smrg fs_reg enabled_mask = vgrf(glsl_type::int_type); 1479b8e80941Smrg abld.MOV(one, brw_imm_d(1)); 1480b8e80941Smrg abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]); 1481b8e80941Smrg abld.AND(*reg, enabled_mask, coverage_mask); 1482b8e80941Smrg } else { 1483b8e80941Smrg /* In per-pixel mode, the coverage mask is sufficient. */ 1484b8e80941Smrg *reg = coverage_mask; 1485b8e80941Smrg } 1486b8e80941Smrg return reg; 1487b8e80941Smrg} 1488b8e80941Smrg 1489b8e80941Smrgfs_reg 1490b8e80941Smrgfs_visitor::resolve_source_modifiers(const fs_reg &src) 1491b8e80941Smrg{ 1492b8e80941Smrg if (!src.abs && !src.negate) 1493b8e80941Smrg return src; 1494b8e80941Smrg 1495b8e80941Smrg fs_reg temp = bld.vgrf(src.type); 1496b8e80941Smrg bld.MOV(temp, src); 1497b8e80941Smrg 1498b8e80941Smrg return temp; 1499b8e80941Smrg} 1500b8e80941Smrg 1501b8e80941Smrgvoid 1502b8e80941Smrgfs_visitor::emit_discard_jump() 1503b8e80941Smrg{ 1504b8e80941Smrg assert(brw_wm_prog_data(this->prog_data)->uses_kill); 1505b8e80941Smrg 1506b8e80941Smrg /* For performance, after a discard, jump to the end of the 1507b8e80941Smrg * shader if all relevant channels have been discarded. 
1508b8e80941Smrg */ 1509b8e80941Smrg fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP); 1510b8e80941Smrg discard_jump->flag_subreg = 1; 1511b8e80941Smrg 1512b8e80941Smrg discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H; 1513b8e80941Smrg discard_jump->predicate_inverse = true; 1514b8e80941Smrg} 1515b8e80941Smrg 1516b8e80941Smrgvoid 1517b8e80941Smrgfs_visitor::emit_gs_thread_end() 1518b8e80941Smrg{ 1519b8e80941Smrg assert(stage == MESA_SHADER_GEOMETRY); 1520b8e80941Smrg 1521b8e80941Smrg struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 1522b8e80941Smrg 1523b8e80941Smrg if (gs_compile->control_data_header_size_bits > 0) { 1524b8e80941Smrg emit_gs_control_data_bits(this->final_gs_vertex_count); 1525b8e80941Smrg } 1526b8e80941Smrg 1527b8e80941Smrg const fs_builder abld = bld.annotate("thread end"); 1528b8e80941Smrg fs_inst *inst; 1529b8e80941Smrg 1530b8e80941Smrg if (gs_prog_data->static_vertex_count != -1) { 1531b8e80941Smrg foreach_in_list_reverse(fs_inst, prev, &this->instructions) { 1532b8e80941Smrg if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 || 1533b8e80941Smrg prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED || 1534b8e80941Smrg prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT || 1535b8e80941Smrg prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) { 1536b8e80941Smrg prev->eot = true; 1537b8e80941Smrg 1538b8e80941Smrg /* Delete now dead instructions. 
*/ 1539b8e80941Smrg foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) { 1540b8e80941Smrg if (dead == prev) 1541b8e80941Smrg break; 1542b8e80941Smrg dead->remove(); 1543b8e80941Smrg } 1544b8e80941Smrg return; 1545b8e80941Smrg } else if (prev->is_control_flow() || prev->has_side_effects()) { 1546b8e80941Smrg break; 1547b8e80941Smrg } 1548b8e80941Smrg } 1549b8e80941Smrg fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1550b8e80941Smrg abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD))); 1551b8e80941Smrg inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr); 1552b8e80941Smrg inst->mlen = 1; 1553b8e80941Smrg } else { 1554b8e80941Smrg fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2); 1555b8e80941Smrg fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2); 1556b8e80941Smrg sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 1557b8e80941Smrg sources[1] = this->final_gs_vertex_count; 1558b8e80941Smrg abld.LOAD_PAYLOAD(payload, sources, 2, 2); 1559b8e80941Smrg inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload); 1560b8e80941Smrg inst->mlen = 2; 1561b8e80941Smrg } 1562b8e80941Smrg inst->eot = true; 1563b8e80941Smrg inst->offset = 0; 1564b8e80941Smrg} 1565b8e80941Smrg 1566b8e80941Smrgvoid 1567b8e80941Smrgfs_visitor::assign_curb_setup() 1568b8e80941Smrg{ 1569b8e80941Smrg unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8); 1570b8e80941Smrg 1571b8e80941Smrg unsigned ubo_push_length = 0; 1572b8e80941Smrg unsigned ubo_push_start[4]; 1573b8e80941Smrg for (int i = 0; i < 4; i++) { 1574b8e80941Smrg ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length); 1575b8e80941Smrg ubo_push_length += stage_prog_data->ubo_ranges[i].length; 1576b8e80941Smrg } 1577b8e80941Smrg 1578b8e80941Smrg prog_data->curb_read_length = uniform_push_length + ubo_push_length; 1579b8e80941Smrg 1580b8e80941Smrg /* Map the offsets in the UNIFORM file to fixed HW regs. 
*/ 1581b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 1582b8e80941Smrg for (unsigned int i = 0; i < inst->sources; i++) { 1583b8e80941Smrg if (inst->src[i].file == UNIFORM) { 1584b8e80941Smrg int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4; 1585b8e80941Smrg int constant_nr; 1586b8e80941Smrg if (inst->src[i].nr >= UBO_START) { 1587b8e80941Smrg /* constant_nr is in 32-bit units, the rest are in bytes */ 1588b8e80941Smrg constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] + 1589b8e80941Smrg inst->src[i].offset / 4; 1590b8e80941Smrg } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) { 1591b8e80941Smrg constant_nr = push_constant_loc[uniform_nr]; 1592b8e80941Smrg } else { 1593b8e80941Smrg /* Section 5.11 of the OpenGL 4.1 spec says: 1594b8e80941Smrg * "Out-of-bounds reads return undefined values, which include 1595b8e80941Smrg * values from other variables of the active program or zero." 1596b8e80941Smrg * Just return the first push constant. 1597b8e80941Smrg */ 1598b8e80941Smrg constant_nr = 0; 1599b8e80941Smrg } 1600b8e80941Smrg 1601b8e80941Smrg struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs + 1602b8e80941Smrg constant_nr / 8, 1603b8e80941Smrg constant_nr % 8); 1604b8e80941Smrg brw_reg.abs = inst->src[i].abs; 1605b8e80941Smrg brw_reg.negate = inst->src[i].negate; 1606b8e80941Smrg 1607b8e80941Smrg assert(inst->src[i].stride == 0); 1608b8e80941Smrg inst->src[i] = byte_offset( 1609b8e80941Smrg retype(brw_reg, inst->src[i].type), 1610b8e80941Smrg inst->src[i].offset % 4); 1611b8e80941Smrg } 1612b8e80941Smrg } 1613b8e80941Smrg } 1614b8e80941Smrg 1615b8e80941Smrg /* This may be updated in assign_urb_setup or assign_vs_urb_setup. 
*/ 1616b8e80941Smrg this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length; 1617b8e80941Smrg} 1618b8e80941Smrg 1619b8e80941Smrgvoid 1620b8e80941Smrgfs_visitor::calculate_urb_setup() 1621b8e80941Smrg{ 1622b8e80941Smrg assert(stage == MESA_SHADER_FRAGMENT); 1623b8e80941Smrg struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 1624b8e80941Smrg brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; 1625b8e80941Smrg 1626b8e80941Smrg memset(prog_data->urb_setup, -1, 1627b8e80941Smrg sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX); 1628b8e80941Smrg 1629b8e80941Smrg int urb_next = 0; 1630b8e80941Smrg /* Figure out where each of the incoming setup attributes lands. */ 1631b8e80941Smrg if (devinfo->gen >= 6) { 1632b8e80941Smrg if (util_bitcount64(nir->info.inputs_read & 1633b8e80941Smrg BRW_FS_VARYING_INPUT_MASK) <= 16) { 1634b8e80941Smrg /* The SF/SBE pipeline stage can do arbitrary rearrangement of the 1635b8e80941Smrg * first 16 varying inputs, so we can put them wherever we want. 1636b8e80941Smrg * Just put them in order. 1637b8e80941Smrg * 1638b8e80941Smrg * This is useful because it means that (a) inputs not used by the 1639b8e80941Smrg * fragment shader won't take up valuable register space, and (b) we 1640b8e80941Smrg * won't have to recompile the fragment shader if it gets paired with 1641b8e80941Smrg * a different vertex (or geometry) shader. 
1642b8e80941Smrg */ 1643b8e80941Smrg for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { 1644b8e80941Smrg if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK & 1645b8e80941Smrg BITFIELD64_BIT(i)) { 1646b8e80941Smrg prog_data->urb_setup[i] = urb_next++; 1647b8e80941Smrg } 1648b8e80941Smrg } 1649b8e80941Smrg } else { 1650b8e80941Smrg /* We have enough input varyings that the SF/SBE pipeline stage can't 1651b8e80941Smrg * arbitrarily rearrange them to suit our whim; we have to put them 1652b8e80941Smrg * in an order that matches the output of the previous pipeline stage 1653b8e80941Smrg * (geometry or vertex shader). 1654b8e80941Smrg */ 1655b8e80941Smrg struct brw_vue_map prev_stage_vue_map; 1656b8e80941Smrg brw_compute_vue_map(devinfo, &prev_stage_vue_map, 1657b8e80941Smrg key->input_slots_valid, 1658b8e80941Smrg nir->info.separate_shader); 1659b8e80941Smrg 1660b8e80941Smrg int first_slot = 1661b8e80941Smrg brw_compute_first_urb_slot_required(nir->info.inputs_read, 1662b8e80941Smrg &prev_stage_vue_map); 1663b8e80941Smrg 1664b8e80941Smrg assert(prev_stage_vue_map.num_slots <= first_slot + 32); 1665b8e80941Smrg for (int slot = first_slot; slot < prev_stage_vue_map.num_slots; 1666b8e80941Smrg slot++) { 1667b8e80941Smrg int varying = prev_stage_vue_map.slot_to_varying[slot]; 1668b8e80941Smrg if (varying != BRW_VARYING_SLOT_PAD && 1669b8e80941Smrg (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK & 1670b8e80941Smrg BITFIELD64_BIT(varying))) { 1671b8e80941Smrg prog_data->urb_setup[varying] = slot - first_slot; 1672b8e80941Smrg } 1673b8e80941Smrg } 1674b8e80941Smrg urb_next = prev_stage_vue_map.num_slots - first_slot; 1675b8e80941Smrg } 1676b8e80941Smrg } else { 1677b8e80941Smrg /* FINISHME: The sf doesn't map VS->FS inputs for us very well. 
*/ 1678b8e80941Smrg for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { 1679b8e80941Smrg /* Point size is packed into the header, not as a general attribute */ 1680b8e80941Smrg if (i == VARYING_SLOT_PSIZ) 1681b8e80941Smrg continue; 1682b8e80941Smrg 1683b8e80941Smrg if (key->input_slots_valid & BITFIELD64_BIT(i)) { 1684b8e80941Smrg /* The back color slot is skipped when the front color is 1685b8e80941Smrg * also written to. In addition, some slots can be 1686b8e80941Smrg * written in the vertex shader and not read in the 1687b8e80941Smrg * fragment shader. So the register number must always be 1688b8e80941Smrg * incremented, mapped or not. 1689b8e80941Smrg */ 1690b8e80941Smrg if (_mesa_varying_slot_in_fs((gl_varying_slot) i)) 1691b8e80941Smrg prog_data->urb_setup[i] = urb_next; 1692b8e80941Smrg urb_next++; 1693b8e80941Smrg } 1694b8e80941Smrg } 1695b8e80941Smrg 1696b8e80941Smrg /* 1697b8e80941Smrg * It's a FS only attribute, and we did interpolation for this attribute 1698b8e80941Smrg * in SF thread. So, count it here, too. 1699b8e80941Smrg * 1700b8e80941Smrg * See compile_sf_prog() for more info. 1701b8e80941Smrg */ 1702b8e80941Smrg if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC)) 1703b8e80941Smrg prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++; 1704b8e80941Smrg } 1705b8e80941Smrg 1706b8e80941Smrg prog_data->num_varying_inputs = urb_next; 1707b8e80941Smrg} 1708b8e80941Smrg 1709b8e80941Smrgvoid 1710b8e80941Smrgfs_visitor::assign_urb_setup() 1711b8e80941Smrg{ 1712b8e80941Smrg assert(stage == MESA_SHADER_FRAGMENT); 1713b8e80941Smrg struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 1714b8e80941Smrg 1715b8e80941Smrg int urb_start = payload.num_regs + prog_data->base.curb_read_length; 1716b8e80941Smrg 1717b8e80941Smrg /* Offset all the urb_setup[] index by the actual position of the 1718b8e80941Smrg * setup regs, now that the location of the constants has been chosen. 
1719b8e80941Smrg */ 1720b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 1721b8e80941Smrg for (int i = 0; i < inst->sources; i++) { 1722b8e80941Smrg if (inst->src[i].file == ATTR) { 1723b8e80941Smrg /* ATTR regs in the FS are in units of logical scalar inputs each 1724b8e80941Smrg * of which consumes half of a GRF register. 1725b8e80941Smrg */ 1726b8e80941Smrg assert(inst->src[i].offset < REG_SIZE / 2); 1727b8e80941Smrg const unsigned grf = urb_start + inst->src[i].nr / 2; 1728b8e80941Smrg const unsigned offset = (inst->src[i].nr % 2) * (REG_SIZE / 2) + 1729b8e80941Smrg inst->src[i].offset; 1730b8e80941Smrg const unsigned width = inst->src[i].stride == 0 ? 1731b8e80941Smrg 1 : MIN2(inst->exec_size, 8); 1732b8e80941Smrg struct brw_reg reg = stride( 1733b8e80941Smrg byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), 1734b8e80941Smrg offset), 1735b8e80941Smrg width * inst->src[i].stride, 1736b8e80941Smrg width, inst->src[i].stride); 1737b8e80941Smrg reg.abs = inst->src[i].abs; 1738b8e80941Smrg reg.negate = inst->src[i].negate; 1739b8e80941Smrg inst->src[i] = reg; 1740b8e80941Smrg } 1741b8e80941Smrg } 1742b8e80941Smrg } 1743b8e80941Smrg 1744b8e80941Smrg /* Each attribute is 4 setup channels, each of which is half a reg. */ 1745b8e80941Smrg this->first_non_payload_grf += prog_data->num_varying_inputs * 2; 1746b8e80941Smrg} 1747b8e80941Smrg 1748b8e80941Smrgvoid 1749b8e80941Smrgfs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst) 1750b8e80941Smrg{ 1751b8e80941Smrg for (int i = 0; i < inst->sources; i++) { 1752b8e80941Smrg if (inst->src[i].file == ATTR) { 1753b8e80941Smrg int grf = payload.num_regs + 1754b8e80941Smrg prog_data->curb_read_length + 1755b8e80941Smrg inst->src[i].nr + 1756b8e80941Smrg inst->src[i].offset / REG_SIZE; 1757b8e80941Smrg 1758b8e80941Smrg /* As explained at brw_reg_from_fs_reg, From the Haswell PRM: 1759b8e80941Smrg * 1760b8e80941Smrg * VertStride must be used to cross GRF register boundaries. 
This 1761b8e80941Smrg * rule implies that elements within a 'Width' cannot cross GRF 1762b8e80941Smrg * boundaries. 1763b8e80941Smrg * 1764b8e80941Smrg * So, for registers that are large enough, we have to split the exec 1765b8e80941Smrg * size in two and trust the compression state to sort it out. 1766b8e80941Smrg */ 1767b8e80941Smrg unsigned total_size = inst->exec_size * 1768b8e80941Smrg inst->src[i].stride * 1769b8e80941Smrg type_sz(inst->src[i].type); 1770b8e80941Smrg 1771b8e80941Smrg assert(total_size <= 2 * REG_SIZE); 1772b8e80941Smrg const unsigned exec_size = 1773b8e80941Smrg (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2; 1774b8e80941Smrg 1775b8e80941Smrg unsigned width = inst->src[i].stride == 0 ? 1 : exec_size; 1776b8e80941Smrg struct brw_reg reg = 1777b8e80941Smrg stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type), 1778b8e80941Smrg inst->src[i].offset % REG_SIZE), 1779b8e80941Smrg exec_size * inst->src[i].stride, 1780b8e80941Smrg width, inst->src[i].stride); 1781b8e80941Smrg reg.abs = inst->src[i].abs; 1782b8e80941Smrg reg.negate = inst->src[i].negate; 1783b8e80941Smrg 1784b8e80941Smrg inst->src[i] = reg; 1785b8e80941Smrg } 1786b8e80941Smrg } 1787b8e80941Smrg} 1788b8e80941Smrg 1789b8e80941Smrgvoid 1790b8e80941Smrgfs_visitor::assign_vs_urb_setup() 1791b8e80941Smrg{ 1792b8e80941Smrg struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data); 1793b8e80941Smrg 1794b8e80941Smrg assert(stage == MESA_SHADER_VERTEX); 1795b8e80941Smrg 1796b8e80941Smrg /* Each attribute is 4 regs. */ 1797b8e80941Smrg this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots; 1798b8e80941Smrg 1799b8e80941Smrg assert(vs_prog_data->base.urb_read_length <= 15); 1800b8e80941Smrg 1801b8e80941Smrg /* Rewrite all ATTR file references to the hw grf that they land in. 
*/ 1802b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 1803b8e80941Smrg convert_attr_sources_to_hw_regs(inst); 1804b8e80941Smrg } 1805b8e80941Smrg} 1806b8e80941Smrg 1807b8e80941Smrgvoid 1808b8e80941Smrgfs_visitor::assign_tcs_single_patch_urb_setup() 1809b8e80941Smrg{ 1810b8e80941Smrg assert(stage == MESA_SHADER_TESS_CTRL); 1811b8e80941Smrg 1812b8e80941Smrg /* Rewrite all ATTR file references to HW_REGs. */ 1813b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 1814b8e80941Smrg convert_attr_sources_to_hw_regs(inst); 1815b8e80941Smrg } 1816b8e80941Smrg} 1817b8e80941Smrg 1818b8e80941Smrgvoid 1819b8e80941Smrgfs_visitor::assign_tes_urb_setup() 1820b8e80941Smrg{ 1821b8e80941Smrg assert(stage == MESA_SHADER_TESS_EVAL); 1822b8e80941Smrg 1823b8e80941Smrg struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); 1824b8e80941Smrg 1825b8e80941Smrg first_non_payload_grf += 8 * vue_prog_data->urb_read_length; 1826b8e80941Smrg 1827b8e80941Smrg /* Rewrite all ATTR file references to HW_REGs. */ 1828b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 1829b8e80941Smrg convert_attr_sources_to_hw_regs(inst); 1830b8e80941Smrg } 1831b8e80941Smrg} 1832b8e80941Smrg 1833b8e80941Smrgvoid 1834b8e80941Smrgfs_visitor::assign_gs_urb_setup() 1835b8e80941Smrg{ 1836b8e80941Smrg assert(stage == MESA_SHADER_GEOMETRY); 1837b8e80941Smrg 1838b8e80941Smrg struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); 1839b8e80941Smrg 1840b8e80941Smrg first_non_payload_grf += 1841b8e80941Smrg 8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in; 1842b8e80941Smrg 1843b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 1844b8e80941Smrg /* Rewrite all ATTR file references to GRFs. */ 1845b8e80941Smrg convert_attr_sources_to_hw_regs(inst); 1846b8e80941Smrg } 1847b8e80941Smrg} 1848b8e80941Smrg 1849b8e80941Smrg 1850b8e80941Smrg/** 1851b8e80941Smrg * Split large virtual GRFs into separate components if we can. 
1852b8e80941Smrg * 1853b8e80941Smrg * This is mostly duplicated with what brw_fs_vector_splitting does, 1854b8e80941Smrg * but that's really conservative because it's afraid of doing 1855b8e80941Smrg * splitting that doesn't result in real progress after the rest of 1856b8e80941Smrg * the optimization phases, which would cause infinite looping in 1857b8e80941Smrg * optimization. We can do it once here, safely. This also has the 1858b8e80941Smrg * opportunity to split interpolated values, or maybe even uniforms, 1859b8e80941Smrg * which we don't have at the IR level. 1860b8e80941Smrg * 1861b8e80941Smrg * We want to split, because virtual GRFs are what we register 1862b8e80941Smrg * allocate and spill (due to contiguousness requirements for some 1863b8e80941Smrg * instructions), and they're what we naturally generate in the 1864b8e80941Smrg * codegen process, but most virtual GRFs don't actually need to be 1865b8e80941Smrg * contiguous sets of GRFs. If we split, we'll end up with reduced 1866b8e80941Smrg * live intervals and better dead code elimination and coalescing. 1867b8e80941Smrg */ 1868b8e80941Smrgvoid 1869b8e80941Smrgfs_visitor::split_virtual_grfs() 1870b8e80941Smrg{ 1871b8e80941Smrg /* Compact the register file so we eliminate dead vgrfs. This 1872b8e80941Smrg * only defines split points for live registers, so if we have 1873b8e80941Smrg * too large dead registers they will hit assertions later. 1874b8e80941Smrg */ 1875b8e80941Smrg compact_virtual_grfs(); 1876b8e80941Smrg 1877b8e80941Smrg int num_vars = this->alloc.count; 1878b8e80941Smrg 1879b8e80941Smrg /* Count the total number of registers */ 1880b8e80941Smrg int reg_count = 0; 1881b8e80941Smrg int vgrf_to_reg[num_vars]; 1882b8e80941Smrg for (int i = 0; i < num_vars; i++) { 1883b8e80941Smrg vgrf_to_reg[i] = reg_count; 1884b8e80941Smrg reg_count += alloc.sizes[i]; 1885b8e80941Smrg } 1886b8e80941Smrg 1887b8e80941Smrg /* An array of "split points". 
For each register slot, this indicates 1888b8e80941Smrg * if this slot can be separated from the previous slot. Every time an 1889b8e80941Smrg * instruction uses multiple elements of a register (as a source or 1890b8e80941Smrg * destination), we mark the used slots as inseparable. Then we go 1891b8e80941Smrg * through and split the registers into the smallest pieces we can. 1892b8e80941Smrg */ 1893b8e80941Smrg bool *split_points = new bool[reg_count]; 1894b8e80941Smrg memset(split_points, 0, reg_count * sizeof(*split_points)); 1895b8e80941Smrg 1896b8e80941Smrg /* Mark all used registers as fully splittable */ 1897b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 1898b8e80941Smrg if (inst->dst.file == VGRF) { 1899b8e80941Smrg int reg = vgrf_to_reg[inst->dst.nr]; 1900b8e80941Smrg for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++) 1901b8e80941Smrg split_points[reg + j] = true; 1902b8e80941Smrg } 1903b8e80941Smrg 1904b8e80941Smrg for (int i = 0; i < inst->sources; i++) { 1905b8e80941Smrg if (inst->src[i].file == VGRF) { 1906b8e80941Smrg int reg = vgrf_to_reg[inst->src[i].nr]; 1907b8e80941Smrg for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++) 1908b8e80941Smrg split_points[reg + j] = true; 1909b8e80941Smrg } 1910b8e80941Smrg } 1911b8e80941Smrg } 1912b8e80941Smrg 1913b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 1914b8e80941Smrg if (inst->dst.file == VGRF) { 1915b8e80941Smrg int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE; 1916b8e80941Smrg for (unsigned j = 1; j < regs_written(inst); j++) 1917b8e80941Smrg split_points[reg + j] = false; 1918b8e80941Smrg } 1919b8e80941Smrg for (int i = 0; i < inst->sources; i++) { 1920b8e80941Smrg if (inst->src[i].file == VGRF) { 1921b8e80941Smrg int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; 1922b8e80941Smrg for (unsigned j = 1; j < regs_read(inst, i); j++) 1923b8e80941Smrg split_points[reg + j] = false; 1924b8e80941Smrg } 
1925b8e80941Smrg } 1926b8e80941Smrg } 1927b8e80941Smrg 1928b8e80941Smrg int *new_virtual_grf = new int[reg_count]; 1929b8e80941Smrg int *new_reg_offset = new int[reg_count]; 1930b8e80941Smrg 1931b8e80941Smrg int reg = 0; 1932b8e80941Smrg for (int i = 0; i < num_vars; i++) { 1933b8e80941Smrg /* The first one should always be 0 as a quick sanity check. */ 1934b8e80941Smrg assert(split_points[reg] == false); 1935b8e80941Smrg 1936b8e80941Smrg /* j = 0 case */ 1937b8e80941Smrg new_reg_offset[reg] = 0; 1938b8e80941Smrg reg++; 1939b8e80941Smrg int offset = 1; 1940b8e80941Smrg 1941b8e80941Smrg /* j > 0 case */ 1942b8e80941Smrg for (unsigned j = 1; j < alloc.sizes[i]; j++) { 1943b8e80941Smrg /* If this is a split point, reset the offset to 0 and allocate a 1944b8e80941Smrg * new virtual GRF for the previous offset many registers 1945b8e80941Smrg */ 1946b8e80941Smrg if (split_points[reg]) { 1947b8e80941Smrg assert(offset <= MAX_VGRF_SIZE); 1948b8e80941Smrg int grf = alloc.allocate(offset); 1949b8e80941Smrg for (int k = reg - offset; k < reg; k++) 1950b8e80941Smrg new_virtual_grf[k] = grf; 1951b8e80941Smrg offset = 0; 1952b8e80941Smrg } 1953b8e80941Smrg new_reg_offset[reg] = offset; 1954b8e80941Smrg offset++; 1955b8e80941Smrg reg++; 1956b8e80941Smrg } 1957b8e80941Smrg 1958b8e80941Smrg /* The last one gets the original register number */ 1959b8e80941Smrg assert(offset <= MAX_VGRF_SIZE); 1960b8e80941Smrg alloc.sizes[i] = offset; 1961b8e80941Smrg for (int k = reg - offset; k < reg; k++) 1962b8e80941Smrg new_virtual_grf[k] = i; 1963b8e80941Smrg } 1964b8e80941Smrg assert(reg == reg_count); 1965b8e80941Smrg 1966b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 1967b8e80941Smrg if (inst->dst.file == VGRF) { 1968b8e80941Smrg reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE; 1969b8e80941Smrg inst->dst.nr = new_virtual_grf[reg]; 1970b8e80941Smrg inst->dst.offset = new_reg_offset[reg] * REG_SIZE + 1971b8e80941Smrg inst->dst.offset % REG_SIZE; 1972b8e80941Smrg 
assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); 1973b8e80941Smrg } 1974b8e80941Smrg for (int i = 0; i < inst->sources; i++) { 1975b8e80941Smrg if (inst->src[i].file == VGRF) { 1976b8e80941Smrg reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE; 1977b8e80941Smrg inst->src[i].nr = new_virtual_grf[reg]; 1978b8e80941Smrg inst->src[i].offset = new_reg_offset[reg] * REG_SIZE + 1979b8e80941Smrg inst->src[i].offset % REG_SIZE; 1980b8e80941Smrg assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]); 1981b8e80941Smrg } 1982b8e80941Smrg } 1983b8e80941Smrg } 1984b8e80941Smrg invalidate_live_intervals(); 1985b8e80941Smrg 1986b8e80941Smrg delete[] split_points; 1987b8e80941Smrg delete[] new_virtual_grf; 1988b8e80941Smrg delete[] new_reg_offset; 1989b8e80941Smrg} 1990b8e80941Smrg 1991b8e80941Smrg/** 1992b8e80941Smrg * Remove unused virtual GRFs and compact the virtual_grf_* arrays. 1993b8e80941Smrg * 1994b8e80941Smrg * During code generation, we create tons of temporary variables, many of 1995b8e80941Smrg * which get immediately killed and are never used again. Yet, in later 1996b8e80941Smrg * optimization and analysis passes, such as compute_live_intervals, we need 1997b8e80941Smrg * to loop over all the virtual GRFs. Compacting them can save a lot of 1998b8e80941Smrg * overhead. 1999b8e80941Smrg */ 2000b8e80941Smrgbool 2001b8e80941Smrgfs_visitor::compact_virtual_grfs() 2002b8e80941Smrg{ 2003b8e80941Smrg bool progress = false; 2004b8e80941Smrg int *remap_table = new int[this->alloc.count]; 2005b8e80941Smrg memset(remap_table, -1, this->alloc.count * sizeof(int)); 2006b8e80941Smrg 2007b8e80941Smrg /* Mark which virtual GRFs are used. 
*/ 2008b8e80941Smrg foreach_block_and_inst(block, const fs_inst, inst, cfg) { 2009b8e80941Smrg if (inst->dst.file == VGRF) 2010b8e80941Smrg remap_table[inst->dst.nr] = 0; 2011b8e80941Smrg 2012b8e80941Smrg for (int i = 0; i < inst->sources; i++) { 2013b8e80941Smrg if (inst->src[i].file == VGRF) 2014b8e80941Smrg remap_table[inst->src[i].nr] = 0; 2015b8e80941Smrg } 2016b8e80941Smrg } 2017b8e80941Smrg 2018b8e80941Smrg /* Compact the GRF arrays. */ 2019b8e80941Smrg int new_index = 0; 2020b8e80941Smrg for (unsigned i = 0; i < this->alloc.count; i++) { 2021b8e80941Smrg if (remap_table[i] == -1) { 2022b8e80941Smrg /* We just found an unused register. This means that we are 2023b8e80941Smrg * actually going to compact something. 2024b8e80941Smrg */ 2025b8e80941Smrg progress = true; 2026b8e80941Smrg } else { 2027b8e80941Smrg remap_table[i] = new_index; 2028b8e80941Smrg alloc.sizes[new_index] = alloc.sizes[i]; 2029b8e80941Smrg invalidate_live_intervals(); 2030b8e80941Smrg ++new_index; 2031b8e80941Smrg } 2032b8e80941Smrg } 2033b8e80941Smrg 2034b8e80941Smrg this->alloc.count = new_index; 2035b8e80941Smrg 2036b8e80941Smrg /* Patch all the instructions to use the newly renumbered registers */ 2037b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 2038b8e80941Smrg if (inst->dst.file == VGRF) 2039b8e80941Smrg inst->dst.nr = remap_table[inst->dst.nr]; 2040b8e80941Smrg 2041b8e80941Smrg for (int i = 0; i < inst->sources; i++) { 2042b8e80941Smrg if (inst->src[i].file == VGRF) 2043b8e80941Smrg inst->src[i].nr = remap_table[inst->src[i].nr]; 2044b8e80941Smrg } 2045b8e80941Smrg } 2046b8e80941Smrg 2047b8e80941Smrg /* Patch all the references to delta_xy, since they're used in register 2048b8e80941Smrg * allocation. If they're unused, switch them to BAD_FILE so we don't 2049b8e80941Smrg * think some random VGRF is delta_xy. 
2050b8e80941Smrg */ 2051b8e80941Smrg for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { 2052b8e80941Smrg if (delta_xy[i].file == VGRF) { 2053b8e80941Smrg if (remap_table[delta_xy[i].nr] != -1) { 2054b8e80941Smrg delta_xy[i].nr = remap_table[delta_xy[i].nr]; 2055b8e80941Smrg } else { 2056b8e80941Smrg delta_xy[i].file = BAD_FILE; 2057b8e80941Smrg } 2058b8e80941Smrg } 2059b8e80941Smrg } 2060b8e80941Smrg 2061b8e80941Smrg delete[] remap_table; 2062b8e80941Smrg 2063b8e80941Smrg return progress; 2064b8e80941Smrg} 2065b8e80941Smrg 2066b8e80941Smrgstatic int 2067b8e80941Smrgget_subgroup_id_param_index(const brw_stage_prog_data *prog_data) 2068b8e80941Smrg{ 2069b8e80941Smrg if (prog_data->nr_params == 0) 2070b8e80941Smrg return -1; 2071b8e80941Smrg 2072b8e80941Smrg /* The local thread id is always the last parameter in the list */ 2073b8e80941Smrg uint32_t last_param = prog_data->param[prog_data->nr_params - 1]; 2074b8e80941Smrg if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID) 2075b8e80941Smrg return prog_data->nr_params - 1; 2076b8e80941Smrg 2077b8e80941Smrg return -1; 2078b8e80941Smrg} 2079b8e80941Smrg 2080b8e80941Smrg/** 2081b8e80941Smrg * Struct for handling complex alignments. 2082b8e80941Smrg * 2083b8e80941Smrg * A complex alignment is stored as multiplier and an offset. A value is 2084b8e80941Smrg * considered to be aligned if it is {offset} larger than a multiple of {mul}. 
2085b8e80941Smrg * For instance, with an alignment of {8, 2}, cplx_align_apply would do the 2086b8e80941Smrg * following: 2087b8e80941Smrg * 2088b8e80941Smrg * N | cplx_align_apply({8, 2}, N) 2089b8e80941Smrg * ----+----------------------------- 2090b8e80941Smrg * 4 | 6 2091b8e80941Smrg * 6 | 6 2092b8e80941Smrg * 8 | 14 2093b8e80941Smrg * 10 | 14 2094b8e80941Smrg * 12 | 14 2095b8e80941Smrg * 14 | 14 2096b8e80941Smrg * 16 | 22 2097b8e80941Smrg */ 2098b8e80941Smrgstruct cplx_align { 2099b8e80941Smrg unsigned mul:4; 2100b8e80941Smrg unsigned offset:4; 2101b8e80941Smrg}; 2102b8e80941Smrg 2103b8e80941Smrg#define CPLX_ALIGN_MAX_MUL 8 2104b8e80941Smrg 2105b8e80941Smrgstatic void 2106b8e80941Smrgcplx_align_assert_sane(struct cplx_align a) 2107b8e80941Smrg{ 2108b8e80941Smrg assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul)); 2109b8e80941Smrg assert(a.offset < a.mul); 2110b8e80941Smrg} 2111b8e80941Smrg 2112b8e80941Smrg/** 2113b8e80941Smrg * Combines two alignments to produce a least multiple of sorts. 2114b8e80941Smrg * 2115b8e80941Smrg * The returned alignment is the smallest (in terms of multiplier) such that 2116b8e80941Smrg * anything aligned to both a and b will be aligned to the new alignment. 2117b8e80941Smrg * This function will assert-fail if a and b are not compatible, i.e. if the 2118b8e80941Smrg * offset parameters are such that no common alignment is possible. 2119b8e80941Smrg */ 2120b8e80941Smrgstatic struct cplx_align 2121b8e80941Smrgcplx_align_combine(struct cplx_align a, struct cplx_align b) 2122b8e80941Smrg{ 2123b8e80941Smrg cplx_align_assert_sane(a); 2124b8e80941Smrg cplx_align_assert_sane(b); 2125b8e80941Smrg 2126b8e80941Smrg /* Assert that the alignments agree. */ 2127b8e80941Smrg assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1))); 2128b8e80941Smrg 2129b8e80941Smrg return a.mul > b.mul ? 
a : b; 2130b8e80941Smrg} 2131b8e80941Smrg 2132b8e80941Smrg/** 2133b8e80941Smrg * Apply a complex alignment 2134b8e80941Smrg * 2135b8e80941Smrg * This function will return the smallest number greater than or equal to 2136b8e80941Smrg * offset that is aligned to align. 2137b8e80941Smrg */ 2138b8e80941Smrgstatic unsigned 2139b8e80941Smrgcplx_align_apply(struct cplx_align align, unsigned offset) 2140b8e80941Smrg{ 2141b8e80941Smrg return ALIGN(offset - align.offset, align.mul) + align.offset; 2142b8e80941Smrg} 2143b8e80941Smrg 2144b8e80941Smrg#define UNIFORM_SLOT_SIZE 4 2145b8e80941Smrg 2146b8e80941Smrgstruct uniform_slot_info { 2147b8e80941Smrg /** True if the given uniform slot is live */ 2148b8e80941Smrg unsigned is_live:1; 2149b8e80941Smrg 2150b8e80941Smrg /** True if this slot and the next slot must remain contiguous */ 2151b8e80941Smrg unsigned contiguous:1; 2152b8e80941Smrg 2153b8e80941Smrg struct cplx_align align; 2154b8e80941Smrg}; 2155b8e80941Smrg 2156b8e80941Smrgstatic void 2157b8e80941Smrgmark_uniform_slots_read(struct uniform_slot_info *slots, 2158b8e80941Smrg unsigned num_slots, unsigned alignment) 2159b8e80941Smrg{ 2160b8e80941Smrg assert(alignment > 0 && util_is_power_of_two_nonzero(alignment)); 2161b8e80941Smrg assert(alignment <= CPLX_ALIGN_MAX_MUL); 2162b8e80941Smrg 2163b8e80941Smrg /* We can't align a slot to anything less than the slot size */ 2164b8e80941Smrg alignment = MAX2(alignment, UNIFORM_SLOT_SIZE); 2165b8e80941Smrg 2166b8e80941Smrg struct cplx_align align = {alignment, 0}; 2167b8e80941Smrg cplx_align_assert_sane(align); 2168b8e80941Smrg 2169b8e80941Smrg for (unsigned i = 0; i < num_slots; i++) { 2170b8e80941Smrg slots[i].is_live = true; 2171b8e80941Smrg if (i < num_slots - 1) 2172b8e80941Smrg slots[i].contiguous = true; 2173b8e80941Smrg 2174b8e80941Smrg align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1); 2175b8e80941Smrg if (slots[i].align.mul == 0) { 2176b8e80941Smrg slots[i].align = align; 2177b8e80941Smrg } else { 2178b8e80941Smrg 
slots[i].align = cplx_align_combine(slots[i].align, align); 2179b8e80941Smrg } 2180b8e80941Smrg } 2181b8e80941Smrg} 2182b8e80941Smrg 2183b8e80941Smrg/** 2184b8e80941Smrg * Assign UNIFORM file registers to either push constants or pull constants. 2185b8e80941Smrg * 2186b8e80941Smrg * We allow a fragment shader to have more than the specified minimum 2187b8e80941Smrg * maximum number of fragment shader uniform components (64). If 2188b8e80941Smrg * there are too many of these, they'd fill up all of register space. 2189b8e80941Smrg * So, this will push some of them out to the pull constant buffer and 2190b8e80941Smrg * update the program to load them. 2191b8e80941Smrg */ 2192b8e80941Smrgvoid 2193b8e80941Smrgfs_visitor::assign_constant_locations() 2194b8e80941Smrg{ 2195b8e80941Smrg /* Only the first compile gets to decide on locations. */ 2196b8e80941Smrg if (push_constant_loc) { 2197b8e80941Smrg assert(pull_constant_loc); 2198b8e80941Smrg return; 2199b8e80941Smrg } 2200b8e80941Smrg 2201b8e80941Smrg struct uniform_slot_info slots[uniforms]; 2202b8e80941Smrg memset(slots, 0, sizeof(slots)); 2203b8e80941Smrg 2204b8e80941Smrg foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { 2205b8e80941Smrg for (int i = 0 ; i < inst->sources; i++) { 2206b8e80941Smrg if (inst->src[i].file != UNIFORM) 2207b8e80941Smrg continue; 2208b8e80941Smrg 2209b8e80941Smrg /* NIR tightly packs things so the uniform number might not be 2210b8e80941Smrg * aligned (if we have a double right after a float, for instance). 2211b8e80941Smrg * This is fine because the process of re-arranging them will ensure 2212b8e80941Smrg * that things are properly aligned. The offset into that uniform, 2213b8e80941Smrg * however, must be aligned. 2214b8e80941Smrg * 2215b8e80941Smrg * In Vulkan, we have explicit offsets but everything is crammed 2216b8e80941Smrg * into a single "variable" so inst->src[i].nr will always be 0. 2217b8e80941Smrg * Everything will be properly aligned relative to that one base. 
2218b8e80941Smrg */ 2219b8e80941Smrg assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0); 2220b8e80941Smrg 2221b8e80941Smrg unsigned u = inst->src[i].nr + 2222b8e80941Smrg inst->src[i].offset / UNIFORM_SLOT_SIZE; 2223b8e80941Smrg 2224b8e80941Smrg if (u >= uniforms) 2225b8e80941Smrg continue; 2226b8e80941Smrg 2227b8e80941Smrg unsigned slots_read; 2228b8e80941Smrg if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) { 2229b8e80941Smrg slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE); 2230b8e80941Smrg } else { 2231b8e80941Smrg unsigned bytes_read = inst->components_read(i) * 2232b8e80941Smrg type_sz(inst->src[i].type); 2233b8e80941Smrg slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE); 2234b8e80941Smrg } 2235b8e80941Smrg 2236b8e80941Smrg assert(u + slots_read <= uniforms); 2237b8e80941Smrg mark_uniform_slots_read(&slots[u], slots_read, 2238b8e80941Smrg type_sz(inst->src[i].type)); 2239b8e80941Smrg } 2240b8e80941Smrg } 2241b8e80941Smrg 2242b8e80941Smrg int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data); 2243b8e80941Smrg 2244b8e80941Smrg /* Only allow 16 registers (128 uniform components) as push constants. 2245b8e80941Smrg * 2246b8e80941Smrg * Just demote the end of the list. We could probably do better 2247b8e80941Smrg * here, demoting things that are rarely used in the program first. 2248b8e80941Smrg * 2249b8e80941Smrg * If changing this value, note the limitation about total_regs in 2250b8e80941Smrg * brw_curbe.c. 2251b8e80941Smrg */ 2252b8e80941Smrg unsigned int max_push_components = 16 * 8; 2253b8e80941Smrg if (subgroup_id_index >= 0) 2254b8e80941Smrg max_push_components--; /* Save a slot for the thread ID */ 2255b8e80941Smrg 2256b8e80941Smrg /* We push small arrays, but no bigger than 16 floats. This is big enough 2257b8e80941Smrg * for a vec4 but hopefully not large enough to push out other stuff. We 2258b8e80941Smrg * should probably use a better heuristic at some point. 
2259b8e80941Smrg */ 2260b8e80941Smrg const unsigned int max_chunk_size = 16; 2261b8e80941Smrg 2262b8e80941Smrg unsigned int num_push_constants = 0; 2263b8e80941Smrg unsigned int num_pull_constants = 0; 2264b8e80941Smrg 2265b8e80941Smrg push_constant_loc = ralloc_array(mem_ctx, int, uniforms); 2266b8e80941Smrg pull_constant_loc = ralloc_array(mem_ctx, int, uniforms); 2267b8e80941Smrg 2268b8e80941Smrg /* Default to -1 meaning no location */ 2269b8e80941Smrg memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc)); 2270b8e80941Smrg memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc)); 2271b8e80941Smrg 2272b8e80941Smrg int chunk_start = -1; 2273b8e80941Smrg struct cplx_align align; 2274b8e80941Smrg for (unsigned u = 0; u < uniforms; u++) { 2275b8e80941Smrg if (!slots[u].is_live) { 2276b8e80941Smrg assert(chunk_start == -1); 2277b8e80941Smrg continue; 2278b8e80941Smrg } 2279b8e80941Smrg 2280b8e80941Smrg /* Skip subgroup_id_index to put it in the last push register. */ 2281b8e80941Smrg if (subgroup_id_index == (int)u) 2282b8e80941Smrg continue; 2283b8e80941Smrg 2284b8e80941Smrg if (chunk_start == -1) { 2285b8e80941Smrg chunk_start = u; 2286b8e80941Smrg align = slots[u].align; 2287b8e80941Smrg } else { 2288b8e80941Smrg /* Offset into the chunk */ 2289b8e80941Smrg unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE; 2290b8e80941Smrg 2291b8e80941Smrg /* Shift the slot alignment down by the chunk offset so it is 2292b8e80941Smrg * comparable with the base chunk alignment. 
2293b8e80941Smrg */ 2294b8e80941Smrg struct cplx_align slot_align = slots[u].align; 2295b8e80941Smrg slot_align.offset = 2296b8e80941Smrg (slot_align.offset - chunk_offset) & (align.mul - 1); 2297b8e80941Smrg 2298b8e80941Smrg align = cplx_align_combine(align, slot_align); 2299b8e80941Smrg } 2300b8e80941Smrg 2301b8e80941Smrg /* Sanity check the alignment */ 2302b8e80941Smrg cplx_align_assert_sane(align); 2303b8e80941Smrg 2304b8e80941Smrg if (slots[u].contiguous) 2305b8e80941Smrg continue; 2306b8e80941Smrg 2307b8e80941Smrg /* Adjust the alignment to be in terms of slots, not bytes */ 2308b8e80941Smrg assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0); 2309b8e80941Smrg assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0); 2310b8e80941Smrg align.mul /= UNIFORM_SLOT_SIZE; 2311b8e80941Smrg align.offset /= UNIFORM_SLOT_SIZE; 2312b8e80941Smrg 2313b8e80941Smrg unsigned push_start_align = cplx_align_apply(align, num_push_constants); 2314b8e80941Smrg unsigned chunk_size = u - chunk_start + 1; 2315b8e80941Smrg if ((!compiler->supports_pull_constants && u < UBO_START) || 2316b8e80941Smrg (chunk_size < max_chunk_size && 2317b8e80941Smrg push_start_align + chunk_size <= max_push_components)) { 2318b8e80941Smrg /* Align up the number of push constants */ 2319b8e80941Smrg num_push_constants = push_start_align; 2320b8e80941Smrg for (unsigned i = 0; i < chunk_size; i++) 2321b8e80941Smrg push_constant_loc[chunk_start + i] = num_push_constants++; 2322b8e80941Smrg } else { 2323b8e80941Smrg /* We need to pull this one */ 2324b8e80941Smrg num_pull_constants = cplx_align_apply(align, num_pull_constants); 2325b8e80941Smrg for (unsigned i = 0; i < chunk_size; i++) 2326b8e80941Smrg pull_constant_loc[chunk_start + i] = num_pull_constants++; 2327b8e80941Smrg } 2328b8e80941Smrg 2329b8e80941Smrg /* Reset the chunk and start again */ 2330b8e80941Smrg chunk_start = -1; 2331b8e80941Smrg } 2332b8e80941Smrg 2333b8e80941Smrg /* Add the CS local thread ID uniform at the end of the push constants */ 
2334b8e80941Smrg if (subgroup_id_index >= 0) 2335b8e80941Smrg push_constant_loc[subgroup_id_index] = num_push_constants++; 2336b8e80941Smrg 2337b8e80941Smrg /* As the uniforms are going to be reordered, stash the old array and 2338b8e80941Smrg * create two new arrays for push/pull params. 2339b8e80941Smrg */ 2340b8e80941Smrg uint32_t *param = stage_prog_data->param; 2341b8e80941Smrg stage_prog_data->nr_params = num_push_constants; 2342b8e80941Smrg if (num_push_constants) { 2343b8e80941Smrg stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t, 2344b8e80941Smrg num_push_constants); 2345b8e80941Smrg } else { 2346b8e80941Smrg stage_prog_data->param = NULL; 2347b8e80941Smrg } 2348b8e80941Smrg assert(stage_prog_data->nr_pull_params == 0); 2349b8e80941Smrg assert(stage_prog_data->pull_param == NULL); 2350b8e80941Smrg if (num_pull_constants > 0) { 2351b8e80941Smrg stage_prog_data->nr_pull_params = num_pull_constants; 2352b8e80941Smrg stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t, 2353b8e80941Smrg num_pull_constants); 2354b8e80941Smrg } 2355b8e80941Smrg 2356b8e80941Smrg /* Now that we know how many regular uniforms we'll push, reduce the 2357b8e80941Smrg * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits. 2358b8e80941Smrg */ 2359b8e80941Smrg unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8); 2360b8e80941Smrg for (int i = 0; i < 4; i++) { 2361b8e80941Smrg struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; 2362b8e80941Smrg 2363b8e80941Smrg if (push_length + range->length > 64) 2364b8e80941Smrg range->length = 64 - push_length; 2365b8e80941Smrg 2366b8e80941Smrg push_length += range->length; 2367b8e80941Smrg } 2368b8e80941Smrg assert(push_length <= 64); 2369b8e80941Smrg 2370b8e80941Smrg /* Up until now, the param[] array has been indexed by reg + offset 2371b8e80941Smrg * of UNIFORM registers. Move pull constants into pull_param[] and 2372b8e80941Smrg * condense param[] to only contain the uniforms we chose to push. 
2373b8e80941Smrg * 2374b8e80941Smrg * NOTE: Because we are condensing the params[] array, we know that 2375b8e80941Smrg * push_constant_loc[i] <= i and we can do it in one smooth loop without 2376b8e80941Smrg * having to make a copy. 2377b8e80941Smrg */ 2378b8e80941Smrg for (unsigned int i = 0; i < uniforms; i++) { 2379b8e80941Smrg uint32_t value = param[i]; 2380b8e80941Smrg if (pull_constant_loc[i] != -1) { 2381b8e80941Smrg stage_prog_data->pull_param[pull_constant_loc[i]] = value; 2382b8e80941Smrg } else if (push_constant_loc[i] != -1) { 2383b8e80941Smrg stage_prog_data->param[push_constant_loc[i]] = value; 2384b8e80941Smrg } 2385b8e80941Smrg } 2386b8e80941Smrg ralloc_free(param); 2387b8e80941Smrg} 2388b8e80941Smrg 2389b8e80941Smrgbool 2390b8e80941Smrgfs_visitor::get_pull_locs(const fs_reg &src, 2391b8e80941Smrg unsigned *out_surf_index, 2392b8e80941Smrg unsigned *out_pull_index) 2393b8e80941Smrg{ 2394b8e80941Smrg assert(src.file == UNIFORM); 2395b8e80941Smrg 2396b8e80941Smrg if (src.nr >= UBO_START) { 2397b8e80941Smrg const struct brw_ubo_range *range = 2398b8e80941Smrg &prog_data->ubo_ranges[src.nr - UBO_START]; 2399b8e80941Smrg 2400b8e80941Smrg /* If this access is in our (reduced) range, use the push data. 
*/ 2401b8e80941Smrg if (src.offset / 32 < range->length) 2402b8e80941Smrg return false; 2403b8e80941Smrg 2404b8e80941Smrg *out_surf_index = prog_data->binding_table.ubo_start + range->block; 2405b8e80941Smrg *out_pull_index = (32 * range->start + src.offset) / 4; 2406b8e80941Smrg return true; 2407b8e80941Smrg } 2408b8e80941Smrg 2409b8e80941Smrg const unsigned location = src.nr + src.offset / 4; 2410b8e80941Smrg 2411b8e80941Smrg if (location < uniforms && pull_constant_loc[location] != -1) { 2412b8e80941Smrg /* A regular uniform push constant */ 2413b8e80941Smrg *out_surf_index = stage_prog_data->binding_table.pull_constants_start; 2414b8e80941Smrg *out_pull_index = pull_constant_loc[location]; 2415b8e80941Smrg return true; 2416b8e80941Smrg } 2417b8e80941Smrg 2418b8e80941Smrg return false; 2419b8e80941Smrg} 2420b8e80941Smrg 2421b8e80941Smrg/** 2422b8e80941Smrg * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD 2423b8e80941Smrg * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. 2424b8e80941Smrg */ 2425b8e80941Smrgvoid 2426b8e80941Smrgfs_visitor::lower_constant_loads() 2427b8e80941Smrg{ 2428b8e80941Smrg unsigned index, pull_index; 2429b8e80941Smrg 2430b8e80941Smrg foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { 2431b8e80941Smrg /* Set up the annotation tracking for new generated instructions. 
*/ 2432b8e80941Smrg const fs_builder ibld(this, block, inst); 2433b8e80941Smrg 2434b8e80941Smrg for (int i = 0; i < inst->sources; i++) { 2435b8e80941Smrg if (inst->src[i].file != UNIFORM) 2436b8e80941Smrg continue; 2437b8e80941Smrg 2438b8e80941Smrg /* We'll handle this case later */ 2439b8e80941Smrg if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) 2440b8e80941Smrg continue; 2441b8e80941Smrg 2442b8e80941Smrg if (!get_pull_locs(inst->src[i], &index, &pull_index)) 2443b8e80941Smrg continue; 2444b8e80941Smrg 2445b8e80941Smrg assert(inst->src[i].stride == 0); 2446b8e80941Smrg 2447b8e80941Smrg const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ 2448b8e80941Smrg const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0); 2449b8e80941Smrg const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); 2450b8e80941Smrg const unsigned base = pull_index * 4; 2451b8e80941Smrg 2452b8e80941Smrg ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 2453b8e80941Smrg dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1))); 2454b8e80941Smrg 2455b8e80941Smrg /* Rewrite the instruction to use the temporary VGRF. 
*/ 2456b8e80941Smrg inst->src[i].file = VGRF; 2457b8e80941Smrg inst->src[i].nr = dst.nr; 2458b8e80941Smrg inst->src[i].offset = (base & (block_sz - 1)) + 2459b8e80941Smrg inst->src[i].offset % 4; 2460b8e80941Smrg } 2461b8e80941Smrg 2462b8e80941Smrg if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && 2463b8e80941Smrg inst->src[0].file == UNIFORM) { 2464b8e80941Smrg 2465b8e80941Smrg if (!get_pull_locs(inst->src[0], &index, &pull_index)) 2466b8e80941Smrg continue; 2467b8e80941Smrg 2468b8e80941Smrg VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, 2469b8e80941Smrg brw_imm_ud(index), 2470b8e80941Smrg inst->src[1], 2471b8e80941Smrg pull_index * 4); 2472b8e80941Smrg inst->remove(block); 2473b8e80941Smrg } 2474b8e80941Smrg } 2475b8e80941Smrg invalidate_live_intervals(); 2476b8e80941Smrg} 2477b8e80941Smrg 2478b8e80941Smrgbool 2479b8e80941Smrgfs_visitor::opt_algebraic() 2480b8e80941Smrg{ 2481b8e80941Smrg bool progress = false; 2482b8e80941Smrg 2483b8e80941Smrg foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { 2484b8e80941Smrg switch (inst->opcode) { 2485b8e80941Smrg case BRW_OPCODE_MOV: 2486b8e80941Smrg if (!devinfo->has_64bit_types && 2487b8e80941Smrg (inst->dst.type == BRW_REGISTER_TYPE_DF || 2488b8e80941Smrg inst->dst.type == BRW_REGISTER_TYPE_UQ || 2489b8e80941Smrg inst->dst.type == BRW_REGISTER_TYPE_Q)) { 2490b8e80941Smrg assert(inst->dst.type == inst->src[0].type); 2491b8e80941Smrg assert(!inst->saturate); 2492b8e80941Smrg assert(!inst->src[0].abs); 2493b8e80941Smrg assert(!inst->src[0].negate); 2494b8e80941Smrg const brw::fs_builder ibld(this, block, inst); 2495b8e80941Smrg 2496b8e80941Smrg if (inst->src[0].file == IMM) { 2497b8e80941Smrg ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), 2498b8e80941Smrg brw_imm_ud(inst->src[0].u64 >> 32)); 2499b8e80941Smrg ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), 2500b8e80941Smrg brw_imm_ud(inst->src[0].u64)); 2501b8e80941Smrg } else { 2502b8e80941Smrg ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), 
2503b8e80941Smrg subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1)); 2504b8e80941Smrg ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), 2505b8e80941Smrg subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0)); 2506b8e80941Smrg } 2507b8e80941Smrg 2508b8e80941Smrg inst->remove(block); 2509b8e80941Smrg progress = true; 2510b8e80941Smrg } 2511b8e80941Smrg 2512b8e80941Smrg if ((inst->conditional_mod == BRW_CONDITIONAL_Z || 2513b8e80941Smrg inst->conditional_mod == BRW_CONDITIONAL_NZ) && 2514b8e80941Smrg inst->dst.is_null() && 2515b8e80941Smrg (inst->src[0].abs || inst->src[0].negate)) { 2516b8e80941Smrg inst->src[0].abs = false; 2517b8e80941Smrg inst->src[0].negate = false; 2518b8e80941Smrg progress = true; 2519b8e80941Smrg break; 2520b8e80941Smrg } 2521b8e80941Smrg 2522b8e80941Smrg if (inst->src[0].file != IMM) 2523b8e80941Smrg break; 2524b8e80941Smrg 2525b8e80941Smrg if (inst->saturate) { 2526b8e80941Smrg /* Full mixed-type saturates don't happen. However, we can end up 2527b8e80941Smrg * with things like: 2528b8e80941Smrg * 2529b8e80941Smrg * mov.sat(8) g21<1>DF -1F 2530b8e80941Smrg * 2531b8e80941Smrg * Other mixed-size-but-same-base-type cases may also be possible. 
2532b8e80941Smrg */ 2533b8e80941Smrg if (inst->dst.type != inst->src[0].type && 2534b8e80941Smrg inst->dst.type != BRW_REGISTER_TYPE_DF && 2535b8e80941Smrg inst->src[0].type != BRW_REGISTER_TYPE_F) 2536b8e80941Smrg assert(!"unimplemented: saturate mixed types"); 2537b8e80941Smrg 2538b8e80941Smrg if (brw_saturate_immediate(inst->src[0].type, 2539b8e80941Smrg &inst->src[0].as_brw_reg())) { 2540b8e80941Smrg inst->saturate = false; 2541b8e80941Smrg progress = true; 2542b8e80941Smrg } 2543b8e80941Smrg } 2544b8e80941Smrg break; 2545b8e80941Smrg 2546b8e80941Smrg case BRW_OPCODE_MUL: 2547b8e80941Smrg if (inst->src[1].file != IMM) 2548b8e80941Smrg continue; 2549b8e80941Smrg 2550b8e80941Smrg /* a * 1.0 = a */ 2551b8e80941Smrg if (inst->src[1].is_one()) { 2552b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2553b8e80941Smrg inst->src[1] = reg_undef; 2554b8e80941Smrg progress = true; 2555b8e80941Smrg break; 2556b8e80941Smrg } 2557b8e80941Smrg 2558b8e80941Smrg /* a * -1.0 = -a */ 2559b8e80941Smrg if (inst->src[1].is_negative_one()) { 2560b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2561b8e80941Smrg inst->src[0].negate = !inst->src[0].negate; 2562b8e80941Smrg inst->src[1] = reg_undef; 2563b8e80941Smrg progress = true; 2564b8e80941Smrg break; 2565b8e80941Smrg } 2566b8e80941Smrg 2567b8e80941Smrg if (inst->src[0].file == IMM) { 2568b8e80941Smrg assert(inst->src[0].type == BRW_REGISTER_TYPE_F); 2569b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2570b8e80941Smrg inst->src[0].f *= inst->src[1].f; 2571b8e80941Smrg inst->src[1] = reg_undef; 2572b8e80941Smrg progress = true; 2573b8e80941Smrg break; 2574b8e80941Smrg } 2575b8e80941Smrg break; 2576b8e80941Smrg case BRW_OPCODE_ADD: 2577b8e80941Smrg if (inst->src[1].file != IMM) 2578b8e80941Smrg continue; 2579b8e80941Smrg 2580b8e80941Smrg if (inst->src[0].file == IMM) { 2581b8e80941Smrg assert(inst->src[0].type == BRW_REGISTER_TYPE_F); 2582b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2583b8e80941Smrg inst->src[0].f += inst->src[1].f; 2584b8e80941Smrg 
inst->src[1] = reg_undef; 2585b8e80941Smrg progress = true; 2586b8e80941Smrg break; 2587b8e80941Smrg } 2588b8e80941Smrg break; 2589b8e80941Smrg case BRW_OPCODE_OR: 2590b8e80941Smrg if (inst->src[0].equals(inst->src[1]) || 2591b8e80941Smrg inst->src[1].is_zero()) { 2592b8e80941Smrg /* On Gen8+, the OR instruction can have a source modifier that 2593b8e80941Smrg * performs logical not on the operand. Cases of 'OR r0, ~r1, 0' 2594b8e80941Smrg * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV. 2595b8e80941Smrg */ 2596b8e80941Smrg if (inst->src[0].negate) { 2597b8e80941Smrg inst->opcode = BRW_OPCODE_NOT; 2598b8e80941Smrg inst->src[0].negate = false; 2599b8e80941Smrg } else { 2600b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2601b8e80941Smrg } 2602b8e80941Smrg inst->src[1] = reg_undef; 2603b8e80941Smrg progress = true; 2604b8e80941Smrg break; 2605b8e80941Smrg } 2606b8e80941Smrg break; 2607b8e80941Smrg case BRW_OPCODE_CMP: 2608b8e80941Smrg if ((inst->conditional_mod == BRW_CONDITIONAL_Z || 2609b8e80941Smrg inst->conditional_mod == BRW_CONDITIONAL_NZ) && 2610b8e80941Smrg inst->src[1].is_zero() && 2611b8e80941Smrg (inst->src[0].abs || inst->src[0].negate)) { 2612b8e80941Smrg inst->src[0].abs = false; 2613b8e80941Smrg inst->src[0].negate = false; 2614b8e80941Smrg progress = true; 2615b8e80941Smrg break; 2616b8e80941Smrg } 2617b8e80941Smrg break; 2618b8e80941Smrg case BRW_OPCODE_SEL: 2619b8e80941Smrg if (!devinfo->has_64bit_types && 2620b8e80941Smrg (inst->dst.type == BRW_REGISTER_TYPE_DF || 2621b8e80941Smrg inst->dst.type == BRW_REGISTER_TYPE_UQ || 2622b8e80941Smrg inst->dst.type == BRW_REGISTER_TYPE_Q)) { 2623b8e80941Smrg assert(inst->dst.type == inst->src[0].type); 2624b8e80941Smrg assert(!inst->saturate); 2625b8e80941Smrg assert(!inst->src[0].abs && !inst->src[0].negate); 2626b8e80941Smrg assert(!inst->src[1].abs && !inst->src[1].negate); 2627b8e80941Smrg const brw::fs_builder ibld(this, block, inst); 2628b8e80941Smrg 2629b8e80941Smrg 
set_predicate(inst->predicate, 2630b8e80941Smrg ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0), 2631b8e80941Smrg subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0), 2632b8e80941Smrg subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0))); 2633b8e80941Smrg set_predicate(inst->predicate, 2634b8e80941Smrg ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1), 2635b8e80941Smrg subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1), 2636b8e80941Smrg subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1))); 2637b8e80941Smrg 2638b8e80941Smrg inst->remove(block); 2639b8e80941Smrg progress = true; 2640b8e80941Smrg } 2641b8e80941Smrg if (inst->src[0].equals(inst->src[1])) { 2642b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2643b8e80941Smrg inst->src[1] = reg_undef; 2644b8e80941Smrg inst->predicate = BRW_PREDICATE_NONE; 2645b8e80941Smrg inst->predicate_inverse = false; 2646b8e80941Smrg progress = true; 2647b8e80941Smrg } else if (inst->saturate && inst->src[1].file == IMM) { 2648b8e80941Smrg switch (inst->conditional_mod) { 2649b8e80941Smrg case BRW_CONDITIONAL_LE: 2650b8e80941Smrg case BRW_CONDITIONAL_L: 2651b8e80941Smrg switch (inst->src[1].type) { 2652b8e80941Smrg case BRW_REGISTER_TYPE_F: 2653b8e80941Smrg if (inst->src[1].f >= 1.0f) { 2654b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2655b8e80941Smrg inst->src[1] = reg_undef; 2656b8e80941Smrg inst->conditional_mod = BRW_CONDITIONAL_NONE; 2657b8e80941Smrg progress = true; 2658b8e80941Smrg } 2659b8e80941Smrg break; 2660b8e80941Smrg default: 2661b8e80941Smrg break; 2662b8e80941Smrg } 2663b8e80941Smrg break; 2664b8e80941Smrg case BRW_CONDITIONAL_GE: 2665b8e80941Smrg case BRW_CONDITIONAL_G: 2666b8e80941Smrg switch (inst->src[1].type) { 2667b8e80941Smrg case BRW_REGISTER_TYPE_F: 2668b8e80941Smrg if (inst->src[1].f <= 0.0f) { 2669b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2670b8e80941Smrg inst->src[1] = reg_undef; 2671b8e80941Smrg inst->conditional_mod = BRW_CONDITIONAL_NONE; 2672b8e80941Smrg progress = true; 2673b8e80941Smrg } 
2674b8e80941Smrg break; 2675b8e80941Smrg default: 2676b8e80941Smrg break; 2677b8e80941Smrg } 2678b8e80941Smrg default: 2679b8e80941Smrg break; 2680b8e80941Smrg } 2681b8e80941Smrg } 2682b8e80941Smrg break; 2683b8e80941Smrg case BRW_OPCODE_MAD: 2684b8e80941Smrg if (inst->src[0].type != BRW_REGISTER_TYPE_F || 2685b8e80941Smrg inst->src[1].type != BRW_REGISTER_TYPE_F || 2686b8e80941Smrg inst->src[2].type != BRW_REGISTER_TYPE_F) 2687b8e80941Smrg break; 2688b8e80941Smrg if (inst->src[1].is_one()) { 2689b8e80941Smrg inst->opcode = BRW_OPCODE_ADD; 2690b8e80941Smrg inst->src[1] = inst->src[2]; 2691b8e80941Smrg inst->src[2] = reg_undef; 2692b8e80941Smrg progress = true; 2693b8e80941Smrg } else if (inst->src[2].is_one()) { 2694b8e80941Smrg inst->opcode = BRW_OPCODE_ADD; 2695b8e80941Smrg inst->src[2] = reg_undef; 2696b8e80941Smrg progress = true; 2697b8e80941Smrg } 2698b8e80941Smrg break; 2699b8e80941Smrg case SHADER_OPCODE_BROADCAST: 2700b8e80941Smrg if (is_uniform(inst->src[0])) { 2701b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2702b8e80941Smrg inst->sources = 1; 2703b8e80941Smrg inst->force_writemask_all = true; 2704b8e80941Smrg progress = true; 2705b8e80941Smrg } else if (inst->src[1].file == IMM) { 2706b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2707b8e80941Smrg /* It's possible that the selected component will be too large and 2708b8e80941Smrg * overflow the register. This can happen if someone does a 2709b8e80941Smrg * readInvocation() from GLSL or SPIR-V and provides an OOB 2710b8e80941Smrg * invocationIndex. If this happens and we some how manage 2711b8e80941Smrg * to constant fold it in and get here, then component() may cause 2712b8e80941Smrg * us to start reading outside of the VGRF which will lead to an 2713b8e80941Smrg * assert later. Instead, just let it wrap around if it goes over 2714b8e80941Smrg * exec_size. 
2715b8e80941Smrg */ 2716b8e80941Smrg const unsigned comp = inst->src[1].ud & (inst->exec_size - 1); 2717b8e80941Smrg inst->src[0] = component(inst->src[0], comp); 2718b8e80941Smrg inst->sources = 1; 2719b8e80941Smrg inst->force_writemask_all = true; 2720b8e80941Smrg progress = true; 2721b8e80941Smrg } 2722b8e80941Smrg break; 2723b8e80941Smrg 2724b8e80941Smrg case SHADER_OPCODE_SHUFFLE: 2725b8e80941Smrg if (is_uniform(inst->src[0])) { 2726b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2727b8e80941Smrg inst->sources = 1; 2728b8e80941Smrg progress = true; 2729b8e80941Smrg } else if (inst->src[1].file == IMM) { 2730b8e80941Smrg inst->opcode = BRW_OPCODE_MOV; 2731b8e80941Smrg inst->src[0] = component(inst->src[0], 2732b8e80941Smrg inst->src[1].ud); 2733b8e80941Smrg inst->sources = 1; 2734b8e80941Smrg progress = true; 2735b8e80941Smrg } 2736b8e80941Smrg break; 2737b8e80941Smrg 2738b8e80941Smrg default: 2739b8e80941Smrg break; 2740b8e80941Smrg } 2741b8e80941Smrg 2742b8e80941Smrg /* Swap if src[0] is immediate. */ 2743b8e80941Smrg if (progress && inst->is_commutative()) { 2744b8e80941Smrg if (inst->src[0].file == IMM) { 2745b8e80941Smrg fs_reg tmp = inst->src[1]; 2746b8e80941Smrg inst->src[1] = inst->src[0]; 2747b8e80941Smrg inst->src[0] = tmp; 2748b8e80941Smrg } 2749b8e80941Smrg } 2750b8e80941Smrg } 2751b8e80941Smrg return progress; 2752b8e80941Smrg} 2753b8e80941Smrg 2754b8e80941Smrg/** 2755b8e80941Smrg * Optimize sample messages that have constant zero values for the trailing 2756b8e80941Smrg * texture coordinates. We can just reduce the message length for these 2757b8e80941Smrg * instructions instead of reserving a register for it. Trailing parameters 2758b8e80941Smrg * that aren't sent default to zero anyway. This will cause the dead code 2759b8e80941Smrg * eliminator to remove the MOV instruction that would otherwise be emitted to 2760b8e80941Smrg * set up the zero value. 
2761b8e80941Smrg */ 2762b8e80941Smrgbool 2763b8e80941Smrgfs_visitor::opt_zero_samples() 2764b8e80941Smrg{ 2765b8e80941Smrg /* Gen4 infers the texturing opcode based on the message length so we can't 2766b8e80941Smrg * change it. 2767b8e80941Smrg */ 2768b8e80941Smrg if (devinfo->gen < 5) 2769b8e80941Smrg return false; 2770b8e80941Smrg 2771b8e80941Smrg bool progress = false; 2772b8e80941Smrg 2773b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 2774b8e80941Smrg if (!inst->is_tex()) 2775b8e80941Smrg continue; 2776b8e80941Smrg 2777b8e80941Smrg fs_inst *load_payload = (fs_inst *) inst->prev; 2778b8e80941Smrg 2779b8e80941Smrg if (load_payload->is_head_sentinel() || 2780b8e80941Smrg load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD) 2781b8e80941Smrg continue; 2782b8e80941Smrg 2783b8e80941Smrg /* We don't want to remove the message header or the first parameter. 2784b8e80941Smrg * Removing the first parameter is not allowed, see the Haswell PRM 2785b8e80941Smrg * volume 7, page 149: 2786b8e80941Smrg * 2787b8e80941Smrg * "Parameter 0 is required except for the sampleinfo message, which 2788b8e80941Smrg * has no parameter 0" 2789b8e80941Smrg */ 2790b8e80941Smrg while (inst->mlen > inst->header_size + inst->exec_size / 8 && 2791b8e80941Smrg load_payload->src[(inst->mlen - inst->header_size) / 2792b8e80941Smrg (inst->exec_size / 8) + 2793b8e80941Smrg inst->header_size - 1].is_zero()) { 2794b8e80941Smrg inst->mlen -= inst->exec_size / 8; 2795b8e80941Smrg progress = true; 2796b8e80941Smrg } 2797b8e80941Smrg } 2798b8e80941Smrg 2799b8e80941Smrg if (progress) 2800b8e80941Smrg invalidate_live_intervals(); 2801b8e80941Smrg 2802b8e80941Smrg return progress; 2803b8e80941Smrg} 2804b8e80941Smrg 2805b8e80941Smrg/** 2806b8e80941Smrg * Optimize sample messages which are followed by the final RT write. 
2807b8e80941Smrg * 2808b8e80941Smrg * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its 2809b8e80941Smrg * results sent directly to the framebuffer, bypassing the EU. Recognize the 2810b8e80941Smrg * final texturing results copied to the framebuffer write payload and modify 2811b8e80941Smrg * them to write to the framebuffer directly. 2812b8e80941Smrg */ 2813b8e80941Smrgbool 2814b8e80941Smrgfs_visitor::opt_sampler_eot() 2815b8e80941Smrg{ 2816b8e80941Smrg brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; 2817b8e80941Smrg 2818b8e80941Smrg if (stage != MESA_SHADER_FRAGMENT || dispatch_width > 16) 2819b8e80941Smrg return false; 2820b8e80941Smrg 2821b8e80941Smrg if (devinfo->gen != 9 && !devinfo->is_cherryview) 2822b8e80941Smrg return false; 2823b8e80941Smrg 2824b8e80941Smrg /* FINISHME: It should be possible to implement this optimization when there 2825b8e80941Smrg * are multiple drawbuffers. 2826b8e80941Smrg */ 2827b8e80941Smrg if (key->nr_color_regions != 1) 2828b8e80941Smrg return false; 2829b8e80941Smrg 2830b8e80941Smrg /* Requires emitting a bunch of saturating MOV instructions during logical 2831b8e80941Smrg * send lowering to clamp the color payload, which the sampler unit isn't 2832b8e80941Smrg * going to do for us. 2833b8e80941Smrg */ 2834b8e80941Smrg if (key->clamp_fragment_color) 2835b8e80941Smrg return false; 2836b8e80941Smrg 2837b8e80941Smrg /* Look for a texturing instruction immediately before the final FB_WRITE. */ 2838b8e80941Smrg bblock_t *block = cfg->blocks[cfg->num_blocks - 1]; 2839b8e80941Smrg fs_inst *fb_write = (fs_inst *)block->end(); 2840b8e80941Smrg assert(fb_write->eot); 2841b8e80941Smrg assert(fb_write->opcode == FS_OPCODE_FB_WRITE_LOGICAL); 2842b8e80941Smrg 2843b8e80941Smrg /* There wasn't one; nothing to do. 
*/ 2844b8e80941Smrg if (unlikely(fb_write->prev->is_head_sentinel())) 2845b8e80941Smrg return false; 2846b8e80941Smrg 2847b8e80941Smrg fs_inst *tex_inst = (fs_inst *) fb_write->prev; 2848b8e80941Smrg 2849b8e80941Smrg /* 3D Sampler » Messages » Message Format 2850b8e80941Smrg * 2851b8e80941Smrg * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler 2852b8e80941Smrg * messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*” 2853b8e80941Smrg */ 2854b8e80941Smrg if (tex_inst->opcode != SHADER_OPCODE_TEX_LOGICAL && 2855b8e80941Smrg tex_inst->opcode != SHADER_OPCODE_TXD_LOGICAL && 2856b8e80941Smrg tex_inst->opcode != SHADER_OPCODE_TXF_LOGICAL && 2857b8e80941Smrg tex_inst->opcode != SHADER_OPCODE_TXL_LOGICAL && 2858b8e80941Smrg tex_inst->opcode != FS_OPCODE_TXB_LOGICAL && 2859b8e80941Smrg tex_inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL && 2860b8e80941Smrg tex_inst->opcode != SHADER_OPCODE_TXF_CMS_W_LOGICAL && 2861b8e80941Smrg tex_inst->opcode != SHADER_OPCODE_TXF_UMS_LOGICAL) 2862b8e80941Smrg return false; 2863b8e80941Smrg 2864b8e80941Smrg /* XXX - This shouldn't be necessary. */ 2865b8e80941Smrg if (tex_inst->prev->is_head_sentinel()) 2866b8e80941Smrg return false; 2867b8e80941Smrg 2868b8e80941Smrg /* Check that the FB write sources are fully initialized by the single 2869b8e80941Smrg * texturing instruction. 
2870b8e80941Smrg */ 2871b8e80941Smrg for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) { 2872b8e80941Smrg if (i == FB_WRITE_LOGICAL_SRC_COLOR0) { 2873b8e80941Smrg if (!fb_write->src[i].equals(tex_inst->dst) || 2874b8e80941Smrg fb_write->size_read(i) != tex_inst->size_written) 2875b8e80941Smrg return false; 2876b8e80941Smrg } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) { 2877b8e80941Smrg if (fb_write->src[i].file != BAD_FILE) 2878b8e80941Smrg return false; 2879b8e80941Smrg } 2880b8e80941Smrg } 2881b8e80941Smrg 2882b8e80941Smrg assert(!tex_inst->eot); /* We can't get here twice */ 2883b8e80941Smrg assert((tex_inst->offset & (0xff << 24)) == 0); 2884b8e80941Smrg 2885b8e80941Smrg const fs_builder ibld(this, block, tex_inst); 2886b8e80941Smrg 2887b8e80941Smrg tex_inst->offset |= fb_write->target << 24; 2888b8e80941Smrg tex_inst->eot = true; 2889b8e80941Smrg tex_inst->dst = ibld.null_reg_ud(); 2890b8e80941Smrg tex_inst->size_written = 0; 2891b8e80941Smrg fb_write->remove(cfg->blocks[cfg->num_blocks - 1]); 2892b8e80941Smrg 2893b8e80941Smrg /* Marking EOT is sufficient, lower_logical_sends() will notice the EOT 2894b8e80941Smrg * flag and submit a header together with the sampler message as required 2895b8e80941Smrg * by the hardware. 
2896b8e80941Smrg */ 2897b8e80941Smrg invalidate_live_intervals(); 2898b8e80941Smrg return true; 2899b8e80941Smrg} 2900b8e80941Smrg 2901b8e80941Smrgbool 2902b8e80941Smrgfs_visitor::opt_register_renaming() 2903b8e80941Smrg{ 2904b8e80941Smrg bool progress = false; 2905b8e80941Smrg int depth = 0; 2906b8e80941Smrg 2907b8e80941Smrg unsigned remap[alloc.count]; 2908b8e80941Smrg memset(remap, ~0u, sizeof(unsigned) * alloc.count); 2909b8e80941Smrg 2910b8e80941Smrg foreach_block_and_inst(block, fs_inst, inst, cfg) { 2911b8e80941Smrg if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) { 2912b8e80941Smrg depth++; 2913b8e80941Smrg } else if (inst->opcode == BRW_OPCODE_ENDIF || 2914b8e80941Smrg inst->opcode == BRW_OPCODE_WHILE) { 2915b8e80941Smrg depth--; 2916b8e80941Smrg } 2917b8e80941Smrg 2918b8e80941Smrg /* Rewrite instruction sources. */ 2919b8e80941Smrg for (int i = 0; i < inst->sources; i++) { 2920b8e80941Smrg if (inst->src[i].file == VGRF && 2921b8e80941Smrg remap[inst->src[i].nr] != ~0u && 2922b8e80941Smrg remap[inst->src[i].nr] != inst->src[i].nr) { 2923b8e80941Smrg inst->src[i].nr = remap[inst->src[i].nr]; 2924b8e80941Smrg progress = true; 2925b8e80941Smrg } 2926b8e80941Smrg } 2927b8e80941Smrg 2928b8e80941Smrg const unsigned dst = inst->dst.nr; 2929b8e80941Smrg 2930b8e80941Smrg if (depth == 0 && 2931b8e80941Smrg inst->dst.file == VGRF && 2932b8e80941Smrg alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written && 2933b8e80941Smrg !inst->is_partial_write()) { 2934b8e80941Smrg if (remap[dst] == ~0u) { 2935b8e80941Smrg remap[dst] = dst; 2936b8e80941Smrg } else { 2937b8e80941Smrg remap[dst] = alloc.allocate(regs_written(inst)); 2938b8e80941Smrg inst->dst.nr = remap[dst]; 2939b8e80941Smrg progress = true; 2940b8e80941Smrg } 2941b8e80941Smrg } else if (inst->dst.file == VGRF && 2942b8e80941Smrg remap[dst] != ~0u && 2943b8e80941Smrg remap[dst] != dst) { 2944b8e80941Smrg inst->dst.nr = remap[dst]; 2945b8e80941Smrg progress = true; 2946b8e80941Smrg } 
2947b8e80941Smrg } 2948b8e80941Smrg 2949b8e80941Smrg if (progress) { 2950b8e80941Smrg invalidate_live_intervals(); 2951b8e80941Smrg 2952b8e80941Smrg for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { 2953b8e80941Smrg if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != ~0u) { 2954b8e80941Smrg delta_xy[i].nr = remap[delta_xy[i].nr]; 2955b8e80941Smrg } 2956b8e80941Smrg } 2957b8e80941Smrg } 2958b8e80941Smrg 2959b8e80941Smrg return progress; 2960b8e80941Smrg} 2961b8e80941Smrg 2962b8e80941Smrg/** 2963b8e80941Smrg * Remove redundant or useless discard jumps. 2964b8e80941Smrg * 2965b8e80941Smrg * For example, we can eliminate jumps in the following sequence: 2966b8e80941Smrg * 2967b8e80941Smrg * discard-jump (redundant with the next jump) 2968b8e80941Smrg * discard-jump (useless; jumps to the next instruction) 2969b8e80941Smrg * placeholder-halt 2970b8e80941Smrg */ 2971b8e80941Smrgbool 2972b8e80941Smrgfs_visitor::opt_redundant_discard_jumps() 2973b8e80941Smrg{ 2974b8e80941Smrg bool progress = false; 2975b8e80941Smrg 2976b8e80941Smrg bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1]; 2977b8e80941Smrg 2978b8e80941Smrg fs_inst *placeholder_halt = NULL; 2979b8e80941Smrg foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) { 2980b8e80941Smrg if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) { 2981b8e80941Smrg placeholder_halt = inst; 2982b8e80941Smrg break; 2983b8e80941Smrg } 2984b8e80941Smrg } 2985b8e80941Smrg 2986b8e80941Smrg if (!placeholder_halt) 2987b8e80941Smrg return false; 2988b8e80941Smrg 2989b8e80941Smrg /* Delete any HALTs immediately before the placeholder halt. 
 */
   for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
        !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
        prev = (fs_inst *) placeholder_halt->prev) {
      prev->remove(last_bblock);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Compute a bitmask with GRF granularity with a bit set for each GRF starting
 * from \p r.offset which overlaps the region starting at \p s.offset and
 * spanning \p ds bytes.
 */
static inline unsigned
mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
{
   const int rel_offset = reg_offset(s) - reg_offset(r);
   /* Whole GRFs between the two region starts. */
   const int shift = rel_offset / REG_SIZE;
   /* Number of GRFs covered, rounding up for a partial trailing register. */
   const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
   assert(reg_space(r) == reg_space(s) &&
          shift >= 0 && shift < int(8 * sizeof(unsigned)));
   return ((1 << n) - 1) << shift;
}

/**
 * Try to replace a predicated SEL whose flag was written by an immediately
 * preceding float CMP-with-zero (or flag-writing MOV) with a single CSEL
 * instruction.  Only run on Gen8+, where CSEL exists.
 *
 * Returns true if any instruction was replaced (live intervals are NOT
 * invalidated here; the SEL is removed in place and the CSEL is emitted
 * through the builder at the same position).
 */
bool
fs_visitor::opt_peephole_csel()
{
   if (devinfo->gen < 8)
      return false;

   bool progress = false;

   foreach_block_reverse(block, cfg) {
      int ip = block->end_ip + 1;

      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
         ip--;

         /* Only normal-predicated SELs of F/D/UD type are candidates. */
         if (inst->opcode != BRW_OPCODE_SEL ||
             inst->predicate != BRW_PREDICATE_NORMAL ||
             (inst->dst.type != BRW_REGISTER_TYPE_F &&
              inst->dst.type != BRW_REGISTER_TYPE_D &&
              inst->dst.type != BRW_REGISTER_TYPE_UD))
            continue;

         /* Because it is a 3-src instruction, CSEL cannot have an immediate
          * value as a source, but we can sometimes handle zero.
          */
         if ((inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
              inst->src[0].file != UNIFORM) ||
             (inst->src[1].file != VGRF && inst->src[1].file != ATTR &&
              inst->src[1].file != UNIFORM && !inst->src[1].is_zero()))
            continue;

         /* Scan backwards for the instruction that wrote the flag. */
         foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
            if (!scan_inst->flags_written())
               continue;

            /* The flag writer must be an unpredicated CMP or MOV whose
             * source is a float register we can feed to CSEL directly.
             */
            if ((scan_inst->opcode != BRW_OPCODE_CMP &&
                 scan_inst->opcode != BRW_OPCODE_MOV) ||
                scan_inst->predicate != BRW_PREDICATE_NONE ||
                (scan_inst->src[0].file != VGRF &&
                 scan_inst->src[0].file != ATTR &&
                 scan_inst->src[0].file != UNIFORM) ||
                scan_inst->src[0].type != BRW_REGISTER_TYPE_F)
               break;

            /* CSEL compares its third source against zero, so the CMP must
             * have compared against zero too.
             */
            if (scan_inst->opcode == BRW_OPCODE_CMP && !scan_inst->src[1].is_zero())
               break;

            const brw::fs_builder ibld(this, block, inst);

            /* An inverted SEL predicate folds into the negated condition. */
            const enum brw_conditional_mod cond =
               inst->predicate_inverse
               ? brw_negate_cmod(scan_inst->conditional_mod)
               : scan_inst->conditional_mod;

            fs_inst *csel_inst = NULL;

            if (inst->src[1].file != IMM) {
               csel_inst = ibld.CSEL(inst->dst,
                                     inst->src[0],
                                     inst->src[1],
                                     scan_inst->src[0],
                                     cond);
            } else if (cond == BRW_CONDITIONAL_NZ) {
               /* Consider the sequence
                *
                * cmp.nz.f0  null<1>F   g3<8,8,1>F   0F
                * (+f0) sel  g124<1>UD  g2<8,8,1>UD  0x00000000UD
                *
                * The sel will pick the immediate value 0 if g3 is ±0.0.
                * Therefore, this sequence is equivalent:
                *
                * cmp.nz.f0  null<1>F   g3<8,8,1>F   0F
                * (+f0) sel  g124<1>F   g2<8,8,1>F   (abs)g3<8,8,1>F
                *
                * The abs ensures that the result is 0UD when g3 is -0.0F.
                * By normal cmp-sel merging, this is also equivalent:
                *
                * csel.nz    g124<1>F   g2<4,4,1>F   (abs)g3<4,4,1>F  g3<4,4,1>F
                */
               csel_inst = ibld.CSEL(inst->dst,
                                     inst->src[0],
                                     scan_inst->src[0],
                                     scan_inst->src[0],
                                     cond);

               csel_inst->src[1].abs = true;
            }

            if (csel_inst != NULL) {
               progress = true;
               csel_inst->saturate = inst->saturate;
               inst->remove(block);
            }

            break;
         }
      }
   }

   return progress;
}

/**
 * Turn MOVs from a VGRF into an MRF into direct computation into the MRF by
 * rewriting the generating instructions' destinations, when every producer
 * of the source region can be safely redirected.  Pre-Gen7 only (no MRFs on
 * Gen7+).
 *
 * Returns true and invalidates live intervals if any MOV was eliminated.
 */
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   /* No MRFs on Gen >= 7. */
   if (devinfo->gen >= 7)
      return false;

   calculate_live_intervals();

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      /* Only a plain, full, unmodified, contiguous GRF->MRF MOV qualifies. */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != VGRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate ||
          !inst->src[0].is_contiguous() ||
          inst->src[0].offset % REG_SIZE != 0)
         continue;

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].nr] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go rewrite the
       * things that computed the value of all GRFs of the source region.  The
       * regs_left bitset keeps track of the registers we haven't yet found a
       * generating instruction for.
       */
      unsigned regs_left = (1 << regs_read(inst, 0)) - 1;

      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Handling things not fully contained in the source of the copy
             * would need us to understand coalescing out more than one MOV at
             * a time.
             */
            if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
                                     inst->src[0], inst->size_read(0)))
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (devinfo->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            /* Clear the bits for any registers this instruction overwrites. */
            regs_left &= ~mask_relative_to(
               inst->src[0], scan_inst->dst, scan_inst->size_written);
            if (!regs_left)
               break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (block->start() == scan_inst)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < scan_inst->sources; i++) {
            if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
                                inst->src[0], inst->size_read(0))) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->dst, inst->size_written)) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            break;
         }

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
             regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
                             inst->dst, inst->size_written)) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            break;
         }
      }

      if (regs_left)
         continue;

      /* Found all generating instructions of our MRF's source value, so it
       * should be safe to rewrite them to point to the MRF directly.
       */
      regs_left = (1 << regs_read(inst, 0)) - 1;

      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
            /* Clear the bits for any registers this instruction overwrites. */
            regs_left &= ~mask_relative_to(
               inst->src[0], scan_inst->dst, scan_inst->size_written);

            const unsigned rel_offset = reg_offset(scan_inst->dst) -
                                        reg_offset(inst->src[0]);

            if (inst->dst.nr & BRW_MRF_COMPR4) {
               /* Apply the same address transformation done by the hardware
                * for COMPR4 MRF writes.
                */
               assert(rel_offset < 2 * REG_SIZE);
               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;

               /* Clear the COMPR4 bit if the generating instruction is not
                * compressed.
                */
               if (scan_inst->size_written < 2 * REG_SIZE)
                  scan_inst->dst.nr &= ~BRW_MRF_COMPR4;

            } else {
               /* Calculate the MRF number the result of this instruction is
                * ultimately written to.
                */
               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
            }

            scan_inst->dst.file = MRF;
            scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
            scan_inst->saturate |= inst->saturate;
            if (!regs_left)
               break;
         }
      }

      assert(!regs_left);
      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
bool
fs_visitor::eliminate_find_live_channel()
{
   bool progress = false;
   /* Current control-flow nesting depth; FIND_LIVE_CHANNEL is only trivial
    * (channel 0) at depth zero.
    */
   unsigned depth = 0;

   if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
      /* The optimization below assumes that channel zero is live on thread
       * dispatch, which may not be the case if the fixed function dispatches
       * threads sparsely.
       */
      return false;
   }

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case FS_OPCODE_DISCARD_JUMP:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         return progress;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            /* Outside control flow channel 0 is always live, so the result
             * is the constant 0; rewrite in place as an immediate MOV.
             */
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = brw_imm_ud(0u);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   return progress;
}

/**
 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
 * instructions to FS_OPCODE_REP_FB_WRITE.
 */
void
fs_visitor::emit_repclear_shader()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   int base_mrf = 0;
   /* Color payload goes two MRFs above the (optional) header. */
   int color_mrf = base_mrf + 2;
   fs_inst *mov;

   if (uniforms > 0) {
      /* Clear color comes from a uniform; forced to a vec4 region below,
       * after constant locations have been assigned.
       */
      mov = bld.exec_all().group(4, 0)
               .MOV(brw_message_reg(color_mrf),
                    fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
   } else {
      struct brw_reg reg =
         brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
                 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);

      mov = bld.exec_all().group(4, 0)
               .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
   }

   fs_inst *write = NULL;
   if (key->nr_color_regions == 1) {
      /* Single render target: headerless replicated-data write. */
      write = bld.emit(FS_OPCODE_REP_FB_WRITE);
      write->saturate = key->clamp_fragment_color;
      write->base_mrf = color_mrf;
      write->target = 0;
      write->header_size = 0;
      write->mlen = 1;
   } else {
      assume(key->nr_color_regions > 0);

      /* Multiple render targets: build a header from g0 and patch the
       * render-target index (header DWord 2) per iteration.
       */
      struct brw_reg header =
         retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD);
      bld.exec_all().group(16, 0)
         .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      for (int i = 0; i < key->nr_color_regions; ++i) {
         if (i > 0) {
            bld.exec_all().group(1, 0)
               .MOV(component(header, 2), brw_imm_ud(i));
         }

         write = bld.emit(FS_OPCODE_REP_FB_WRITE);
         write->saturate = key->clamp_fragment_color;
         write->base_mrf = base_mrf;
         write->target = i;
         write->header_size = 2;
         write->mlen = 3;
      }
   }
   /* The last write terminates the thread and signals last render target. */
   write->eot = true;
   write->last_rt = true;

   calculate_cfg();

   assign_constant_locations();
   assign_curb_setup();

   /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
   if (uniforms > 0) {
      assert(mov->src[0].file == FIXED_GRF);
      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
   }
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* last_mrf_move[n] is the most recent full MOV into MRF n, or NULL when
    * unknown/invalidated.
    */
   fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width >= 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      /* Control flow invalidates everything we know about MRF contents. */
      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
         /* Remove the MOV if it exactly duplicates the previous write. */
         if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV &&
             inst->dst.equals(prev_inst->dst) &&
             inst->src[0].equals(prev_inst->src[0]) &&
             inst->saturate == prev_inst->saturate &&
             inst->predicate == prev_inst->predicate &&
             inst->conditional_mod == prev_inst->conditional_mod &&
             inst->exec_size == prev_inst->exec_size) {
            inst->remove(block);
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.nr] = NULL;
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
         if (last_mrf_move[i] &&
             regions_overlap(inst->dst, inst->size_written,
                             last_mrf_move[i]->src[0],
                             last_mrf_move[i]->size_read(0))) {
            last_mrf_move[i] = NULL;
         }
      }

      /* Record this MOV as the last known writer of its MRF, if trackable. */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file != ARF &&
          !inst->is_partial_write()) {
         last_mrf_move[inst->dst.nr] = inst;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Rounding modes for conversion instructions are included for each
 * conversion, but right now it is a state.  So once it is set,
 * we don't need to call it again for subsequent calls.
 *
 * This is useful for vector/matrices conversions, as setting the
 * mode once is enough for the full vector/matrix.
 */
bool
fs_visitor::remove_extra_rounding_modes()
{
   bool progress = false;

   foreach_block (block, cfg) {
      /* Tracked per block; the mode is unknown again at each block entry. */
      brw_rnd_mode prev_mode = BRW_RND_MODE_UNSPECIFIED;

      foreach_inst_in_block_safe (fs_inst, inst, block) {
         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
            assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
            if (mode == prev_mode) {
               /* Redundant: the rounding mode is already set to this value. */
               inst->remove(block);
               progress = true;
            } else {
               prev_mode = mode;
            }
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Clear the dependency flags in \p deps for every GRF in the tracked window
 * [first_grf, first_grf + grf_len) that \p inst reads.  Helper for the Gen4
 * SEND dependency workarounds below.
 */
static void
clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
{
   /* Clear the flag for registers that actually got read (as expected). */
   for (int i = 0; i < inst->sources; i++) {
      int grf;
      if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
         grf = inst->src[i].nr;
      } else {
         continue;
      }

      if (grf >= first_grf &&
          grf < first_grf + grf_len) {
         deps[grf - first_grf] = false;
         /* A SIMD16 read covers two registers. */
         if (inst->exec_size == 16)
            deps[grf - first_grf + 1] = false;
      }
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *      check for post destination dependencies on this instruction, software
 *      must ensure that there is no destination hazard for the case of ‘write
 *      followed by a posted write’ shown in the following example.
 *
 *      1. mov r3 0
 *      2. send r3.xy <rest of send instruction>
 *      3. mov r2 r3
 *
 *      Due to no post-destination dependency check on the ‘send’, the above
 *      code sequence could have two instructions (1 and 2) in flight at the
 *      same time that both consider ‘r3’ as the target of their final writes.
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                        fs_inst *inst)
{
   int write_len = regs_written(inst);
   int first_write_grf = inst->dst.nr;
   /* needs_dep[i] is true while GRF first_write_grf + i may still have an
    * unresolved write-after-write hazard against the SEND.
    */
   bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (block->start() == scan_inst && block->num != 0) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
                               first_write_grf + i);
         }
         return;
      }

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == VGRF) {
         for (unsigned i = 0; i < regs_written(scan_inst); i++) {
            int reg = scan_inst->dst.nr + i;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
               needs_dep[reg - first_write_grf] = false;
               if (scan_inst->exec_size == 16)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
 *      used as a destination register until after it has been sourced by an
 *      instruction with a different destination register.
 */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
{
   int write_len = regs_written(inst);
   unsigned first_write_grf = inst->dst.nr;
   /* needs_dep[i] is true while GRF first_write_grf + i has not yet been
    * sourced after the SEND wrote it.
    */
   bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
                               first_write_grf + i);
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == VGRF &&
          scan_inst->dst.nr >= first_write_grf &&
          scan_inst->dst.nr < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.nr - first_write_grf]) {
         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
                         scan_inst->dst.nr);
         needs_dep[scan_inst->dst.nr - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

/**
 * Apply both Gen4 SEND dependency workarounds (above) to every instruction
 * with a message length that writes a VGRF.  Original gen4 only; G4x and
 * later don't need this.
 */
void
fs_visitor::insert_gen4_send_dependency_workarounds()
{
   if (devinfo->gen != 4 || devinfo->is_g4x)
      return;

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->mlen != 0 && inst->dst.file == VGRF) {
         insert_gen4_pre_send_dependency_workarounds(block, inst);
         insert_gen4_post_send_dependency_workarounds(block, inst);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();
}

/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (devinfo->gen >= 7) {
         /* Build a message header from g0 with the constant-buffer offset
          * (in owords, hence / 16) patched into DWord 2.
          */
         const fs_builder ubld = fs_builder(this, block, inst).exec_all();
         const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);

         ubld.group(8, 0).MOV(payload,
                              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         ubld.group(1, 0).MOV(component(payload, 2),
                              brw_imm_ud(inst->src[1].ud / 16));

         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;
         inst->header_size = 1;
         inst->mlen = 1;

         invalidate_live_intervals();
      } else {
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
         inst->mlen = 1;
      }
   }
}

/**
 * Expand SHADER_OPCODE_LOAD_PAYLOAD pseudo-instructions into the series of
 * MOVs that actually assemble the payload, handling the header (always
 * SIMD8, force_writemask_all) and COMPR4 interleaved MRF writes.
 */
bool
fs_visitor::lower_load_payload()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == MRF || inst->dst.file == VGRF);
      assert(inst->saturate == false);
      fs_reg dst = inst->dst;

      /* Get rid of COMPR4.  We'll add it back in if we need it */
      if (dst.file == MRF)
         dst.nr = dst.nr & ~BRW_MRF_COMPR4;

      const fs_builder ibld(this, block, inst);
      /* Header copies are unmasked SIMD8 regardless of the payload width. */
      const fs_builder hbld = ibld.exec_all().group(8, 0);

      for (uint8_t i = 0; i < inst->header_size; i++) {
         if (inst->src[i].file != BAD_FILE) {
            fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
            fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
            hbld.MOV(mov_dst, mov_src);
         }
         dst = offset(dst, hbld, 1);
      }

      if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
          inst->exec_size > 8) {
         /* In this case, the payload portion of the LOAD_PAYLOAD isn't
          * a straightforward copy.  Instead, the result of the
          * LOAD_PAYLOAD is treated as interleaved and the first four
          * non-header sources are unpacked as:
          *
          * m + 0: r0
          * m + 1: g0
          * m + 2: b0
          * m + 3: a0
          * m + 4: r1
          * m + 5: g1
          * m + 6: b1
          * m + 7: a1
          *
          * This is used for gen <= 5 fb writes.
          */
         assert(inst->exec_size == 16);
         assert(inst->header_size + 4 <= inst->sources);
         for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
            if (inst->src[i].file != BAD_FILE) {
               if (devinfo->has_compr4) {
                  fs_reg compr4_dst = retype(dst, inst->src[i].type);
                  compr4_dst.nr |= BRW_MRF_COMPR4;
                  ibld.MOV(compr4_dst, inst->src[i]);
               } else {
                  /* Platform doesn't have COMPR4.  We have to fake it */
                  fs_reg mov_dst = retype(dst, inst->src[i].type);
                  ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
                  mov_dst.nr += 4;
                  ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
               }
            }

            dst.nr++;
         }

         /* The loop above only ever incremented us through the first set
          * of 4 registers.  However, thanks to the magic of COMPR4, we
          * actually wrote to the first 8 registers, so we need to take
          * that into account now.
          */
         dst.nr += 4;

         /* The COMPR4 code took care of the first 4 sources.  We'll let
          * the regular path handle any remaining sources.  Yes, we are
          * modifying the instruction but we're about to delete it so
          * this really doesn't hurt anything.
          */
         inst->header_size += 4;
      }

      /* Straightforward copy of the remaining (non-header) sources. */
      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         if (inst->src[i].file != BAD_FILE) {
            dst.type = inst->src[i].type;
            ibld.MOV(dst, inst->src[i]);
         } else {
            dst.type = BRW_REGISTER_TYPE_UD;
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

bool
fs_visitor::lower_linterp()
{
   bool progress = false;

   if (devinfo->gen < 11)
      return false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const fs_builder ibld(this, block, inst);

      if (inst->opcode != FS_OPCODE_LINTERP)
         continue;

      fs_reg dwP = component(inst->src[1], 0);
      fs_reg dwQ = component(inst->src[1], 1);
      fs_reg dwR = component(inst->src[1], 3);
      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 8); i++) {
         const fs_builder hbld(ibld.half(i));
         fs_reg dst = half(inst->dst, i);
         fs_reg delta_xy = offset(inst->src[0], ibld, i);
         hbld.MAD(dst, dwR, half(delta_xy, 0), dwP);
         fs_inst *mad = hbld.MAD(dst, dst, half(delta_xy, 1), dwQ);

         /*
Propagate conditional mod and saturate from the original 3891b8e80941Smrg * instruction to the second MAD instruction. 3892b8e80941Smrg */ 3893b8e80941Smrg set_saturate(inst->saturate, mad); 3894b8e80941Smrg set_condmod(inst->conditional_mod, mad); 3895b8e80941Smrg } 3896b8e80941Smrg 3897b8e80941Smrg inst->remove(block); 3898b8e80941Smrg progress = true; 3899b8e80941Smrg } 3900b8e80941Smrg 3901b8e80941Smrg if (progress) 3902b8e80941Smrg invalidate_live_intervals(); 3903b8e80941Smrg 3904b8e80941Smrg return progress; 3905b8e80941Smrg} 3906b8e80941Smrg 3907b8e80941Smrgbool 3908b8e80941Smrgfs_visitor::lower_integer_multiplication() 3909b8e80941Smrg{ 3910b8e80941Smrg bool progress = false; 3911b8e80941Smrg 3912b8e80941Smrg foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { 3913b8e80941Smrg const fs_builder ibld(this, block, inst); 3914b8e80941Smrg 3915b8e80941Smrg if (inst->opcode == BRW_OPCODE_MUL) { 3916b8e80941Smrg if (inst->dst.is_accumulator() || 3917b8e80941Smrg (inst->dst.type != BRW_REGISTER_TYPE_D && 3918b8e80941Smrg inst->dst.type != BRW_REGISTER_TYPE_UD)) 3919b8e80941Smrg continue; 3920b8e80941Smrg 3921b8e80941Smrg if (devinfo->has_integer_dword_mul) 3922b8e80941Smrg continue; 3923b8e80941Smrg 3924b8e80941Smrg if (inst->src[1].file == IMM && 3925b8e80941Smrg inst->src[1].ud < (1 << 16)) { 3926b8e80941Smrg /* The MUL instruction isn't commutative. On Gen <= 6, only the low 3927b8e80941Smrg * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of 3928b8e80941Smrg * src1 are used. 3929b8e80941Smrg * 3930b8e80941Smrg * If multiplying by an immediate value that fits in 16-bits, do a 3931b8e80941Smrg * single MUL instruction with that value in the proper location. 
3932b8e80941Smrg */ 3933b8e80941Smrg if (devinfo->gen < 7) { 3934b8e80941Smrg fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), 3935b8e80941Smrg inst->dst.type); 3936b8e80941Smrg ibld.MOV(imm, inst->src[1]); 3937b8e80941Smrg ibld.MUL(inst->dst, imm, inst->src[0]); 3938b8e80941Smrg } else { 3939b8e80941Smrg const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD); 3940b8e80941Smrg ibld.MUL(inst->dst, inst->src[0], 3941b8e80941Smrg ud ? brw_imm_uw(inst->src[1].ud) 3942b8e80941Smrg : brw_imm_w(inst->src[1].d)); 3943b8e80941Smrg } 3944b8e80941Smrg } else { 3945b8e80941Smrg /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot 3946b8e80941Smrg * do 32-bit integer multiplication in one instruction, but instead 3947b8e80941Smrg * must do a sequence (which actually calculates a 64-bit result): 3948b8e80941Smrg * 3949b8e80941Smrg * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D 3950b8e80941Smrg * mach(8) null g3<8,8,1>D g4<8,8,1>D 3951b8e80941Smrg * mov(8) g2<1>D acc0<8,8,1>D 3952b8e80941Smrg * 3953b8e80941Smrg * But on Gen > 6, the ability to use second accumulator register 3954b8e80941Smrg * (acc1) for non-float data types was removed, preventing a simple 3955b8e80941Smrg * implementation in SIMD16. A 16-channel result can be calculated by 3956b8e80941Smrg * executing the three instructions twice in SIMD8, once with quarter 3957b8e80941Smrg * control of 1Q for the first eight channels and again with 2Q for 3958b8e80941Smrg * the second eight channels. 3959b8e80941Smrg * 3960b8e80941Smrg * Which accumulator register is implicitly accessed (by AccWrEnable 3961b8e80941Smrg * for instance) is determined by the quarter control. Unfortunately 3962b8e80941Smrg * Ivybridge (and presumably Baytrail) has a hardware bug in which an 3963b8e80941Smrg * implicit accumulator access by an instruction with 2Q will access 3964b8e80941Smrg * acc1 regardless of whether the data type is usable in acc1. 
3965b8e80941Smrg * 3966b8e80941Smrg * Specifically, the 2Q mach(8) writes acc1 which does not exist for 3967b8e80941Smrg * integer data types. 3968b8e80941Smrg * 3969b8e80941Smrg * Since we only want the low 32-bits of the result, we can do two 3970b8e80941Smrg * 32-bit x 16-bit multiplies (like the mul and mach are doing), and 3971b8e80941Smrg * adjust the high result and add them (like the mach is doing): 3972b8e80941Smrg * 3973b8e80941Smrg * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW 3974b8e80941Smrg * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW 3975b8e80941Smrg * shl(8) g9<1>D g8<8,8,1>D 16D 3976b8e80941Smrg * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D 3977b8e80941Smrg * 3978b8e80941Smrg * We avoid the shl instruction by realizing that we only want to add 3979b8e80941Smrg * the low 16-bits of the "high" result to the high 16-bits of the 3980b8e80941Smrg * "low" result and using proper regioning on the add: 3981b8e80941Smrg * 3982b8e80941Smrg * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW 3983b8e80941Smrg * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW 3984b8e80941Smrg * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW 3985b8e80941Smrg * 3986b8e80941Smrg * Since it does not use the (single) accumulator register, we can 3987b8e80941Smrg * schedule multi-component multiplications much better. 3988b8e80941Smrg */ 3989b8e80941Smrg 3990b8e80941Smrg bool needs_mov = false; 3991b8e80941Smrg fs_reg orig_dst = inst->dst; 3992b8e80941Smrg 3993b8e80941Smrg /* Get a new VGRF for the "low" 32x16-bit multiplication result if 3994b8e80941Smrg * reusing the original destination is impossible due to hardware 3995b8e80941Smrg * restrictions, source/destination overlap, or it being the null 3996b8e80941Smrg * register. 
3997b8e80941Smrg */ 3998b8e80941Smrg fs_reg low = inst->dst; 3999b8e80941Smrg if (orig_dst.is_null() || orig_dst.file == MRF || 4000b8e80941Smrg regions_overlap(inst->dst, inst->size_written, 4001b8e80941Smrg inst->src[0], inst->size_read(0)) || 4002b8e80941Smrg regions_overlap(inst->dst, inst->size_written, 4003b8e80941Smrg inst->src[1], inst->size_read(1)) || 4004b8e80941Smrg inst->dst.stride >= 4) { 4005b8e80941Smrg needs_mov = true; 4006b8e80941Smrg low = fs_reg(VGRF, alloc.allocate(regs_written(inst)), 4007b8e80941Smrg inst->dst.type); 4008b8e80941Smrg } 4009b8e80941Smrg 4010b8e80941Smrg /* Get a new VGRF but keep the same stride as inst->dst */ 4011b8e80941Smrg fs_reg high(VGRF, alloc.allocate(regs_written(inst)), 4012b8e80941Smrg inst->dst.type); 4013b8e80941Smrg high.stride = inst->dst.stride; 4014b8e80941Smrg high.offset = inst->dst.offset % REG_SIZE; 4015b8e80941Smrg 4016b8e80941Smrg if (devinfo->gen >= 7) { 4017b8e80941Smrg if (inst->src[1].abs) 4018b8e80941Smrg lower_src_modifiers(this, block, inst, 1); 4019b8e80941Smrg 4020b8e80941Smrg if (inst->src[1].file == IMM) { 4021b8e80941Smrg ibld.MUL(low, inst->src[0], 4022b8e80941Smrg brw_imm_uw(inst->src[1].ud & 0xffff)); 4023b8e80941Smrg ibld.MUL(high, inst->src[0], 4024b8e80941Smrg brw_imm_uw(inst->src[1].ud >> 16)); 4025b8e80941Smrg } else { 4026b8e80941Smrg ibld.MUL(low, inst->src[0], 4027b8e80941Smrg subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0)); 4028b8e80941Smrg ibld.MUL(high, inst->src[0], 4029b8e80941Smrg subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1)); 4030b8e80941Smrg } 4031b8e80941Smrg } else { 4032b8e80941Smrg if (inst->src[0].abs) 4033b8e80941Smrg lower_src_modifiers(this, block, inst, 0); 4034b8e80941Smrg 4035b8e80941Smrg ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0), 4036b8e80941Smrg inst->src[1]); 4037b8e80941Smrg ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1), 4038b8e80941Smrg inst->src[1]); 4039b8e80941Smrg } 4040b8e80941Smrg 4041b8e80941Smrg 
ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1), 4042b8e80941Smrg subscript(low, BRW_REGISTER_TYPE_UW, 1), 4043b8e80941Smrg subscript(high, BRW_REGISTER_TYPE_UW, 0)); 4044b8e80941Smrg 4045b8e80941Smrg if (needs_mov || inst->conditional_mod) { 4046b8e80941Smrg set_condmod(inst->conditional_mod, 4047b8e80941Smrg ibld.MOV(orig_dst, low)); 4048b8e80941Smrg } 4049b8e80941Smrg } 4050b8e80941Smrg 4051b8e80941Smrg } else if (inst->opcode == SHADER_OPCODE_MULH) { 4052b8e80941Smrg /* According to the BDW+ BSpec page for the "Multiply Accumulate 4053b8e80941Smrg * High" instruction: 4054b8e80941Smrg * 4055b8e80941Smrg * "An added preliminary mov is required for source modification on 4056b8e80941Smrg * src1: 4057b8e80941Smrg * mov (8) r3.0<1>:d -r3<8;8,1>:d 4058b8e80941Smrg * mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw 4059b8e80941Smrg * mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d" 4060b8e80941Smrg */ 4061b8e80941Smrg if (devinfo->gen >= 8 && (inst->src[1].negate || inst->src[1].abs)) 4062b8e80941Smrg lower_src_modifiers(this, block, inst, 1); 4063b8e80941Smrg 4064b8e80941Smrg /* Should have been lowered to 8-wide. */ 4065b8e80941Smrg assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst)); 4066b8e80941Smrg const fs_reg acc = retype(brw_acc_reg(inst->exec_size), 4067b8e80941Smrg inst->dst.type); 4068b8e80941Smrg fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]); 4069b8e80941Smrg fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]); 4070b8e80941Smrg 4071b8e80941Smrg if (devinfo->gen >= 8) { 4072b8e80941Smrg /* Until Gen8, integer multiplies read 32-bits from one source, 4073b8e80941Smrg * and 16-bits from the other, and relying on the MACH instruction 4074b8e80941Smrg * to generate the high bits of the result. 
4075b8e80941Smrg * 4076b8e80941Smrg * On Gen8, the multiply instruction does a full 32x32-bit 4077b8e80941Smrg * multiply, but in order to do a 64-bit multiply we can simulate 4078b8e80941Smrg * the previous behavior and then use a MACH instruction. 4079b8e80941Smrg */ 4080b8e80941Smrg assert(mul->src[1].type == BRW_REGISTER_TYPE_D || 4081b8e80941Smrg mul->src[1].type == BRW_REGISTER_TYPE_UD); 4082b8e80941Smrg mul->src[1].type = BRW_REGISTER_TYPE_UW; 4083b8e80941Smrg mul->src[1].stride *= 2; 4084b8e80941Smrg 4085b8e80941Smrg } else if (devinfo->gen == 7 && !devinfo->is_haswell && 4086b8e80941Smrg inst->group > 0) { 4087b8e80941Smrg /* Among other things the quarter control bits influence which 4088b8e80941Smrg * accumulator register is used by the hardware for instructions 4089b8e80941Smrg * that access the accumulator implicitly (e.g. MACH). A 4090b8e80941Smrg * second-half instruction would normally map to acc1, which 4091b8e80941Smrg * doesn't exist on Gen7 and up (the hardware does emulate it for 4092b8e80941Smrg * floating-point instructions *only* by taking advantage of the 4093b8e80941Smrg * extra precision of acc0 not normally used for floating point 4094b8e80941Smrg * arithmetic). 4095b8e80941Smrg * 4096b8e80941Smrg * HSW and up are careful enough not to try to access an 4097b8e80941Smrg * accumulator register that doesn't exist, but on earlier Gen7 4098b8e80941Smrg * hardware we need to make sure that the quarter control bits are 4099b8e80941Smrg * zero to avoid non-deterministic behaviour and emit an extra MOV 4100b8e80941Smrg * to get the result masked correctly according to the current 4101b8e80941Smrg * channel enables. 
4102b8e80941Smrg */ 4103b8e80941Smrg mach->group = 0; 4104b8e80941Smrg mach->force_writemask_all = true; 4105b8e80941Smrg mach->dst = ibld.vgrf(inst->dst.type); 4106b8e80941Smrg ibld.MOV(inst->dst, mach->dst); 4107b8e80941Smrg } 4108b8e80941Smrg } else { 4109b8e80941Smrg continue; 4110b8e80941Smrg } 4111b8e80941Smrg 4112b8e80941Smrg inst->remove(block); 4113b8e80941Smrg progress = true; 4114b8e80941Smrg } 4115b8e80941Smrg 4116b8e80941Smrg if (progress) 4117b8e80941Smrg invalidate_live_intervals(); 4118b8e80941Smrg 4119b8e80941Smrg return progress; 4120b8e80941Smrg} 4121b8e80941Smrg 4122b8e80941Smrgbool 4123b8e80941Smrgfs_visitor::lower_minmax() 4124b8e80941Smrg{ 4125b8e80941Smrg assert(devinfo->gen < 6); 4126b8e80941Smrg 4127b8e80941Smrg bool progress = false; 4128b8e80941Smrg 4129b8e80941Smrg foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { 4130b8e80941Smrg const fs_builder ibld(this, block, inst); 4131b8e80941Smrg 4132b8e80941Smrg if (inst->opcode == BRW_OPCODE_SEL && 4133b8e80941Smrg inst->predicate == BRW_PREDICATE_NONE) { 4134b8e80941Smrg /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of 4135b8e80941Smrg * the original SEL.L/GE instruction 4136b8e80941Smrg */ 4137b8e80941Smrg ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], 4138b8e80941Smrg inst->conditional_mod); 4139b8e80941Smrg inst->predicate = BRW_PREDICATE_NORMAL; 4140b8e80941Smrg inst->conditional_mod = BRW_CONDITIONAL_NONE; 4141b8e80941Smrg 4142b8e80941Smrg progress = true; 4143b8e80941Smrg } 4144b8e80941Smrg } 4145b8e80941Smrg 4146b8e80941Smrg if (progress) 4147b8e80941Smrg invalidate_live_intervals(); 4148b8e80941Smrg 4149b8e80941Smrg return progress; 4150b8e80941Smrg} 4151b8e80941Smrg 4152b8e80941Smrgstatic void 4153b8e80941Smrgsetup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key, 4154b8e80941Smrg fs_reg *dst, fs_reg color, unsigned components) 4155b8e80941Smrg{ 4156b8e80941Smrg if (key->clamp_fragment_color) { 4157b8e80941Smrg fs_reg tmp = 
bld.vgrf(BRW_REGISTER_TYPE_F, 4); 4158b8e80941Smrg assert(color.type == BRW_REGISTER_TYPE_F); 4159b8e80941Smrg 4160b8e80941Smrg for (unsigned i = 0; i < components; i++) 4161b8e80941Smrg set_saturate(true, 4162b8e80941Smrg bld.MOV(offset(tmp, bld, i), offset(color, bld, i))); 4163b8e80941Smrg 4164b8e80941Smrg color = tmp; 4165b8e80941Smrg } 4166b8e80941Smrg 4167b8e80941Smrg for (unsigned i = 0; i < components; i++) 4168b8e80941Smrg dst[i] = offset(color, bld, i); 4169b8e80941Smrg} 4170b8e80941Smrg 4171b8e80941Smrgstatic void 4172b8e80941Smrglower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst, 4173b8e80941Smrg const struct brw_wm_prog_data *prog_data, 4174b8e80941Smrg const brw_wm_prog_key *key, 4175b8e80941Smrg const fs_visitor::thread_payload &payload) 4176b8e80941Smrg{ 4177b8e80941Smrg assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM); 4178b8e80941Smrg const gen_device_info *devinfo = bld.shader->devinfo; 4179b8e80941Smrg const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0]; 4180b8e80941Smrg const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1]; 4181b8e80941Smrg const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA]; 4182b8e80941Smrg const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH]; 4183b8e80941Smrg const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH]; 4184b8e80941Smrg const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL]; 4185b8e80941Smrg fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK]; 4186b8e80941Smrg const unsigned components = 4187b8e80941Smrg inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud; 4188b8e80941Smrg 4189b8e80941Smrg /* We can potentially have a message length of up to 15, so we have to set 4190b8e80941Smrg * base_mrf to either 0 or 1 in order to fit in m0..m15. 
4191b8e80941Smrg */ 4192b8e80941Smrg fs_reg sources[15]; 4193b8e80941Smrg int header_size = 2, payload_header_size; 4194b8e80941Smrg unsigned length = 0; 4195b8e80941Smrg 4196b8e80941Smrg if (devinfo->gen < 6) { 4197b8e80941Smrg /* TODO: Support SIMD32 on gen4-5 */ 4198b8e80941Smrg assert(bld.group() < 16); 4199b8e80941Smrg 4200b8e80941Smrg /* For gen4-5, we always have a header consisting of g0 and g1. We have 4201b8e80941Smrg * an implied MOV from g0,g1 to the start of the message. The MOV from 4202b8e80941Smrg * g0 is handled by the hardware and the MOV from g1 is provided by the 4203b8e80941Smrg * generator. This is required because, on gen4-5, the generator may 4204b8e80941Smrg * generate two write messages with different message lengths in order 4205b8e80941Smrg * to handle AA data properly. 4206b8e80941Smrg * 4207b8e80941Smrg * Also, since the pixel mask goes in the g0 portion of the message and 4208b8e80941Smrg * since render target writes are the last thing in the shader, we write 4209b8e80941Smrg * the pixel mask directly into g0 and it will get copied as part of the 4210b8e80941Smrg * implied write. 4211b8e80941Smrg */ 4212b8e80941Smrg if (prog_data->uses_kill) { 4213b8e80941Smrg bld.exec_all().group(1, 0) 4214b8e80941Smrg .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 4215b8e80941Smrg brw_flag_reg(0, 1)); 4216b8e80941Smrg } 4217b8e80941Smrg 4218b8e80941Smrg assert(length == 0); 4219b8e80941Smrg length = 2; 4220b8e80941Smrg } else if ((devinfo->gen <= 7 && !devinfo->is_haswell && 4221b8e80941Smrg prog_data->uses_kill) || 4222b8e80941Smrg color1.file != BAD_FILE || 4223b8e80941Smrg key->nr_color_regions > 1) { 4224b8e80941Smrg /* From the Sandy Bridge PRM, volume 4, page 198: 4225b8e80941Smrg * 4226b8e80941Smrg * "Dispatched Pixel Enables. One bit per pixel indicating 4227b8e80941Smrg * which pixels were originally enabled when the thread was 4228b8e80941Smrg * dispatched. 
This field is only required for the end-of- 4229b8e80941Smrg * thread message and on all dual-source messages." 4230b8e80941Smrg */ 4231b8e80941Smrg const fs_builder ubld = bld.exec_all().group(8, 0); 4232b8e80941Smrg 4233b8e80941Smrg fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); 4234b8e80941Smrg if (bld.group() < 16) { 4235b8e80941Smrg /* The header starts off as g0 and g1 for the first half */ 4236b8e80941Smrg ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0), 4237b8e80941Smrg BRW_REGISTER_TYPE_UD)); 4238b8e80941Smrg } else { 4239b8e80941Smrg /* The header starts off as g0 and g2 for the second half */ 4240b8e80941Smrg assert(bld.group() < 32); 4241b8e80941Smrg const fs_reg header_sources[2] = { 4242b8e80941Smrg retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD), 4243b8e80941Smrg retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD), 4244b8e80941Smrg }; 4245b8e80941Smrg ubld.LOAD_PAYLOAD(header, header_sources, 2, 0); 4246b8e80941Smrg } 4247b8e80941Smrg 4248b8e80941Smrg uint32_t g00_bits = 0; 4249b8e80941Smrg 4250b8e80941Smrg /* Set "Source0 Alpha Present to RenderTarget" bit in message 4251b8e80941Smrg * header. 4252b8e80941Smrg */ 4253b8e80941Smrg if (inst->target > 0 && prog_data->replicate_alpha) 4254b8e80941Smrg g00_bits |= 1 << 11; 4255b8e80941Smrg 4256b8e80941Smrg /* Set computes stencil to render target */ 4257b8e80941Smrg if (prog_data->computed_stencil) 4258b8e80941Smrg g00_bits |= 1 << 14; 4259b8e80941Smrg 4260b8e80941Smrg if (g00_bits) { 4261b8e80941Smrg /* OR extra bits into g0.0 */ 4262b8e80941Smrg ubld.group(1, 0).OR(component(header, 0), 4263b8e80941Smrg retype(brw_vec1_grf(0, 0), 4264b8e80941Smrg BRW_REGISTER_TYPE_UD), 4265b8e80941Smrg brw_imm_ud(g00_bits)); 4266b8e80941Smrg } 4267b8e80941Smrg 4268b8e80941Smrg /* Set the render target index for choosing BLEND_STATE. 
*/ 4269b8e80941Smrg if (inst->target > 0) { 4270b8e80941Smrg ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target)); 4271b8e80941Smrg } 4272b8e80941Smrg 4273b8e80941Smrg if (prog_data->uses_kill) { 4274b8e80941Smrg assert(bld.group() < 16); 4275b8e80941Smrg ubld.group(1, 0).MOV(retype(component(header, 15), 4276b8e80941Smrg BRW_REGISTER_TYPE_UW), 4277b8e80941Smrg brw_flag_reg(0, 1)); 4278b8e80941Smrg } 4279b8e80941Smrg 4280b8e80941Smrg assert(length == 0); 4281b8e80941Smrg sources[0] = header; 4282b8e80941Smrg sources[1] = horiz_offset(header, 8); 4283b8e80941Smrg length = 2; 4284b8e80941Smrg } 4285b8e80941Smrg assert(length == 0 || length == 2); 4286b8e80941Smrg header_size = length; 4287b8e80941Smrg 4288b8e80941Smrg if (payload.aa_dest_stencil_reg[0]) { 4289b8e80941Smrg assert(inst->group < 16); 4290b8e80941Smrg sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1)); 4291b8e80941Smrg bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha") 4292b8e80941Smrg .MOV(sources[length], 4293b8e80941Smrg fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0))); 4294b8e80941Smrg length++; 4295b8e80941Smrg } 4296b8e80941Smrg 4297b8e80941Smrg if (src0_alpha.file != BAD_FILE) { 4298b8e80941Smrg for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) { 4299b8e80941Smrg const fs_builder &ubld = bld.exec_all().group(8, i) 4300b8e80941Smrg .annotate("FB write src0 alpha"); 4301b8e80941Smrg const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F); 4302b8e80941Smrg ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8)); 4303b8e80941Smrg setup_color_payload(ubld, key, &sources[length], tmp, 1); 4304b8e80941Smrg length++; 4305b8e80941Smrg } 4306b8e80941Smrg } else if (prog_data->replicate_alpha && inst->target != 0) { 4307b8e80941Smrg /* Handle the case when fragment shader doesn't write to draw buffer 4308b8e80941Smrg * zero. No need to call setup_color_payload() for src0_alpha because 4309b8e80941Smrg * alpha value will be undefined. 
4310b8e80941Smrg */ 4311b8e80941Smrg length += bld.dispatch_width() / 8; 4312b8e80941Smrg } 4313b8e80941Smrg 4314b8e80941Smrg if (sample_mask.file != BAD_FILE) { 4315b8e80941Smrg sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1), 4316b8e80941Smrg BRW_REGISTER_TYPE_UD); 4317b8e80941Smrg 4318b8e80941Smrg /* Hand over gl_SampleMask. Only the lower 16 bits of each channel are 4319b8e80941Smrg * relevant. Since it's unsigned single words one vgrf is always 4320b8e80941Smrg * 16-wide, but only the lower or higher 8 channels will be used by the 4321b8e80941Smrg * hardware when doing a SIMD8 write depending on whether we have 4322b8e80941Smrg * selected the subspans for the first or second half respectively. 4323b8e80941Smrg */ 4324b8e80941Smrg assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4); 4325b8e80941Smrg sample_mask.type = BRW_REGISTER_TYPE_UW; 4326b8e80941Smrg sample_mask.stride *= 2; 4327b8e80941Smrg 4328b8e80941Smrg bld.exec_all().annotate("FB write oMask") 4329b8e80941Smrg .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW), 4330b8e80941Smrg inst->group % 16), 4331b8e80941Smrg sample_mask); 4332b8e80941Smrg length++; 4333b8e80941Smrg } 4334b8e80941Smrg 4335b8e80941Smrg payload_header_size = length; 4336b8e80941Smrg 4337b8e80941Smrg setup_color_payload(bld, key, &sources[length], color0, components); 4338b8e80941Smrg length += 4; 4339b8e80941Smrg 4340b8e80941Smrg if (color1.file != BAD_FILE) { 4341b8e80941Smrg setup_color_payload(bld, key, &sources[length], color1, components); 4342b8e80941Smrg length += 4; 4343b8e80941Smrg } 4344b8e80941Smrg 4345b8e80941Smrg if (src_depth.file != BAD_FILE) { 4346b8e80941Smrg sources[length] = src_depth; 4347b8e80941Smrg length++; 4348b8e80941Smrg } 4349b8e80941Smrg 4350b8e80941Smrg if (dst_depth.file != BAD_FILE) { 4351b8e80941Smrg sources[length] = dst_depth; 4352b8e80941Smrg length++; 4353b8e80941Smrg } 4354b8e80941Smrg 4355b8e80941Smrg if (src_stencil.file != BAD_FILE) { 
4356b8e80941Smrg assert(devinfo->gen >= 9); 4357b8e80941Smrg assert(bld.dispatch_width() == 8); 4358b8e80941Smrg 4359b8e80941Smrg /* XXX: src_stencil is only available on gen9+. dst_depth is never 4360b8e80941Smrg * available on gen9+. As such it's impossible to have both enabled at the 4361b8e80941Smrg * same time and therefore length cannot overrun the array. 4362b8e80941Smrg */ 4363b8e80941Smrg assert(length < 15); 4364b8e80941Smrg 4365b8e80941Smrg sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD); 4366b8e80941Smrg bld.exec_all().annotate("FB write OS") 4367b8e80941Smrg .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB), 4368b8e80941Smrg subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0)); 4369b8e80941Smrg length++; 4370b8e80941Smrg } 4371b8e80941Smrg 4372b8e80941Smrg fs_inst *load; 4373b8e80941Smrg if (devinfo->gen >= 7) { 4374b8e80941Smrg /* Send from the GRF */ 4375b8e80941Smrg fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F); 4376b8e80941Smrg load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size); 4377b8e80941Smrg payload.nr = bld.shader->alloc.allocate(regs_written(load)); 4378b8e80941Smrg load->dst = payload; 4379b8e80941Smrg 4380b8e80941Smrg inst->src[0] = payload; 4381b8e80941Smrg inst->resize_sources(1); 4382b8e80941Smrg } else { 4383b8e80941Smrg /* Send from the MRF */ 4384b8e80941Smrg load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F), 4385b8e80941Smrg sources, length, payload_header_size); 4386b8e80941Smrg 4387b8e80941Smrg /* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD 4388b8e80941Smrg * will do this for us if we just give it a COMPR4 destination. 
4389b8e80941Smrg */ 4390b8e80941Smrg if (devinfo->gen < 6 && bld.dispatch_width() == 16) 4391b8e80941Smrg load->dst.nr |= BRW_MRF_COMPR4; 4392b8e80941Smrg 4393b8e80941Smrg if (devinfo->gen < 6) { 4394b8e80941Smrg /* Set up src[0] for the implied MOV from grf0-1 */ 4395b8e80941Smrg inst->resize_sources(1); 4396b8e80941Smrg inst->src[0] = brw_vec8_grf(0, 0); 4397b8e80941Smrg } else { 4398b8e80941Smrg inst->resize_sources(0); 4399b8e80941Smrg } 4400b8e80941Smrg inst->base_mrf = 1; 4401b8e80941Smrg } 4402b8e80941Smrg 4403b8e80941Smrg inst->opcode = FS_OPCODE_FB_WRITE; 4404b8e80941Smrg inst->mlen = regs_written(load); 4405b8e80941Smrg inst->header_size = header_size; 4406b8e80941Smrg} 4407b8e80941Smrg 4408b8e80941Smrgstatic void 4409b8e80941Smrglower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst) 4410b8e80941Smrg{ 4411b8e80941Smrg const fs_builder &ubld = bld.exec_all().group(8, 0); 4412b8e80941Smrg const unsigned length = 2; 4413b8e80941Smrg const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length); 4414b8e80941Smrg 4415b8e80941Smrg if (bld.group() < 16) { 4416b8e80941Smrg ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0), 4417b8e80941Smrg BRW_REGISTER_TYPE_UD)); 4418b8e80941Smrg } else { 4419b8e80941Smrg assert(bld.group() < 32); 4420b8e80941Smrg const fs_reg header_sources[] = { 4421b8e80941Smrg retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD), 4422b8e80941Smrg retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD) 4423b8e80941Smrg }; 4424b8e80941Smrg ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0); 4425b8e80941Smrg } 4426b8e80941Smrg 4427b8e80941Smrg inst->resize_sources(1); 4428b8e80941Smrg inst->src[0] = header; 4429b8e80941Smrg inst->opcode = FS_OPCODE_FB_READ; 4430b8e80941Smrg inst->mlen = length; 4431b8e80941Smrg inst->header_size = length; 4432b8e80941Smrg} 4433b8e80941Smrg 4434b8e80941Smrgstatic void 4435b8e80941Smrglower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op, 4436b8e80941Smrg const 
fs_reg &coordinate, 4437b8e80941Smrg const fs_reg &shadow_c, 4438b8e80941Smrg const fs_reg &lod, const fs_reg &lod2, 4439b8e80941Smrg const fs_reg &surface, 4440b8e80941Smrg const fs_reg &sampler, 4441b8e80941Smrg unsigned coord_components, 4442b8e80941Smrg unsigned grad_components) 4443b8e80941Smrg{ 4444b8e80941Smrg const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB || 4445b8e80941Smrg op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS); 4446b8e80941Smrg fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F); 4447b8e80941Smrg fs_reg msg_end = msg_begin; 4448b8e80941Smrg 4449b8e80941Smrg /* g0 header. */ 4450b8e80941Smrg msg_end = offset(msg_end, bld.group(8, 0), 1); 4451b8e80941Smrg 4452b8e80941Smrg for (unsigned i = 0; i < coord_components; i++) 4453b8e80941Smrg bld.MOV(retype(offset(msg_end, bld, i), coordinate.type), 4454b8e80941Smrg offset(coordinate, bld, i)); 4455b8e80941Smrg 4456b8e80941Smrg msg_end = offset(msg_end, bld, coord_components); 4457b8e80941Smrg 4458b8e80941Smrg /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8 4459b8e80941Smrg * require all three components to be present and zero if they are unused. 4460b8e80941Smrg */ 4461b8e80941Smrg if (coord_components > 0 && 4462b8e80941Smrg (has_lod || shadow_c.file != BAD_FILE || 4463b8e80941Smrg (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) { 4464b8e80941Smrg for (unsigned i = coord_components; i < 3; i++) 4465b8e80941Smrg bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f)); 4466b8e80941Smrg 4467b8e80941Smrg msg_end = offset(msg_end, bld, 3 - coord_components); 4468b8e80941Smrg } 4469b8e80941Smrg 4470b8e80941Smrg if (op == SHADER_OPCODE_TXD) { 4471b8e80941Smrg /* TXD unsupported in SIMD16 mode. 
*/ 4472b8e80941Smrg assert(bld.dispatch_width() == 8); 4473b8e80941Smrg 4474b8e80941Smrg /* the slots for u and v are always present, but r is optional */ 4475b8e80941Smrg if (coord_components < 2) 4476b8e80941Smrg msg_end = offset(msg_end, bld, 2 - coord_components); 4477b8e80941Smrg 4478b8e80941Smrg /* P = u, v, r 4479b8e80941Smrg * dPdx = dudx, dvdx, drdx 4480b8e80941Smrg * dPdy = dudy, dvdy, drdy 4481b8e80941Smrg * 4482b8e80941Smrg * 1-arg: Does not exist. 4483b8e80941Smrg * 4484b8e80941Smrg * 2-arg: dudx dvdx dudy dvdy 4485b8e80941Smrg * dPdx.x dPdx.y dPdy.x dPdy.y 4486b8e80941Smrg * m4 m5 m6 m7 4487b8e80941Smrg * 4488b8e80941Smrg * 3-arg: dudx dvdx drdx dudy dvdy drdy 4489b8e80941Smrg * dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z 4490b8e80941Smrg * m5 m6 m7 m8 m9 m10 4491b8e80941Smrg */ 4492b8e80941Smrg for (unsigned i = 0; i < grad_components; i++) 4493b8e80941Smrg bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i)); 4494b8e80941Smrg 4495b8e80941Smrg msg_end = offset(msg_end, bld, MAX2(grad_components, 2)); 4496b8e80941Smrg 4497b8e80941Smrg for (unsigned i = 0; i < grad_components; i++) 4498b8e80941Smrg bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i)); 4499b8e80941Smrg 4500b8e80941Smrg msg_end = offset(msg_end, bld, MAX2(grad_components, 2)); 4501b8e80941Smrg } 4502b8e80941Smrg 4503b8e80941Smrg if (has_lod) { 4504b8e80941Smrg /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without* 4505b8e80941Smrg * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode. 4506b8e80941Smrg */ 4507b8e80941Smrg assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 : 4508b8e80941Smrg bld.dispatch_width() == 16); 4509b8e80941Smrg 4510b8e80941Smrg const brw_reg_type type = 4511b8e80941Smrg (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ? 
          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
      bld.MOV(retype(msg_end, type), lod);
      msg_end = offset(msg_end, bld, 1);
   }

   if (shadow_c.file != BAD_FILE) {
      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         bld.MOV(msg_end, brw_imm_f(0.0f));
         msg_end = offset(msg_end, bld, 1);
      }

      bld.MOV(msg_end, shadow_c);
      msg_end = offset(msg_end, bld, 1);
   }

   /* Rewrite the logical instruction in place into the physical sampler
    * message: the payload lives in MRFs starting at base_mrf, and mlen is
    * the distance in MRF registers between the start and the end of the
    * message we just assembled.
    */
   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = msg_begin.nr;
   inst->mlen = msg_end.nr - msg_begin.nr;
   inst->header_size = 1;
}

/**
 * Lower a logical sampler instruction to the Gen5/6 MRF-based sampler
 * message layout: an optional m1 header (only when a texel offset is
 * present), the texture coordinates, an optional shadow comparator, and a
 * per-opcode LOD / gradient / sample-index tail.
 */
static void
lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                const fs_reg &lod, const fs_reg &lod2,
                                const fs_reg &sample_index,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   /* The payload normally starts at m2; when a header is required the
    * message is moved down one register so the header occupies m1.
    */
   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
   fs_reg msg_coords = message;
   unsigned header_size = 0;

   if (inst->offset != 0) {
      /* The offsets set up by the visitor are in the m1 header, so we can't
       * go headerless.
       */
      header_size = 1;
      message.nr--;
   }

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   /* msg_end tracks the running end of the payload; the LOD info defaults
    * to the slot after four coordinate components.
    */
   fs_reg msg_end = offset(msg_coords, bld, coord_components);
   fs_reg msg_lod = offset(msg_coords, bld, 4);

   if (shadow_c.file != BAD_FILE) {
      /* The shadow comparator takes the first LOD slot; push the LOD info
       * back by one.
       */
      fs_reg msg_shadow = msg_lod;
      bld.MOV(msg_shadow, shadow_c);
      msg_lod = offset(msg_shadow, bld, 1);
      msg_end = msg_lod;
   }

   switch (op) {
   case SHADER_OPCODE_TXL:
   case FS_OPCODE_TXB:
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXD:
      /**
       * P = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx dudy dvdx dvdy drdx drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      msg_end = msg_lod;
      for (unsigned i = 0; i < grad_components; i++) {
         bld.MOV(msg_end, offset(lod, bld, i));
         msg_end = offset(msg_end, bld, 1);

         bld.MOV(msg_end, offset(lod2, bld, i));
         msg_end = offset(msg_end, bld, 1);
      }
      break;
   case SHADER_OPCODE_TXS:
      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF:
      /* ld: the integer LOD goes in the slot right after the three
       * coordinate components.
       */
      msg_lod = offset(msg_coords, bld, 3);
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF_CMS:
      msg_lod = offset(msg_coords, bld, 3);
      /* lod */
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
      /* sample index */
      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
      msg_end = offset(msg_lod, bld, 2);
      break;
   default:
      break;
   }

   /* Convert the logical instruction into the physical MRF-based send. */
   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = message.nr;
   inst->mlen = msg_end.nr - message.nr;
   inst->header_size = header_size;

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

/**
 * Return whether the sampler index might not fit in the 4-bit sampler
 * field of the message descriptor, so the sampler state pointer has to be
 * adjusted in the message header instead.  Only Haswell and later can take
 * this path; earlier parts always return false.
 */
static bool
is_high_sampler(const struct gen_device_info *devinfo, const fs_reg &sampler)
{
   if (devinfo->gen < 8 && !devinfo->is_haswell)
      return false;

   /* A non-immediate sampler index could be anything at run time, so it
    * must be treated as potentially high.
    */
   return sampler.file != IMM || sampler.ud >= 16;
}

/**
 * Map an IR texturing opcode (plus the shadow-compare flag) to the
 * hardware sampler message type encoding for Gen5 and later.
 */
static unsigned
sampler_msg_type(const gen_device_info *devinfo,
                 opcode opcode, bool shadow_compare)
{
   assert(devinfo->gen >= 5);
   switch (opcode) {
   case SHADER_OPCODE_TEX:
      return shadow_compare ?
GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE : 4654b8e80941Smrg GEN5_SAMPLER_MESSAGE_SAMPLE; 4655b8e80941Smrg case FS_OPCODE_TXB: 4656b8e80941Smrg return shadow_compare ? GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE : 4657b8e80941Smrg GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS; 4658b8e80941Smrg case SHADER_OPCODE_TXL: 4659b8e80941Smrg return shadow_compare ? GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE : 4660b8e80941Smrg GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; 4661b8e80941Smrg case SHADER_OPCODE_TXL_LZ: 4662b8e80941Smrg return shadow_compare ? GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ : 4663b8e80941Smrg GEN9_SAMPLER_MESSAGE_SAMPLE_LZ; 4664b8e80941Smrg case SHADER_OPCODE_TXS: 4665b8e80941Smrg case SHADER_OPCODE_IMAGE_SIZE_LOGICAL: 4666b8e80941Smrg return GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO; 4667b8e80941Smrg case SHADER_OPCODE_TXD: 4668b8e80941Smrg assert(!shadow_compare || devinfo->gen >= 8 || devinfo->is_haswell); 4669b8e80941Smrg return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE : 4670b8e80941Smrg GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS; 4671b8e80941Smrg case SHADER_OPCODE_TXF: 4672b8e80941Smrg return GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 4673b8e80941Smrg case SHADER_OPCODE_TXF_LZ: 4674b8e80941Smrg assert(devinfo->gen >= 9); 4675b8e80941Smrg return GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ; 4676b8e80941Smrg case SHADER_OPCODE_TXF_CMS_W: 4677b8e80941Smrg assert(devinfo->gen >= 9); 4678b8e80941Smrg return GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; 4679b8e80941Smrg case SHADER_OPCODE_TXF_CMS: 4680b8e80941Smrg return devinfo->gen >= 7 ? 
GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS : 4681b8e80941Smrg GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 4682b8e80941Smrg case SHADER_OPCODE_TXF_UMS: 4683b8e80941Smrg assert(devinfo->gen >= 7); 4684b8e80941Smrg return GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS; 4685b8e80941Smrg case SHADER_OPCODE_TXF_MCS: 4686b8e80941Smrg assert(devinfo->gen >= 7); 4687b8e80941Smrg return GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; 4688b8e80941Smrg case SHADER_OPCODE_LOD: 4689b8e80941Smrg return GEN5_SAMPLER_MESSAGE_LOD; 4690b8e80941Smrg case SHADER_OPCODE_TG4: 4691b8e80941Smrg assert(devinfo->gen >= 7); 4692b8e80941Smrg return shadow_compare ? GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C : 4693b8e80941Smrg GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4; 4694b8e80941Smrg break; 4695b8e80941Smrg case SHADER_OPCODE_TG4_OFFSET: 4696b8e80941Smrg assert(devinfo->gen >= 7); 4697b8e80941Smrg return shadow_compare ? GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C : 4698b8e80941Smrg GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; 4699b8e80941Smrg case SHADER_OPCODE_SAMPLEINFO: 4700b8e80941Smrg return GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; 4701b8e80941Smrg default: 4702b8e80941Smrg unreachable("not reached"); 4703b8e80941Smrg } 4704b8e80941Smrg} 4705b8e80941Smrg 4706b8e80941Smrgstatic void 4707b8e80941Smrglower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op, 4708b8e80941Smrg const fs_reg &coordinate, 4709b8e80941Smrg const fs_reg &shadow_c, 4710b8e80941Smrg fs_reg lod, const fs_reg &lod2, 4711b8e80941Smrg const fs_reg &min_lod, 4712b8e80941Smrg const fs_reg &sample_index, 4713b8e80941Smrg const fs_reg &mcs, 4714b8e80941Smrg const fs_reg &surface, 4715b8e80941Smrg const fs_reg &sampler, 4716b8e80941Smrg const fs_reg &surface_handle, 4717b8e80941Smrg const fs_reg &sampler_handle, 4718b8e80941Smrg const fs_reg &tg4_offset, 4719b8e80941Smrg unsigned coord_components, 4720b8e80941Smrg unsigned grad_components) 4721b8e80941Smrg{ 4722b8e80941Smrg const gen_device_info *devinfo = bld.shader->devinfo; 4723b8e80941Smrg const 
brw_stage_prog_data *prog_data = bld.shader->stage_prog_data;
   /* Number of hardware registers per logical payload slot:
    * 1 in SIMD8, 2 in SIMD16.
    */
   unsigned reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);

   /* We must have exactly one of surface/sampler and surface/sampler_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
       inst->offset != 0 || inst->eot ||
       op == SHADER_OPCODE_SAMPLEINFO ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler)) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle. The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
      header_size = 1;
      length++;

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      if (!inst->eot && regs_written(inst) != 4 * reg_width) {
         assert(regs_written(inst) % reg_width == 0);
         unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }

      /* Build the actual header */
      const fs_builder ubld = bld.exec_all().group(8, 0);
      const fs_builder ubld1 = ubld.group(1, 0);
      /* Start from a copy of g0 and patch the fields that differ. */
      ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      if (inst->offset) {
         ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied. Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), brw_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         ubld1.MOV(component(header, 3), sampler_handle);
      } else if (is_high_sampler(devinfo, sampler)) {
         if (sampler.file == BRW_IMMEDIATE_VALUE) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

            ubld1.ADD(component(header, 3),
                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                      brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            /* Dynamic sampler index: compute (sampler & ~0xf) * 16 at run
             * time and add it to the sampler state pointer from g0.3.
             */
            fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, brw_imm_ud(4));
            ubld1.ADD(component(header, 3),
                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                      tmp);
         }
      }
   }

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXL:
      /* An explicit LOD of zero can use the more compact sample_lz
       * message on Gen9+, which carries no LOD parameter at all.
       */
      if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
         op = SHADER_OPCODE_TXL_LZ;
         break;
      }
      bld.MOV(sources[length], lod);
      length++;
      break;
   case SHADER_OPCODE_TXD:
      /* TXD should have been lowered in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TXS:
      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
      length++;
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
      length++;
      break;
   case SHADER_OPCODE_TXF:
      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
       * On Gen9 they are u, v, lod, r
       */
      bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);

      if (devinfo->gen >= 9) {
         if (coord_components >= 2) {
            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
                    offset(coordinate, bld, 1));
         } else {
            /* v slot is required even for 1-component coordinates. */
            sources[length] = brw_imm_d(0);
         }
         length++;
      }

      /* ld_lz on Gen9+ omits the LOD parameter entirely when it is zero. */
      if (devinfo->gen >= 9 && lod.is_zero()) {
         op = SHADER_OPCODE_TXF_LZ;
      } else {
         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
         length++;
      }

      for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++)
         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;

   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
      if (op == SHADER_OPCODE_TXF_UMS ||
          op == SHADER_OPCODE_TXF_CMS ||
          op == SHADER_OPCODE_TXF_CMS_W) {
         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
         length++;
      }

      if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
         /* Data from the multisample control surface. */
         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
         length++;

         /* On Gen9+ we'll use ld2dms_w instead which has two registers for
          * the MCS data.
          */
         if (op == SHADER_OPCODE_TXF_CMS_W) {
            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
                    mcs.file == IMM ?
                    mcs :
                    offset(mcs, bld, 1));
            length++;
         }
      }

      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) /* offu, offv */
         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
                 offset(tg4_offset, bld, i));

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(sources[length++], offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE) {
      /* Account for all of the missing coordinate sources */
      length += 4 - coord_components;
      if (op == SHADER_OPCODE_TXD)
         length += (3 - grad_components) * 2;

      bld.MOV(sources[length++], min_lod);
   }

   /* The header occupies a single register even in SIMD16, so it is only
    * counted once rather than reg_width times.
    */
   unsigned mlen;
   if (reg_width == 2)
      mlen = length * reg_width - header_size;
   else
      mlen = length * reg_width;

   const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
                                     BRW_REGISTER_TYPE_F);
   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);

   /* Generate the SEND. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;

   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare);
   const unsigned simd_mode =
      inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                             BRW_SAMPLER_SIMD_MODE_SIMD16;

   uint32_t base_binding_table_index;
   switch (op) {
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
      base_binding_table_index = prog_data->binding_table.gather_texture_start;
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      base_binding_table_index = prog_data->binding_table.image_start;
      break;
   default:
      base_binding_table_index = prog_data->binding_table.texture_start;
      break;
   }

   inst->sfid = BRW_SFID_SAMPLER;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      /* Fully immediate case: the whole descriptor is known at compile
       * time and both SEND address sources are zero.
       */
      inst->desc = brw_sampler_desc(devinfo,
                                    surface.ud + base_binding_table_index,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gen7+ */);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      assert(devinfo->gen >= 9);
      inst->desc = brw_sampler_desc(devinfo,
                                    GEN9_BTI_BINDLESS,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gen7+ */);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
       */
      if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
         inst->src[0] = brw_imm_ud(0);
      } else {
         /* Shift the dynamic sampler index into the sampler field of the
          * descriptor.
          */
         const fs_builder ubld = bld.group(1, 0).exec_all();
         fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(desc, sampler, brw_imm_ud(8));
         inst->src[0] = desc;
      }

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
   } else {
      /* Immediate portion of the descriptor */
      inst->desc = brw_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gen7+ */);
      const fs_builder ubld = bld.group(1, 0).exec_all();
      fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
         ubld.MUL(desc, surface, brw_imm_ud(0x101));
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, brw_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      if (base_binding_table_index)
         ubld.ADD(desc, desc, brw_imm_ud(base_binding_table_index));
      /* Mask the dynamic part down to the surface and sampler fields. */
      ubld.AND(desc, desc, brw_imm_ud(0xfff));

      inst->src[0] = component(desc, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }

   inst->src[2] = src_payload;
   inst->resize_sources(3);

   if (inst->eot) {
      /* EOT sampler messages don't make sense to split because it would
       * involve ending half of the thread early.
       */
      assert(inst->group == 0);
      /* We need to use SENDC for EOT sampler messages */
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   }

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

/**
 * Lower a logical sampler instruction by dispatching to the lowering that
 * matches the hardware generation's sampler message layout.
 */
static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
{
   const gen_device_info *devinfo = bld.shader->devinfo;
   /* Unpack the operands of the logical instruction by well-known source
    * slot.
    */
   const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
   const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
   const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
   const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
   const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
   const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
   const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
   const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
   const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
   const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
   const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
   /* The component counts are carried as immediate sources. */
   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;

   if (devinfo->gen >= 7) {
      lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, min_lod,
                                      sample_index,
                                      mcs, surface, sampler,
                                      surface_handle, sampler_handle,
                                      tg4_offset,
                                      coord_components, grad_components);
   } else if (devinfo->gen >= 5) {
      lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, sample_index,
                                      surface, sampler,
                                      coord_components, grad_components);
   } else {
      lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2,
                                      surface, sampler,
                                      coord_components, grad_components);
   }
}

/**
 * Initialize the header present in some typed and untyped surface
 * messages.
 */
static fs_reg
emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
{
   /* Build the header with an exec-all SIMD8 group so every channel of the
    * single header register is written.
    */
   fs_builder ubld = bld.exec_all().group(8, 0);
   const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   /* Zero the register, then place the sample mask in its last dword. */
   ubld.MOV(dst, brw_imm_d(0));
   ubld.group(1, 0).MOV(component(dst, 7), sample_mask);
   return dst;
}

static void
lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const gen_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   /* dims is not read by this lowering but is part of the logical source
    * layout.
    */
   const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   assert(arg.file == IMM);

   /* We must have exactly one of surface and surface_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));

   /* Calculate the total number of components of the payload.
*/ 5152b8e80941Smrg const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS); 5153b8e80941Smrg const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA); 5154b8e80941Smrg 5155b8e80941Smrg const bool is_typed_access = 5156b8e80941Smrg inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL || 5157b8e80941Smrg inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL || 5158b8e80941Smrg inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL; 5159b8e80941Smrg 5160b8e80941Smrg /* From the BDW PRM Volume 7, page 147: 5161b8e80941Smrg * 5162b8e80941Smrg * "For the Data Cache Data Port*, the header must be present for the 5163b8e80941Smrg * following message types: [...] Typed read/write/atomics" 5164b8e80941Smrg * 5165b8e80941Smrg * Earlier generations have a similar wording. Because of this restriction 5166b8e80941Smrg * we don't attempt to implement sample masks via predication for such 5167b8e80941Smrg * messages prior to Gen9, since we have to provide a header anyway. On 5168b8e80941Smrg * Gen11+ the header has been removed so we can only use predication. 5169b8e80941Smrg */ 5170b8e80941Smrg const unsigned header_sz = devinfo->gen < 9 && is_typed_access ? 1 : 0; 5171b8e80941Smrg 5172b8e80941Smrg const bool has_side_effects = inst->has_side_effects(); 5173b8e80941Smrg fs_reg sample_mask = has_side_effects ? bld.sample_mask_reg() : 5174b8e80941Smrg fs_reg(brw_imm_d(0xffff)); 5175b8e80941Smrg 5176b8e80941Smrg fs_reg payload, payload2; 5177b8e80941Smrg unsigned mlen, ex_mlen = 0; 5178b8e80941Smrg if (devinfo->gen >= 9) { 5179b8e80941Smrg /* We have split sends on gen9 and above */ 5180b8e80941Smrg assert(header_sz == 0); 5181b8e80941Smrg payload = bld.move_to_vgrf(addr, addr_sz); 5182b8e80941Smrg payload2 = bld.move_to_vgrf(src, src_sz); 5183b8e80941Smrg mlen = addr_sz * (inst->exec_size / 8); 5184b8e80941Smrg ex_mlen = src_sz * (inst->exec_size / 8); 5185b8e80941Smrg } else { 5186b8e80941Smrg /* Allocate space for the payload. 
*/ 5187b8e80941Smrg const unsigned sz = header_sz + addr_sz + src_sz; 5188b8e80941Smrg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz); 5189b8e80941Smrg fs_reg *const components = new fs_reg[sz]; 5190b8e80941Smrg unsigned n = 0; 5191b8e80941Smrg 5192b8e80941Smrg /* Construct the payload. */ 5193b8e80941Smrg if (header_sz) 5194b8e80941Smrg components[n++] = emit_surface_header(bld, sample_mask); 5195b8e80941Smrg 5196b8e80941Smrg for (unsigned i = 0; i < addr_sz; i++) 5197b8e80941Smrg components[n++] = offset(addr, bld, i); 5198b8e80941Smrg 5199b8e80941Smrg for (unsigned i = 0; i < src_sz; i++) 5200b8e80941Smrg components[n++] = offset(src, bld, i); 5201b8e80941Smrg 5202b8e80941Smrg bld.LOAD_PAYLOAD(payload, components, sz, header_sz); 5203b8e80941Smrg mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8; 5204b8e80941Smrg 5205b8e80941Smrg delete[] components; 5206b8e80941Smrg } 5207b8e80941Smrg 5208b8e80941Smrg /* Predicate the instruction on the sample mask if no header is 5209b8e80941Smrg * provided. 5210b8e80941Smrg */ 5211b8e80941Smrg if (!header_sz && sample_mask.file != BAD_FILE && 5212b8e80941Smrg sample_mask.file != IMM) { 5213b8e80941Smrg const fs_builder ubld = bld.group(1, 0).exec_all(); 5214b8e80941Smrg if (inst->predicate) { 5215b8e80941Smrg assert(inst->predicate == BRW_PREDICATE_NORMAL); 5216b8e80941Smrg assert(!inst->predicate_inverse); 5217b8e80941Smrg assert(inst->flag_subreg < 2); 5218b8e80941Smrg /* Combine the sample mask with the existing predicate by using a 5219b8e80941Smrg * vertical predication mode. 
5220b8e80941Smrg */ 5221b8e80941Smrg inst->predicate = BRW_PREDICATE_ALIGN1_ALLV; 5222b8e80941Smrg ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg + 2), 5223b8e80941Smrg sample_mask.type), 5224b8e80941Smrg sample_mask); 5225b8e80941Smrg } else { 5226b8e80941Smrg inst->flag_subreg = 2; 5227b8e80941Smrg inst->predicate = BRW_PREDICATE_NORMAL; 5228b8e80941Smrg inst->predicate_inverse = false; 5229b8e80941Smrg ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type), 5230b8e80941Smrg sample_mask); 5231b8e80941Smrg } 5232b8e80941Smrg } 5233b8e80941Smrg 5234b8e80941Smrg uint32_t sfid; 5235b8e80941Smrg switch (inst->opcode) { 5236b8e80941Smrg case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: 5237b8e80941Smrg case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: 5238b8e80941Smrg /* Byte scattered opcodes go through the normal data cache */ 5239b8e80941Smrg sfid = GEN7_SFID_DATAPORT_DATA_CACHE; 5240b8e80941Smrg break; 5241b8e80941Smrg 5242b8e80941Smrg case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: 5243b8e80941Smrg case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: 5244b8e80941Smrg case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: 5245b8e80941Smrg case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: 5246b8e80941Smrg /* Untyped Surface messages go through the data cache but the SFID value 5247b8e80941Smrg * changed on Haswell. 5248b8e80941Smrg */ 5249b8e80941Smrg sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? 5250b8e80941Smrg HSW_SFID_DATAPORT_DATA_CACHE_1 : 5251b8e80941Smrg GEN7_SFID_DATAPORT_DATA_CACHE); 5252b8e80941Smrg break; 5253b8e80941Smrg 5254b8e80941Smrg case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: 5255b8e80941Smrg case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: 5256b8e80941Smrg case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: 5257b8e80941Smrg /* Typed surface messages go through the render cache on IVB and the 5258b8e80941Smrg * data cache on HSW+. 5259b8e80941Smrg */ 5260b8e80941Smrg sfid = (devinfo->gen >= 8 || devinfo->is_haswell ? 
5261b8e80941Smrg HSW_SFID_DATAPORT_DATA_CACHE_1 : 5262b8e80941Smrg GEN6_SFID_DATAPORT_RENDER_CACHE); 5263b8e80941Smrg break; 5264b8e80941Smrg 5265b8e80941Smrg default: 5266b8e80941Smrg unreachable("Unsupported surface opcode"); 5267b8e80941Smrg } 5268b8e80941Smrg 5269b8e80941Smrg uint32_t desc; 5270b8e80941Smrg switch (inst->opcode) { 5271b8e80941Smrg case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: 5272b8e80941Smrg desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size, 5273b8e80941Smrg arg.ud, /* num_channels */ 5274b8e80941Smrg false /* write */); 5275b8e80941Smrg break; 5276b8e80941Smrg 5277b8e80941Smrg case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: 5278b8e80941Smrg desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size, 5279b8e80941Smrg arg.ud, /* num_channels */ 5280b8e80941Smrg true /* write */); 5281b8e80941Smrg break; 5282b8e80941Smrg 5283b8e80941Smrg case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: 5284b8e80941Smrg desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size, 5285b8e80941Smrg arg.ud, /* bit_size */ 5286b8e80941Smrg false /* write */); 5287b8e80941Smrg break; 5288b8e80941Smrg 5289b8e80941Smrg case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: 5290b8e80941Smrg desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size, 5291b8e80941Smrg arg.ud, /* bit_size */ 5292b8e80941Smrg true /* write */); 5293b8e80941Smrg break; 5294b8e80941Smrg 5295b8e80941Smrg case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: 5296b8e80941Smrg desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size, 5297b8e80941Smrg arg.ud, /* atomic_op */ 5298b8e80941Smrg !inst->dst.is_null()); 5299b8e80941Smrg break; 5300b8e80941Smrg 5301b8e80941Smrg case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: 5302b8e80941Smrg desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size, 5303b8e80941Smrg arg.ud, /* atomic_op */ 5304b8e80941Smrg !inst->dst.is_null()); 5305b8e80941Smrg break; 5306b8e80941Smrg 5307b8e80941Smrg case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: 
5308b8e80941Smrg desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group, 5309b8e80941Smrg arg.ud, /* num_channels */ 5310b8e80941Smrg false /* write */); 5311b8e80941Smrg break; 5312b8e80941Smrg 5313b8e80941Smrg case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: 5314b8e80941Smrg desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group, 5315b8e80941Smrg arg.ud, /* num_channels */ 5316b8e80941Smrg true /* write */); 5317b8e80941Smrg break; 5318b8e80941Smrg 5319b8e80941Smrg case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: 5320b8e80941Smrg desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group, 5321b8e80941Smrg arg.ud, /* atomic_op */ 5322b8e80941Smrg !inst->dst.is_null()); 5323b8e80941Smrg break; 5324b8e80941Smrg 5325b8e80941Smrg default: 5326b8e80941Smrg unreachable("Unknown surface logical instruction"); 5327b8e80941Smrg } 5328b8e80941Smrg 5329b8e80941Smrg /* Update the original instruction. */ 5330b8e80941Smrg inst->opcode = SHADER_OPCODE_SEND; 5331b8e80941Smrg inst->mlen = mlen; 5332b8e80941Smrg inst->ex_mlen = ex_mlen; 5333b8e80941Smrg inst->header_size = header_sz; 5334b8e80941Smrg inst->send_has_side_effects = has_side_effects; 5335b8e80941Smrg inst->send_is_volatile = !has_side_effects; 5336b8e80941Smrg 5337b8e80941Smrg /* Set up SFID and descriptors */ 5338b8e80941Smrg inst->sfid = sfid; 5339b8e80941Smrg inst->desc = desc; 5340b8e80941Smrg if (surface.file == IMM) { 5341b8e80941Smrg inst->desc |= surface.ud & 0xff; 5342b8e80941Smrg inst->src[0] = brw_imm_ud(0); 5343b8e80941Smrg inst->src[1] = brw_imm_ud(0); /* ex_desc */ 5344b8e80941Smrg } else if (surface_handle.file != BAD_FILE) { 5345b8e80941Smrg /* Bindless surface */ 5346b8e80941Smrg assert(devinfo->gen >= 9); 5347b8e80941Smrg inst->desc |= GEN9_BTI_BINDLESS; 5348b8e80941Smrg inst->src[0] = brw_imm_ud(0); 5349b8e80941Smrg 5350b8e80941Smrg /* We assume that the driver provided the handle in the top 20 bits so 5351b8e80941Smrg * we can use the surface handle directly 
as the extended descriptor. 5352b8e80941Smrg */ 5353b8e80941Smrg inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD); 5354b8e80941Smrg } else { 5355b8e80941Smrg const fs_builder ubld = bld.exec_all().group(1, 0); 5356b8e80941Smrg fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD); 5357b8e80941Smrg ubld.AND(tmp, surface, brw_imm_ud(0xff)); 5358b8e80941Smrg inst->src[0] = component(tmp, 0); 5359b8e80941Smrg inst->src[1] = brw_imm_ud(0); /* ex_desc */ 5360b8e80941Smrg } 5361b8e80941Smrg 5362b8e80941Smrg /* Finally, the payload */ 5363b8e80941Smrg inst->src[2] = payload; 5364b8e80941Smrg inst->src[3] = payload2; 5365b8e80941Smrg 5366b8e80941Smrg inst->resize_sources(4); 5367b8e80941Smrg} 5368b8e80941Smrg 5369b8e80941Smrgstatic void 5370b8e80941Smrglower_a64_logical_send(const fs_builder &bld, fs_inst *inst) 5371b8e80941Smrg{ 5372b8e80941Smrg const gen_device_info *devinfo = bld.shader->devinfo; 5373b8e80941Smrg 5374b8e80941Smrg const fs_reg &addr = inst->src[0]; 5375b8e80941Smrg const fs_reg &src = inst->src[1]; 5376b8e80941Smrg const unsigned src_comps = inst->components_read(1); 5377b8e80941Smrg assert(inst->src[2].file == IMM); 5378b8e80941Smrg const unsigned arg = inst->src[2].ud; 5379b8e80941Smrg const bool has_side_effects = inst->has_side_effects(); 5380b8e80941Smrg 5381b8e80941Smrg /* If the surface message has side effects and we're a fragment shader, we 5382b8e80941Smrg * have to predicate with the sample mask to avoid helper invocations. 
5383b8e80941Smrg */ 5384b8e80941Smrg if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) { 5385b8e80941Smrg inst->flag_subreg = 2; 5386b8e80941Smrg inst->predicate = BRW_PREDICATE_NORMAL; 5387b8e80941Smrg inst->predicate_inverse = false; 5388b8e80941Smrg 5389b8e80941Smrg fs_reg sample_mask = bld.sample_mask_reg(); 5390b8e80941Smrg const fs_builder ubld = bld.group(1, 0).exec_all(); 5391b8e80941Smrg ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type), 5392b8e80941Smrg sample_mask); 5393b8e80941Smrg } 5394b8e80941Smrg 5395b8e80941Smrg fs_reg payload, payload2; 5396b8e80941Smrg unsigned mlen, ex_mlen = 0; 5397b8e80941Smrg if (devinfo->gen >= 9) { 5398b8e80941Smrg /* On Skylake and above, we have SENDS */ 5399b8e80941Smrg mlen = 2 * (inst->exec_size / 8); 5400b8e80941Smrg ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE; 5401b8e80941Smrg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD); 5402b8e80941Smrg payload2 = retype(bld.move_to_vgrf(src, src_comps), 5403b8e80941Smrg BRW_REGISTER_TYPE_UD); 5404b8e80941Smrg } else { 5405b8e80941Smrg /* Add two because the address is 64-bit */ 5406b8e80941Smrg const unsigned dwords = 2 + src_comps; 5407b8e80941Smrg mlen = dwords * (inst->exec_size / 8); 5408b8e80941Smrg 5409b8e80941Smrg fs_reg sources[5]; 5410b8e80941Smrg 5411b8e80941Smrg sources[0] = addr; 5412b8e80941Smrg 5413b8e80941Smrg for (unsigned i = 0; i < src_comps; i++) 5414b8e80941Smrg sources[1 + i] = offset(src, bld, i); 5415b8e80941Smrg 5416b8e80941Smrg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords); 5417b8e80941Smrg bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0); 5418b8e80941Smrg } 5419b8e80941Smrg 5420b8e80941Smrg uint32_t desc; 5421b8e80941Smrg switch (inst->opcode) { 5422b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: 5423b8e80941Smrg desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size, 5424b8e80941Smrg arg, /* num_channels */ 5425b8e80941Smrg false /* write */); 
5426b8e80941Smrg break; 5427b8e80941Smrg 5428b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: 5429b8e80941Smrg desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size, 5430b8e80941Smrg arg, /* num_channels */ 5431b8e80941Smrg true /* write */); 5432b8e80941Smrg break; 5433b8e80941Smrg 5434b8e80941Smrg case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: 5435b8e80941Smrg desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size, 5436b8e80941Smrg arg, /* bit_size */ 5437b8e80941Smrg false /* write */); 5438b8e80941Smrg break; 5439b8e80941Smrg 5440b8e80941Smrg case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: 5441b8e80941Smrg desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size, 5442b8e80941Smrg arg, /* bit_size */ 5443b8e80941Smrg true /* write */); 5444b8e80941Smrg break; 5445b8e80941Smrg 5446b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: 5447b8e80941Smrg desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32, 5448b8e80941Smrg arg, /* atomic_op */ 5449b8e80941Smrg !inst->dst.is_null()); 5450b8e80941Smrg break; 5451b8e80941Smrg 5452b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: 5453b8e80941Smrg desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64, 5454b8e80941Smrg arg, /* atomic_op */ 5455b8e80941Smrg !inst->dst.is_null()); 5456b8e80941Smrg break; 5457b8e80941Smrg 5458b8e80941Smrg 5459b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL: 5460b8e80941Smrg desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size, 5461b8e80941Smrg arg, /* atomic_op */ 5462b8e80941Smrg !inst->dst.is_null()); 5463b8e80941Smrg break; 5464b8e80941Smrg 5465b8e80941Smrg default: 5466b8e80941Smrg unreachable("Unknown A64 logical instruction"); 5467b8e80941Smrg } 5468b8e80941Smrg 5469b8e80941Smrg /* Update the original instruction. 
    */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
   inst->desc = desc;
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}

/**
 * Lower FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL.  On Gen7+ this becomes
 * a SHADER_OPCODE_SEND through the sampler (SAMPLE_LD); on earlier gens it
 * becomes the MRF-based FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4 virtual
 * instruction.
 */
static void
lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const gen_device_info *devinfo = bld.shader->devinfo;

   if (devinfo->gen >= 7) {
      /* Note: inst->src[0] is consumed (via this copy) before inst->src[]
       * is rewritten below.
       */
      fs_reg index = inst->src[0];
      /* We are switching the instruction from an ALU-like instruction to a
       * send-from-grf instruction.  Since sends can't handle strides or
       * source modifiers, we have to make a copy of the offset source.
       */
      fs_reg offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.MOV(offset, inst->src[1]);

      const unsigned simd_mode =
         inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                                BRW_SAMPLER_SIMD_MODE_SIMD16;

      inst->opcode = SHADER_OPCODE_SEND;
      inst->mlen = inst->exec_size / 8;
      inst->resize_sources(3);

      inst->sfid = BRW_SFID_SAMPLER;
      inst->desc = brw_sampler_desc(devinfo, 0, 0,
                                    GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                                    simd_mode, 0);
      if (index.file == IMM) {
         /* Immediate surface index folds into the descriptor. */
         inst->desc |= index.ud & 0xff;
         inst->src[0] = brw_imm_ud(0);
      } else {
         /* Dynamic surface index becomes an indirect descriptor source. */
         const fs_builder ubld = bld.exec_all().group(1, 0);
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.AND(tmp, index, brw_imm_ud(0xff));
         inst->src[0] = component(tmp, 0);
      }
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
      inst->src[2] = offset; /* payload */
   } else {
      const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen),
                           BRW_REGISTER_TYPE_UD);

      /* The offset goes in MRF slot 1; slot 0 is reserved for the header. */
      bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);

      inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4;
      inst->resize_sources(1);
      inst->base_mrf = payload.nr;
      inst->header_size = 1;
      inst->mlen = 1 + inst->exec_size / 8;
   }
}

/**
 * Lower a Gen6-style logical math instruction into the Gen4-5 send-like MRF
 * form: the first operand stays a regular source, any second operand is
 * moved into the MRF payload.
 */
static void
lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
{
   assert(bld.shader->devinfo->gen < 6);

   inst->base_mrf = 2;
   inst->mlen = inst->sources * inst->exec_size / 8;

   if (inst->sources > 1) {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
      /* INT DIV expects the operands swapped relative to our logical order. */
      const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
      const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];

      inst->resize_sources(1);
      inst->src[0] = src0;

      assert(inst->exec_size == 8);
      bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
   }
}

/**
 * Lower all logical send-like instructions in the program to actual hardware
 * sends (or gen-specific virtual send instructions).  Returns whether any
 * instruction was changed.
 */
bool
fs_visitor::lower_logical_sends()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const fs_builder ibld(this, block, inst);

      switch (inst->opcode) {
      case FS_OPCODE_FB_WRITE_LOGICAL:
         assert(stage == MESA_SHADER_FRAGMENT);
         lower_fb_write_logical_send(ibld, inst,
                                     brw_wm_prog_data(prog_data),
                                     (const brw_wm_prog_key *)key,
                                     payload);
         break;

      case FS_OPCODE_FB_READ_LOGICAL:
         lower_fb_read_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_TEX_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
         break;

      case SHADER_OPCODE_TXD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
         break;

      case SHADER_OPCODE_TXF_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
         break;

      case SHADER_OPCODE_TXL_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
         break;

      case SHADER_OPCODE_TXS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
         break;

      case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
         lower_sampler_logical_send(ibld, inst,
                                    SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
         break;

      case FS_OPCODE_TXB_LOGICAL:
         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
         break;

      case SHADER_OPCODE_TXF_CMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
         break;

      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
         break;

      case SHADER_OPCODE_TXF_UMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
         break;

      case SHADER_OPCODE_TXF_MCS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
         break;

      case SHADER_OPCODE_LOD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
         break;

      case SHADER_OPCODE_TG4_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
         break;

      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
         break;

      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
         lower_surface_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL:
         lower_a64_logical_send(ibld, inst);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
         lower_varying_pull_constant_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
      case SHADER_OPCODE_POW:
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         /* The math opcodes are overloaded for the send-like and
          * expression-like instructions which seems kind of icky.  Gen6+ has
          * a native (but rather quirky) MATH instruction so we don't need to
          * do anything here.  On Gen4-5 we'll have to lower the Gen6-like
          * logical instructions (which we can easily recognize because they
          * have mlen = 0) into send-like virtual instructions.
          */
         if (devinfo->gen < 6 && inst->mlen == 0) {
            lower_math_logical_send(ibld, inst);
            break;

         } else {
            continue;
         }

      default:
         continue;
      }

      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/* Whether \p inst is a mixed-float instruction (HF sources) writing a packed
 * f32 destination.
 */
static bool
is_mixed_float_with_fp32_dst(const fs_inst *inst)
{
   /* This opcode sometimes uses :W type on the source even if the operand is
    * a :HF, because in gen7 there is no support for :HF, and thus it uses :W.
    */
   if (inst->opcode == BRW_OPCODE_F16TO32)
      return true;

   if (inst->dst.type != BRW_REGISTER_TYPE_F)
      return false;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_REGISTER_TYPE_HF)
         return true;
   }

   return false;
}

/* Whether \p inst is a mixed-float instruction (F sources) writing a packed
 * (stride 1) f16 destination.
 */
static bool
is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
{
   /* This opcode sometimes uses :W type on the destination even if the
    * destination is a :HF, because in gen7 there is no support for :HF, and
    * thus it uses :W.
    */
   if (inst->opcode == BRW_OPCODE_F32TO16 &&
       inst->dst.stride == 1)
      return true;

   if (inst->dst.type != BRW_REGISTER_TYPE_HF ||
       inst->dst.stride != 1)
      return false;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_REGISTER_TYPE_F)
         return true;
   }

   return false;
}

/**
 * Get the closest allowed SIMD width for instruction \p inst accounting for
 * some common regioning and execution control restrictions that apply to FPU
 * instructions.  These restrictions don't necessarily have any relevance to
 * instructions not executed by the FPU pipeline like extended math, control
 * flow or send message instructions.
 *
 * For virtual opcodes it's really up to the instruction -- In some cases
 * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
 * instructions) it may simplify virtual instruction lowering if we can
 * enforce FPU-like regioning restrictions already on the virtual instruction,
 * in other cases (e.g. virtual send-like instructions) this may be
 * excessively restrictive.
 */
static unsigned
get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
                           const fs_inst *inst)
{
   /* Maximum execution size representable in the instruction controls. */
   unsigned max_width = MIN2(32, inst->exec_size);

   /* According to the PRMs:
    *  "A. In Direct Addressing mode, a source cannot span more than 2
    *      adjacent GRF registers.
    *   B. A destination cannot span more than 2 adjacent GRF registers."
    *
    * Look for the source or destination with the largest register region
    * which is the one that is going to limit the overall execution size of
    * the instruction due to this rule.
    */
   unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);

   for (unsigned i = 0; i < inst->sources; i++)
      reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));

   /* Calculate the maximum execution size of the instruction based on the
    * factor by which it goes over the hardware limit of 2 GRFs.
    */
   if (reg_count > 2)
      max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));

   /* According to the IVB PRMs:
    *  "When destination spans two registers, the source MUST span two
    *   registers.  The exception to the above rule:
    *
    *    - When source is scalar, the source registers are not incremented.
    *    - When source is packed integer Word and destination is packed
    *      integer DWord, the source register is not incremented but the
    *      source sub register is incremented."
    *
    * The hardware specs from Gen4 to Gen7.5 mention similar regioning
    * restrictions.  The code below intentionally doesn't check whether the
    * destination type is integer because empirically the hardware doesn't
    * seem to care what the actual type is as long as it's dword-aligned.
    */
   if (devinfo->gen < 8) {
      for (unsigned i = 0; i < inst->sources; i++) {
         /* IVB implements DF scalars as <0;2,1> regions. */
         const bool is_scalar_exception = is_uniform(inst->src[i]) &&
            (devinfo->is_haswell || type_sz(inst->src[i].type) != 8);
         const bool is_packed_word_exception =
            type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
            type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;

         /* We check size_read(i) against size_written instead of REG_SIZE
          * because we want to properly handle SIMD32.  In SIMD32, you can end
          * up with writes to 4 registers and a source that reads 2 registers
          * and we may still need to lower all the way to SIMD8 in that case.
          */
         if (inst->size_written > REG_SIZE &&
             inst->size_read(i) != 0 &&
             inst->size_read(i) < inst->size_written &&
             !is_scalar_exception && !is_packed_word_exception) {
            const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
            max_width = MIN2(max_width, inst->exec_size / reg_count);
         }
      }
   }

   if (devinfo->gen < 6) {
      /* From the G45 PRM, Volume 4 Page 361:
       *
       *  "Operand Alignment Rule: With the exceptions listed below, a
       *   source/destination operand in general should be aligned to even
       *   256-bit physical register with a region size equal to two 256-bit
       *   physical registers."
       *
       * Normally we enforce this by allocating virtual registers to the
       * even-aligned class.  But we need to handle payload registers.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
             inst->size_read(i) > REG_SIZE) {
            max_width = MIN2(max_width, 8);
         }
      }
   }

   /* From the IVB PRMs:
    *  "When an instruction is SIMD32, the low 16 bits of the execution mask
    *   are applied for both halves of the SIMD32 instruction.  If different
    *   execution mask channels are required, split the instruction into two
    *   SIMD16 instructions."
    *
    * There is similar text in the HSW PRMs.  Gen4-6 don't even implement
    * 32-wide control flow support in hardware and will behave similarly.
    */
   if (devinfo->gen < 8 && !inst->force_writemask_all)
      max_width = MIN2(max_width, 16);

   /* From the IVB PRMs (applies to HSW too):
    *  "Instructions with condition modifiers must not use SIMD32."
    *
    * From the BDW PRMs (applies to later hardware too):
    *  "Ternary instruction with condition modifiers must not use SIMD32."
    */
   if (inst->conditional_mod && (devinfo->gen < 8 || inst->is_3src(devinfo)))
      max_width = MIN2(max_width, 16);

   /* From the IVB PRMs (applies to other devices that don't have the
    * gen_device_info::supports_simd16_3src flag set):
    *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
    *   SIMD8 is not allowed for DF operations."
    */
   if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
      max_width = MIN2(max_width, inst->exec_size / reg_count);

   /* Pre-Gen8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
    * the 8-bit quarter of the execution mask signals specified in the
    * instruction control fields) for the second compressed half of any
    * single-precision instruction (for double-precision instructions
    * it's hardwired to use NibCtrl+1, at least on HSW), which means that
    * the EU will apply the wrong execution controls for the second
    * sequential GRF write if the number of channels per GRF is not exactly
    * eight in single-precision mode (or four in double-float mode).
    *
    * In this situation we calculate the maximum size of the split
    * instructions so they only ever write to a single register.
    */
   if (devinfo->gen < 8 && inst->size_written > REG_SIZE &&
       !inst->force_writemask_all) {
      const unsigned channels_per_grf = inst->exec_size /
         DIV_ROUND_UP(inst->size_written, REG_SIZE);
      const unsigned exec_type_size = get_exec_type_size(inst);
      assert(exec_type_size);

      /* The hardware shifts exactly 8 channels per compressed half of the
       * instruction in single-precision mode and exactly 4 in double-precision.
       */
      if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
         max_width = MIN2(max_width, channels_per_grf);

      /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
       * because HW applies the same channel enable signals to both halves of
       * the compressed instruction which will be just wrong under
       * non-uniform control flow.
       */
      if (devinfo->gen == 7 && !devinfo->is_haswell &&
          (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
         max_width = MIN2(max_width, 4);
   }

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is f32. Instruction
    *     execution size must be no more than 8."
    *
    * FIXME: the simulator doesn't seem to complain if we don't do this and
    * empirical testing with existing CTS tests show that they pass just fine
    * without implementing this, however, since our interpretation of the PRM
    * is that conversion MOVs between HF and F are still mixed-float
    * instructions (and therefore subject to this restriction) we decided to
    * split them to be safe. Might be useful to do additional investigation to
    * lift the restriction if we can ensure that it is safe though, since these
    * conversions are common when half-float types are involved since many
    * instructions do not support HF types and conversions from/to F are
    * required.
    */
   if (is_mixed_float_with_fp32_dst(inst))
      max_width = MIN2(max_width, 8);

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is packed f16 for both
    *     Align1 and Align16."
    */
   if (is_mixed_float_with_packed_fp16_dst(inst))
      max_width = MIN2(max_width, 8);

   /* Only power-of-two execution sizes are representable in the instruction
    * control fields.
    */
   return 1 << _mesa_logbase2(max_width);
}

/**
 * Get the maximum allowed SIMD width for instruction \p inst accounting for
 * various payload size restrictions that apply to sampler message
 * instructions.
5958b8e80941Smrg * 5959b8e80941Smrg * This is only intended to provide a maximum theoretical bound for the 5960b8e80941Smrg * execution size of the message based on the number of argument components 5961b8e80941Smrg * alone, which in most cases will determine whether the SIMD8 or SIMD16 5962b8e80941Smrg * variant of the message can be used, though some messages may have 5963b8e80941Smrg * additional restrictions not accounted for here (e.g. pre-ILK hardware uses 5964b8e80941Smrg * the message length to determine the exact SIMD width and argument count, 5965b8e80941Smrg * which makes a number of sampler message combinations impossible to 5966b8e80941Smrg * represent). 5967b8e80941Smrg */ 5968b8e80941Smrgstatic unsigned 5969b8e80941Smrgget_sampler_lowered_simd_width(const struct gen_device_info *devinfo, 5970b8e80941Smrg const fs_inst *inst) 5971b8e80941Smrg{ 5972b8e80941Smrg /* If we have a min_lod parameter on anything other than a simple sample 5973b8e80941Smrg * message, it will push it over 5 arguments and we have to fall back to 5974b8e80941Smrg * SIMD8. 5975b8e80941Smrg */ 5976b8e80941Smrg if (inst->opcode != SHADER_OPCODE_TEX && 5977b8e80941Smrg inst->components_read(TEX_LOGICAL_SRC_MIN_LOD)) 5978b8e80941Smrg return 8; 5979b8e80941Smrg 5980b8e80941Smrg /* Calculate the number of coordinate components that have to be present 5981b8e80941Smrg * assuming that additional arguments follow the texel coordinates in the 5982b8e80941Smrg * message payload. On IVB+ there is no need for padding, on ILK-SNB we 5983b8e80941Smrg * need to pad to four or three components depending on the message, 5984b8e80941Smrg * pre-ILK we need to pad to at most three components. 5985b8e80941Smrg */ 5986b8e80941Smrg const unsigned req_coord_components = 5987b8e80941Smrg (devinfo->gen >= 7 || 5988b8e80941Smrg !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 
0 : 5989b8e80941Smrg (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL && 5990b8e80941Smrg inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 : 5991b8e80941Smrg 3; 5992b8e80941Smrg 5993b8e80941Smrg /* On Gen9+ the LOD argument is for free if we're able to use the LZ 5994b8e80941Smrg * variant of the TXL or TXF message. 5995b8e80941Smrg */ 5996b8e80941Smrg const bool implicit_lod = devinfo->gen >= 9 && 5997b8e80941Smrg (inst->opcode == SHADER_OPCODE_TXL || 5998b8e80941Smrg inst->opcode == SHADER_OPCODE_TXF) && 5999b8e80941Smrg inst->src[TEX_LOGICAL_SRC_LOD].is_zero(); 6000b8e80941Smrg 6001b8e80941Smrg /* Calculate the total number of argument components that need to be passed 6002b8e80941Smrg * to the sampler unit. 6003b8e80941Smrg */ 6004b8e80941Smrg const unsigned num_payload_components = 6005b8e80941Smrg MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE), 6006b8e80941Smrg req_coord_components) + 6007b8e80941Smrg inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) + 6008b8e80941Smrg (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) + 6009b8e80941Smrg inst->components_read(TEX_LOGICAL_SRC_LOD2) + 6010b8e80941Smrg inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) + 6011b8e80941Smrg (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ? 6012b8e80941Smrg inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) + 6013b8e80941Smrg inst->components_read(TEX_LOGICAL_SRC_MCS); 6014b8e80941Smrg 6015b8e80941Smrg /* SIMD16 messages with more than five arguments exceed the maximum message 6016b8e80941Smrg * size supported by the sampler, regardless of whether a header is 6017b8e80941Smrg * provided or not. 6018b8e80941Smrg */ 6019b8e80941Smrg return MIN2(inst->exec_size, 6020b8e80941Smrg num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16); 6021b8e80941Smrg} 6022b8e80941Smrg 6023b8e80941Smrg/** 6024b8e80941Smrg * Get the closest native SIMD width supported by the hardware for instruction 6025b8e80941Smrg * \p inst. 
The instruction will be left untouched by 6026b8e80941Smrg * fs_visitor::lower_simd_width() if the returned value is equal to the 6027b8e80941Smrg * original execution size. 6028b8e80941Smrg */ 6029b8e80941Smrgstatic unsigned 6030b8e80941Smrgget_lowered_simd_width(const struct gen_device_info *devinfo, 6031b8e80941Smrg const fs_inst *inst) 6032b8e80941Smrg{ 6033b8e80941Smrg switch (inst->opcode) { 6034b8e80941Smrg case BRW_OPCODE_MOV: 6035b8e80941Smrg case BRW_OPCODE_SEL: 6036b8e80941Smrg case BRW_OPCODE_NOT: 6037b8e80941Smrg case BRW_OPCODE_AND: 6038b8e80941Smrg case BRW_OPCODE_OR: 6039b8e80941Smrg case BRW_OPCODE_XOR: 6040b8e80941Smrg case BRW_OPCODE_SHR: 6041b8e80941Smrg case BRW_OPCODE_SHL: 6042b8e80941Smrg case BRW_OPCODE_ASR: 6043b8e80941Smrg case BRW_OPCODE_CMPN: 6044b8e80941Smrg case BRW_OPCODE_CSEL: 6045b8e80941Smrg case BRW_OPCODE_F32TO16: 6046b8e80941Smrg case BRW_OPCODE_F16TO32: 6047b8e80941Smrg case BRW_OPCODE_BFREV: 6048b8e80941Smrg case BRW_OPCODE_BFE: 6049b8e80941Smrg case BRW_OPCODE_ADD: 6050b8e80941Smrg case BRW_OPCODE_MUL: 6051b8e80941Smrg case BRW_OPCODE_AVG: 6052b8e80941Smrg case BRW_OPCODE_FRC: 6053b8e80941Smrg case BRW_OPCODE_RNDU: 6054b8e80941Smrg case BRW_OPCODE_RNDD: 6055b8e80941Smrg case BRW_OPCODE_RNDE: 6056b8e80941Smrg case BRW_OPCODE_RNDZ: 6057b8e80941Smrg case BRW_OPCODE_LZD: 6058b8e80941Smrg case BRW_OPCODE_FBH: 6059b8e80941Smrg case BRW_OPCODE_FBL: 6060b8e80941Smrg case BRW_OPCODE_CBIT: 6061b8e80941Smrg case BRW_OPCODE_SAD2: 6062b8e80941Smrg case BRW_OPCODE_MAD: 6063b8e80941Smrg case BRW_OPCODE_LRP: 6064b8e80941Smrg case FS_OPCODE_PACK: 6065b8e80941Smrg case SHADER_OPCODE_SEL_EXEC: 6066b8e80941Smrg case SHADER_OPCODE_CLUSTER_BROADCAST: 6067b8e80941Smrg return get_fpu_lowered_simd_width(devinfo, inst); 6068b8e80941Smrg 6069b8e80941Smrg case BRW_OPCODE_CMP: { 6070b8e80941Smrg /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that 6071b8e80941Smrg * when the destination is a GRF the dependency-clear bit on the flag 
6072b8e80941Smrg * register is cleared early. 6073b8e80941Smrg * 6074b8e80941Smrg * Suggested workarounds are to disable coissuing CMP instructions 6075b8e80941Smrg * or to split CMP(16) instructions into two CMP(8) instructions. 6076b8e80941Smrg * 6077b8e80941Smrg * We choose to split into CMP(8) instructions since disabling 6078b8e80941Smrg * coissuing would affect CMP instructions not otherwise affected by 6079b8e80941Smrg * the errata. 6080b8e80941Smrg */ 6081b8e80941Smrg const unsigned max_width = (devinfo->gen == 7 && !devinfo->is_haswell && 6082b8e80941Smrg !inst->dst.is_null() ? 8 : ~0); 6083b8e80941Smrg return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst)); 6084b8e80941Smrg } 6085b8e80941Smrg case BRW_OPCODE_BFI1: 6086b8e80941Smrg case BRW_OPCODE_BFI2: 6087b8e80941Smrg /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we 6088b8e80941Smrg * should 6089b8e80941Smrg * "Force BFI instructions to be executed always in SIMD8." 6090b8e80941Smrg */ 6091b8e80941Smrg return MIN2(devinfo->is_haswell ? 8 : ~0u, 6092b8e80941Smrg get_fpu_lowered_simd_width(devinfo, inst)); 6093b8e80941Smrg 6094b8e80941Smrg case BRW_OPCODE_IF: 6095b8e80941Smrg assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16); 6096b8e80941Smrg return inst->exec_size; 6097b8e80941Smrg 6098b8e80941Smrg case SHADER_OPCODE_RCP: 6099b8e80941Smrg case SHADER_OPCODE_RSQ: 6100b8e80941Smrg case SHADER_OPCODE_SQRT: 6101b8e80941Smrg case SHADER_OPCODE_EXP2: 6102b8e80941Smrg case SHADER_OPCODE_LOG2: 6103b8e80941Smrg case SHADER_OPCODE_SIN: 6104b8e80941Smrg case SHADER_OPCODE_COS: { 6105b8e80941Smrg /* Unary extended math instructions are limited to SIMD8 on Gen4 and 6106b8e80941Smrg * Gen6. Extended Math Function is limited to SIMD8 with half-float. 
6107b8e80941Smrg */ 6108b8e80941Smrg if (devinfo->gen == 6 || (devinfo->gen == 4 && !devinfo->is_g4x)) 6109b8e80941Smrg return MIN2(8, inst->exec_size); 6110b8e80941Smrg if (inst->dst.type == BRW_REGISTER_TYPE_HF) 6111b8e80941Smrg return MIN2(8, inst->exec_size); 6112b8e80941Smrg return MIN2(16, inst->exec_size); 6113b8e80941Smrg } 6114b8e80941Smrg 6115b8e80941Smrg case SHADER_OPCODE_POW: { 6116b8e80941Smrg /* SIMD16 is only allowed on Gen7+. Extended Math Function is limited 6117b8e80941Smrg * to SIMD8 with half-float 6118b8e80941Smrg */ 6119b8e80941Smrg if (devinfo->gen < 7) 6120b8e80941Smrg return MIN2(8, inst->exec_size); 6121b8e80941Smrg if (inst->dst.type == BRW_REGISTER_TYPE_HF) 6122b8e80941Smrg return MIN2(8, inst->exec_size); 6123b8e80941Smrg return MIN2(16, inst->exec_size); 6124b8e80941Smrg } 6125b8e80941Smrg 6126b8e80941Smrg case SHADER_OPCODE_INT_QUOTIENT: 6127b8e80941Smrg case SHADER_OPCODE_INT_REMAINDER: 6128b8e80941Smrg /* Integer division is limited to SIMD8 on all generations. */ 6129b8e80941Smrg return MIN2(8, inst->exec_size); 6130b8e80941Smrg 6131b8e80941Smrg case FS_OPCODE_LINTERP: 6132b8e80941Smrg case SHADER_OPCODE_GET_BUFFER_SIZE: 6133b8e80941Smrg case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 6134b8e80941Smrg case FS_OPCODE_PACK_HALF_2x16_SPLIT: 6135b8e80941Smrg case FS_OPCODE_INTERPOLATE_AT_SAMPLE: 6136b8e80941Smrg case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: 6137b8e80941Smrg case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: 6138b8e80941Smrg return MIN2(16, inst->exec_size); 6139b8e80941Smrg 6140b8e80941Smrg case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL: 6141b8e80941Smrg /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch 6142b8e80941Smrg * message used to implement varying pull constant loads, so expand it 6143b8e80941Smrg * to SIMD16. 
An alternative with longer message payload length but 6144b8e80941Smrg * shorter return payload would be to use the SIMD8 sampler message that 6145b8e80941Smrg * takes (header, u, v, r) as parameters instead of (header, u). 6146b8e80941Smrg */ 6147b8e80941Smrg return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size)); 6148b8e80941Smrg 6149b8e80941Smrg case FS_OPCODE_DDX_COARSE: 6150b8e80941Smrg case FS_OPCODE_DDX_FINE: 6151b8e80941Smrg case FS_OPCODE_DDY_COARSE: 6152b8e80941Smrg case FS_OPCODE_DDY_FINE: 6153b8e80941Smrg /* The implementation of this virtual opcode may require emitting 6154b8e80941Smrg * compressed Align16 instructions, which are severely limited on some 6155b8e80941Smrg * generations. 6156b8e80941Smrg * 6157b8e80941Smrg * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register 6158b8e80941Smrg * Region Restrictions): 6159b8e80941Smrg * 6160b8e80941Smrg * "In Align16 access mode, SIMD16 is not allowed for DW operations 6161b8e80941Smrg * and SIMD8 is not allowed for DF operations." 6162b8e80941Smrg * 6163b8e80941Smrg * In this context, "DW operations" means "operations acting on 32-bit 6164b8e80941Smrg * values", so it includes operations on floats. 6165b8e80941Smrg * 6166b8e80941Smrg * Gen4 has a similar restriction. From the i965 PRM, section 11.5.3 6167b8e80941Smrg * (Instruction Compression -> Rules and Restrictions): 6168b8e80941Smrg * 6169b8e80941Smrg * "A compressed instruction must be in Align1 access mode. Align16 6170b8e80941Smrg * mode instructions cannot be compressed." 6171b8e80941Smrg * 6172b8e80941Smrg * Similar text exists in the g45 PRM. 6173b8e80941Smrg * 6174b8e80941Smrg * Empirically, compressed align16 instructions using odd register 6175b8e80941Smrg * numbers don't appear to work on Sandybridge either. 6176b8e80941Smrg */ 6177b8e80941Smrg return (devinfo->gen == 4 || devinfo->gen == 6 || 6178b8e80941Smrg (devinfo->gen == 7 && !devinfo->is_haswell) ? 
6179b8e80941Smrg MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size)); 6180b8e80941Smrg 6181b8e80941Smrg case SHADER_OPCODE_MULH: 6182b8e80941Smrg /* MULH is lowered to the MUL/MACH sequence using the accumulator, which 6183b8e80941Smrg * is 8-wide on Gen7+. 6184b8e80941Smrg */ 6185b8e80941Smrg return (devinfo->gen >= 7 ? 8 : 6186b8e80941Smrg get_fpu_lowered_simd_width(devinfo, inst)); 6187b8e80941Smrg 6188b8e80941Smrg case FS_OPCODE_FB_WRITE_LOGICAL: 6189b8e80941Smrg /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them 6190b8e80941Smrg * here. 6191b8e80941Smrg */ 6192b8e80941Smrg assert(devinfo->gen != 6 || 6193b8e80941Smrg inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE || 6194b8e80941Smrg inst->exec_size == 8); 6195b8e80941Smrg /* Dual-source FB writes are unsupported in SIMD16 mode. */ 6196b8e80941Smrg return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ? 6197b8e80941Smrg 8 : MIN2(16, inst->exec_size)); 6198b8e80941Smrg 6199b8e80941Smrg case FS_OPCODE_FB_READ_LOGICAL: 6200b8e80941Smrg return MIN2(16, inst->exec_size); 6201b8e80941Smrg 6202b8e80941Smrg case SHADER_OPCODE_TEX_LOGICAL: 6203b8e80941Smrg case SHADER_OPCODE_TXF_CMS_LOGICAL: 6204b8e80941Smrg case SHADER_OPCODE_TXF_UMS_LOGICAL: 6205b8e80941Smrg case SHADER_OPCODE_TXF_MCS_LOGICAL: 6206b8e80941Smrg case SHADER_OPCODE_LOD_LOGICAL: 6207b8e80941Smrg case SHADER_OPCODE_TG4_LOGICAL: 6208b8e80941Smrg case SHADER_OPCODE_SAMPLEINFO_LOGICAL: 6209b8e80941Smrg case SHADER_OPCODE_TXF_CMS_W_LOGICAL: 6210b8e80941Smrg case SHADER_OPCODE_TG4_OFFSET_LOGICAL: 6211b8e80941Smrg return get_sampler_lowered_simd_width(devinfo, inst); 6212b8e80941Smrg 6213b8e80941Smrg case SHADER_OPCODE_TXD_LOGICAL: 6214b8e80941Smrg /* TXD is unsupported in SIMD16 mode. 
*/ 6215b8e80941Smrg return 8; 6216b8e80941Smrg 6217b8e80941Smrg case SHADER_OPCODE_TXL_LOGICAL: 6218b8e80941Smrg case FS_OPCODE_TXB_LOGICAL: 6219b8e80941Smrg /* Only one execution size is representable pre-ILK depending on whether 6220b8e80941Smrg * the shadow reference argument is present. 6221b8e80941Smrg */ 6222b8e80941Smrg if (devinfo->gen == 4) 6223b8e80941Smrg return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8; 6224b8e80941Smrg else 6225b8e80941Smrg return get_sampler_lowered_simd_width(devinfo, inst); 6226b8e80941Smrg 6227b8e80941Smrg case SHADER_OPCODE_TXF_LOGICAL: 6228b8e80941Smrg case SHADER_OPCODE_TXS_LOGICAL: 6229b8e80941Smrg /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD 6230b8e80941Smrg * messages. Use SIMD16 instead. 6231b8e80941Smrg */ 6232b8e80941Smrg if (devinfo->gen == 4) 6233b8e80941Smrg return 16; 6234b8e80941Smrg else 6235b8e80941Smrg return get_sampler_lowered_simd_width(devinfo, inst); 6236b8e80941Smrg 6237b8e80941Smrg case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: 6238b8e80941Smrg case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL: 6239b8e80941Smrg case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL: 6240b8e80941Smrg return 8; 6241b8e80941Smrg 6242b8e80941Smrg case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL: 6243b8e80941Smrg case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: 6244b8e80941Smrg case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL: 6245b8e80941Smrg case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL: 6246b8e80941Smrg case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL: 6247b8e80941Smrg case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL: 6248b8e80941Smrg return MIN2(16, inst->exec_size); 6249b8e80941Smrg 6250b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL: 6251b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL: 6252b8e80941Smrg case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL: 6253b8e80941Smrg case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL: 6254b8e80941Smrg return devinfo->gen <= 8 ? 
8 : MIN2(16, inst->exec_size); 6255b8e80941Smrg 6256b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL: 6257b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: 6258b8e80941Smrg case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL: 6259b8e80941Smrg return 8; 6260b8e80941Smrg 6261b8e80941Smrg case SHADER_OPCODE_URB_READ_SIMD8: 6262b8e80941Smrg case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: 6263b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8: 6264b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: 6265b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: 6266b8e80941Smrg case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: 6267b8e80941Smrg return MIN2(8, inst->exec_size); 6268b8e80941Smrg 6269b8e80941Smrg case SHADER_OPCODE_QUAD_SWIZZLE: { 6270b8e80941Smrg const unsigned swiz = inst->src[1].ud; 6271b8e80941Smrg return (is_uniform(inst->src[0]) ? 6272b8e80941Smrg get_fpu_lowered_simd_width(devinfo, inst) : 6273b8e80941Smrg devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? 8 : 6274b8e80941Smrg swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 : 6275b8e80941Smrg get_fpu_lowered_simd_width(devinfo, inst)); 6276b8e80941Smrg } 6277b8e80941Smrg case SHADER_OPCODE_MOV_INDIRECT: { 6278b8e80941Smrg /* From IVB and HSW PRMs: 6279b8e80941Smrg * 6280b8e80941Smrg * "2.When the destination requires two registers and the sources are 6281b8e80941Smrg * indirect, the sources must use 1x1 regioning mode. 6282b8e80941Smrg * 6283b8e80941Smrg * In case of DF instructions in HSW/IVB, the exec_size is limited by 6284b8e80941Smrg * the EU decompression logic not handling VxH indirect addressing 6285b8e80941Smrg * correctly. 6286b8e80941Smrg */ 6287b8e80941Smrg const unsigned max_size = (devinfo->gen >= 8 ? 2 : 1) * REG_SIZE; 6288b8e80941Smrg /* Prior to Broadwell, we only have 8 address subregisters. */ 6289b8e80941Smrg return MIN3(devinfo->gen >= 8 ? 
16 : 8, 6290b8e80941Smrg max_size / (inst->dst.stride * type_sz(inst->dst.type)), 6291b8e80941Smrg inst->exec_size); 6292b8e80941Smrg } 6293b8e80941Smrg 6294b8e80941Smrg case SHADER_OPCODE_LOAD_PAYLOAD: { 6295b8e80941Smrg const unsigned reg_count = 6296b8e80941Smrg DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE); 6297b8e80941Smrg 6298b8e80941Smrg if (reg_count > 2) { 6299b8e80941Smrg /* Only LOAD_PAYLOAD instructions with per-channel destination region 6300b8e80941Smrg * can be easily lowered (which excludes headers and heterogeneous 6301b8e80941Smrg * types). 6302b8e80941Smrg */ 6303b8e80941Smrg assert(!inst->header_size); 6304b8e80941Smrg for (unsigned i = 0; i < inst->sources; i++) 6305b8e80941Smrg assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) || 6306b8e80941Smrg inst->src[i].file == BAD_FILE); 6307b8e80941Smrg 6308b8e80941Smrg return inst->exec_size / DIV_ROUND_UP(reg_count, 2); 6309b8e80941Smrg } else { 6310b8e80941Smrg return inst->exec_size; 6311b8e80941Smrg } 6312b8e80941Smrg } 6313b8e80941Smrg default: 6314b8e80941Smrg return inst->exec_size; 6315b8e80941Smrg } 6316b8e80941Smrg} 6317b8e80941Smrg 6318b8e80941Smrg/** 6319b8e80941Smrg * Return true if splitting out the group of channels of instruction \p inst 6320b8e80941Smrg * given by lbld.group() requires allocating a temporary for the i-th source 6321b8e80941Smrg * of the lowered instruction. 
6322b8e80941Smrg */ 6323b8e80941Smrgstatic inline bool 6324b8e80941Smrgneeds_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i) 6325b8e80941Smrg{ 6326b8e80941Smrg return !(is_periodic(inst->src[i], lbld.dispatch_width()) || 6327b8e80941Smrg (inst->components_read(i) == 1 && 6328b8e80941Smrg lbld.dispatch_width() <= inst->exec_size)) || 6329b8e80941Smrg (inst->flags_written() & 6330b8e80941Smrg flag_mask(inst->src[i], type_sz(inst->src[i].type))); 6331b8e80941Smrg} 6332b8e80941Smrg 6333b8e80941Smrg/** 6334b8e80941Smrg * Extract the data that would be consumed by the channel group given by 6335b8e80941Smrg * lbld.group() from the i-th source region of instruction \p inst and return 6336b8e80941Smrg * it as result in packed form. 6337b8e80941Smrg */ 6338b8e80941Smrgstatic fs_reg 6339b8e80941Smrgemit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i) 6340b8e80941Smrg{ 6341b8e80941Smrg assert(lbld.group() >= inst->group); 6342b8e80941Smrg 6343b8e80941Smrg /* Specified channel group from the source region. */ 6344b8e80941Smrg const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group); 6345b8e80941Smrg 6346b8e80941Smrg if (needs_src_copy(lbld, inst, i)) { 6347b8e80941Smrg /* Builder of the right width to perform the copy avoiding uninitialized 6348b8e80941Smrg * data if the lowered execution size is greater than the original 6349b8e80941Smrg * execution size of the instruction. 
6350b8e80941Smrg */ 6351b8e80941Smrg const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(), 6352b8e80941Smrg inst->exec_size), 0); 6353b8e80941Smrg const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i)); 6354b8e80941Smrg 6355b8e80941Smrg for (unsigned k = 0; k < inst->components_read(i); ++k) 6356b8e80941Smrg cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k)); 6357b8e80941Smrg 6358b8e80941Smrg return tmp; 6359b8e80941Smrg 6360b8e80941Smrg } else if (is_periodic(inst->src[i], lbld.dispatch_width())) { 6361b8e80941Smrg /* The source is invariant for all dispatch_width-wide groups of the 6362b8e80941Smrg * original region. 6363b8e80941Smrg */ 6364b8e80941Smrg return inst->src[i]; 6365b8e80941Smrg 6366b8e80941Smrg } else { 6367b8e80941Smrg /* We can just point the lowered instruction at the right channel group 6368b8e80941Smrg * from the original region. 6369b8e80941Smrg */ 6370b8e80941Smrg return src; 6371b8e80941Smrg } 6372b8e80941Smrg} 6373b8e80941Smrg 6374b8e80941Smrg/** 6375b8e80941Smrg * Return true if splitting out the group of channels of instruction \p inst 6376b8e80941Smrg * given by lbld.group() requires allocating a temporary for the destination 6377b8e80941Smrg * of the lowered instruction and copying the data back to the original 6378b8e80941Smrg * destination region. 6379b8e80941Smrg */ 6380b8e80941Smrgstatic inline bool 6381b8e80941Smrgneeds_dst_copy(const fs_builder &lbld, const fs_inst *inst) 6382b8e80941Smrg{ 6383b8e80941Smrg /* If the instruction writes more than one component we'll have to shuffle 6384b8e80941Smrg * the results of multiple lowered instructions in order to make sure that 6385b8e80941Smrg * they end up arranged correctly in the original destination region. 
6386b8e80941Smrg */ 6387b8e80941Smrg if (inst->size_written > inst->dst.component_size(inst->exec_size)) 6388b8e80941Smrg return true; 6389b8e80941Smrg 6390b8e80941Smrg /* If the lowered execution size is larger than the original the result of 6391b8e80941Smrg * the instruction won't fit in the original destination, so we'll have to 6392b8e80941Smrg * allocate a temporary in any case. 6393b8e80941Smrg */ 6394b8e80941Smrg if (lbld.dispatch_width() > inst->exec_size) 6395b8e80941Smrg return true; 6396b8e80941Smrg 6397b8e80941Smrg for (unsigned i = 0; i < inst->sources; i++) { 6398b8e80941Smrg /* If we already made a copy of the source for other reasons there won't 6399b8e80941Smrg * be any overlap with the destination. 6400b8e80941Smrg */ 6401b8e80941Smrg if (needs_src_copy(lbld, inst, i)) 6402b8e80941Smrg continue; 6403b8e80941Smrg 6404b8e80941Smrg /* In order to keep the logic simple we emit a copy whenever the 6405b8e80941Smrg * destination region doesn't exactly match an overlapping source, which 6406b8e80941Smrg * may point at the source and destination not being aligned group by 6407b8e80941Smrg * group which could cause one of the lowered instructions to overwrite 6408b8e80941Smrg * the data read from the same source by other lowered instructions. 6409b8e80941Smrg */ 6410b8e80941Smrg if (regions_overlap(inst->dst, inst->size_written, 6411b8e80941Smrg inst->src[i], inst->size_read(i)) && 6412b8e80941Smrg !inst->dst.equals(inst->src[i])) 6413b8e80941Smrg return true; 6414b8e80941Smrg } 6415b8e80941Smrg 6416b8e80941Smrg return false; 6417b8e80941Smrg} 6418b8e80941Smrg 6419b8e80941Smrg/** 6420b8e80941Smrg * Insert data from a packed temporary into the channel group given by 6421b8e80941Smrg * lbld.group() of the destination region of instruction \p inst and return 6422b8e80941Smrg * the temporary as result. 
Any copy instructions that are required for 6423b8e80941Smrg * unzipping the previous value (in the case of partial writes) will be 6424b8e80941Smrg * inserted using \p lbld_before and any copy instructions required for 6425b8e80941Smrg * zipping up the destination of \p inst will be inserted using \p lbld_after. 6426b8e80941Smrg */ 6427b8e80941Smrgstatic fs_reg 6428b8e80941Smrgemit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after, 6429b8e80941Smrg fs_inst *inst) 6430b8e80941Smrg{ 6431b8e80941Smrg assert(lbld_before.dispatch_width() == lbld_after.dispatch_width()); 6432b8e80941Smrg assert(lbld_before.group() == lbld_after.group()); 6433b8e80941Smrg assert(lbld_after.group() >= inst->group); 6434b8e80941Smrg 6435b8e80941Smrg /* Specified channel group from the destination region. */ 6436b8e80941Smrg const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group); 6437b8e80941Smrg const unsigned dst_size = inst->size_written / 6438b8e80941Smrg inst->dst.component_size(inst->exec_size); 6439b8e80941Smrg 6440b8e80941Smrg if (needs_dst_copy(lbld_after, inst)) { 6441b8e80941Smrg const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size); 6442b8e80941Smrg 6443b8e80941Smrg if (inst->predicate) { 6444b8e80941Smrg /* Handle predication by copying the original contents of 6445b8e80941Smrg * the destination into the temporary before emitting the 6446b8e80941Smrg * lowered instruction. 
          */
         const fs_builder gbld_before =
            lbld_before.group(MIN2(lbld_before.dispatch_width(),
                                   inst->exec_size), 0);
         for (unsigned k = 0; k < dst_size; ++k) {
            gbld_before.MOV(offset(tmp, lbld_before, k),
                            offset(dst, inst->exec_size, k));
         }
      }

      const fs_builder gbld_after =
         lbld_after.group(MIN2(lbld_after.dispatch_width(),
                               inst->exec_size), 0);
      for (unsigned k = 0; k < dst_size; ++k) {
         /* Use a builder of the right width to perform the copy avoiding
          * uninitialized data if the lowered execution size is greater than
          * the original execution size of the instruction.
          */
         gbld_after.MOV(offset(dst, inst->exec_size, k),
                        offset(tmp, lbld_after, k));
      }

      return tmp;

   } else {
      /* No need to allocate a temporary for the lowered instruction, just
       * take the right group of channels from the original region.
       */
      return dst;
   }
}

/**
 * Split any instruction whose execution size is wider than the hardware can
 * handle for its opcode (as reported by get_lowered_simd_width()) into
 * several copies of itself, each operating on one channel group of the
 * lowered width.  Sources are unzipped before and the destination is zipped
 * back together after the split instructions.
 *
 * Returns true if any instruction was split, in which case live intervals
 * are invalidated.
 */
bool
fs_visitor::lower_simd_width()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const unsigned lower_width = get_lowered_simd_width(devinfo, inst);

      if (lower_width != inst->exec_size) {
         /* Builder matching the original instruction.  We may also need to
          * emit an instruction of width larger than the original, set the
          * execution size of the builder to the highest of both for now so
          * we're sure that both cases can be handled.
          */
         const unsigned max_width = MAX2(inst->exec_size, lower_width);
         const fs_builder ibld = bld.at(block, inst)
                                    .exec_all(inst->force_writemask_all)
                                    .group(max_width, inst->group / max_width);

         /* Split the copies in chunks of the execution width of either the
          * original or the lowered instruction, whichever is lower.
          */
         const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
         const unsigned dst_size = inst->size_written /
            inst->dst.component_size(inst->exec_size);

         /* Accumulator writes and message payloads cannot be split safely
          * here; earlier lowering is expected to have removed them.
          */
         assert(!inst->writes_accumulator && !inst->mlen);

         /* Inserting the zip, unzip, and duplicated instructions in all of
          * the right spots is somewhat tricky.  All of the unzip and any
          * instructions from the zip which unzip the destination prior to
          * writing need to happen before all of the per-group instructions
          * and the zip instructions need to happen after.  In order to sort
          * this all out, we insert the unzip instructions before \p inst,
          * insert the per-group instructions after \p inst (i.e. before
          * inst->next), and insert the zip instructions before the
          * instruction after \p inst.  Since we are inserting instructions
          * after \p inst, inst->next is a moving target and we need to save
          * it off here so that we insert the zip instructions in the right
          * place.
          *
          * Since we're inserting split instructions after after_inst, the
          * instructions will end up in the reverse order that we insert them.
          * However, certain render target writes require that the low group
          * instructions come before the high group.  From the Ivy Bridge PRM
          * Vol. 4, Pt. 1, Section 3.9.11:
          *
          *    "If multiple SIMD8 Dual Source messages are delivered by the
          *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
          *    issued before the SIMD8_DUALSRC_HI message with the same Slot
          *    Group Select setting."
          *
          * And, from Section 3.9.11.1 of the same PRM:
          *
          *    "When SIMD32 or SIMD16 PS threads send render target writes
          *    with multiple SIMD8 and SIMD16 messages, the following must
          *    hold:
          *
          *    All the slots (as described above) must have a corresponding
          *    render target write irrespective of the slot's validity.  A slot
          *    is considered valid when at least one sample is enabled.  For
          *    example, a SIMD16 PS thread must send two SIMD8 render target
          *    writes to cover all the slots.
          *
          *    PS thread must send SIMD render target write messages with
          *    increasing slot numbers.  For example, SIMD16 thread has
          *    Slot[15:0] and if two SIMD8 render target writes are used, the
          *    first SIMD8 render target write must send Slot[7:0] and the
          *    next one must send Slot[15:8]."
          *
          * In order to make low group instructions come before high group
          * instructions (this is required for some render target writes), we
          * split from the highest group to lowest.
          */
         exec_node *const after_inst = inst->next;
         for (int i = n - 1; i >= 0; i--) {
            /* Emit a copy of the original instruction with the lowered width.
             * If the EOT flag was set throw it away except for the last
             * instruction to avoid killing the thread prematurely.
             */
            fs_inst split_inst = *inst;
            split_inst.exec_size = lower_width;
            split_inst.eot = inst->eot && i == int(n - 1);

            /* Select the correct channel enables for the i-th group, then
             * transform the sources and destination and emit the lowered
             * instruction.
             */
            const fs_builder lbld = ibld.group(lower_width, i);

            for (unsigned j = 0; j < inst->sources; j++)
               split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);

            split_inst.dst = emit_zip(lbld.at(block, inst),
                                      lbld.at(block, after_inst), inst);
            split_inst.size_written =
               split_inst.dst.component_size(lower_width) * dst_size;

            lbld.at(block, inst->next).emit(split_inst);
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

void
fs_visitor::dump_instructions()
{
   /* Convenience overload: dump to stderr. */
   dump_instructions(NULL);
}

/**
 * Dump the whole instruction stream, annotated with register pressure when
 * a CFG is available.  Output goes to the file named \p name, or to stderr
 * if \p name is NULL, the file cannot be opened, or we are running as root
 * (refuse to create files with root privileges).
 */
void
fs_visitor::dump_instructions(const char *name)
{
   FILE *file = stderr;
   if (name && geteuid() != 0) {
      file = fopen(name, "w");
      if (!file)
         file = stderr;
   }

   if (cfg) {
      /* With a CFG we can compute per-IP register pressure and print it in
       * front of every instruction.
       */
      calculate_register_pressure();
      int ip = 0, max_pressure = 0;
      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
         max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
         fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
         dump_instruction(inst, file);
         ip++;
      }
      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
   } else {
      /* No CFG yet: just number the raw instruction list. */
      int ip = 0;
      foreach_in_list(backend_instruction, inst, &instructions) {
         fprintf(file, "%4d: ", ip++);
         dump_instruction(inst, file);
      }
   }

   if (file != stderr) {
      fclose(file);
   }
}

void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   /* Convenience overload: dump to stderr. */
   dump_instruction(be_inst, stderr);
}

/**
 * Print a single IR instruction to \p file in a human-readable form:
 * predicate, opcode with modifiers, execution size, message lengths,
 * destination and sources with their register files, offsets, strides and
 * types, followed by execution-control annotations.
 */
void
fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   fs_inst *inst = (fs_inst *)be_inst;

   /* Predicate prefix, e.g. "(+f0.1) " or "(-f0.0) ". */
   if (inst->predicate) {
      fprintf(file, "(%cf%d.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg / 2,
              inst->flag_subreg % 2);
   }

   fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      /* Print the flag register for the conditional mod unless it is
       * implied (predicated, or gen5+ flow-control/SEL-style opcodes that
       * consume the condition directly).
       */
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_CSEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
                 inst->flag_subreg % 2);
      }
   }
   fprintf(file, "(%d) ", inst->exec_size);

   if (inst->mlen) {
      fprintf(file, "(mlen: %d) ", inst->mlen);
   }

   if (inst->ex_mlen) {
      fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
   }

   if (inst->eot) {
      fprintf(file, "(EOT) ");
   }

   /* Destination register. */
   switch (inst->dst.file) {
   case VGRF:
      fprintf(file, "vgrf%d", inst->dst.nr);
      break;
   case FIXED_GRF:
      fprintf(file, "g%d", inst->dst.nr);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.nr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case UNIFORM:
      /* Uniforms and attributes are not legal destinations; make them
       * stand out visually if they ever show up here.
       */
      fprintf(file, "***u%d***", inst->dst.nr);
      break;
   case ATTR:
      fprintf(file, "***attr%d***", inst->dst.nr);
      break;

   case ARF:
      switch (inst->dst.nr) {
      case BRW_ARF_NULL:
         fprintf(file, "null");
         break;
      case BRW_ARF_ADDRESS:
         fprintf(file, "a0.%d", inst->dst.subnr);
         break;
      case BRW_ARF_ACCUMULATOR:
         fprintf(file, "acc%d", inst->dst.subnr);
         break;
      case BRW_ARF_FLAG:
         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      default:
         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      }
      break;
   case IMM:
      unreachable("not reached");
   }

   /* Print a sub-register offset when there is one, or when the VGRF is
    * larger than what this instruction writes (partial write).
    */
   if (inst->dst.offset ||
       (inst->dst.file == VGRF &&
        alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
      const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
      fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
              inst->dst.offset % reg_size);
   }

   if (inst->dst.stride != 1)
      fprintf(file, "<%u>", inst->dst.stride);
   fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));

   /* Sources, comma-separated, with negate/abs decorations. */
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case VGRF:
         fprintf(file, "vgrf%d", inst->src[i].nr);
         break;
      case FIXED_GRF:
         fprintf(file, "g%d", inst->src[i].nr);
         break;
      case MRF:
         /* MRFs are not legal sources; make them stand out. */
         fprintf(file, "***m%d***", inst->src[i].nr);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].nr);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].nr);
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case IMM:
         /* Immediates are printed with a type-suffix convention, e.g.
          * "1.0f", "42d", "7u".
          */
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%-gf", inst->src[i].f);
            break;
         case BRW_REGISTER_TYPE_DF:
            fprintf(file, "%fdf", inst->src[i].df);
            break;
         case BRW_REGISTER_TYPE_W:
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dd", inst->src[i].d);
            break;
         case BRW_REGISTER_TYPE_UW:
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uu", inst->src[i].ud);
            break;
         case BRW_REGISTER_TYPE_Q:
            fprintf(file, "%" PRId64 "q", inst->src[i].d64);
            break;
         case BRW_REGISTER_TYPE_UQ:
            fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
            break;
         case BRW_REGISTER_TYPE_VF:
            /* Packed vector-float: four 8-bit restricted floats. */
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
            break;
         case BRW_REGISTER_TYPE_V:
         case BRW_REGISTER_TYPE_UV:
            fprintf(file, "%08x%s", inst->src[i].ud,
                    inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case ARF:
         switch (inst->src[i].nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->src[i].subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->src[i].subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         }
         break;
      }

      /* As with the destination: print the offset if set, or if this is a
       * partial read of the VGRF.
       */
      if (inst->src[i].offset ||
          (inst->src[i].file == VGRF &&
           alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
         const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
         fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
                 inst->src[i].offset % reg_size);
      }

      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         unsigned stride;
         if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
            /* Hardware regions encode the stride logarithmically. */
            unsigned hstride = inst->src[i].hstride;
            stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
         } else {
            stride = inst->src[i].stride;
         }
         if (stride != 1)
            fprintf(file, "<%u>", stride);

         fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
      }

      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, " ");

   if (inst->force_writemask_all)
      fprintf(file, "NoMask ");

   if (inst->exec_size != dispatch_width)
      fprintf(file, "group%d ", inst->group);

   fprintf(file, "\n");
}

/**
 * Lay out the fixed fragment-shader thread payload registers for gen6+ and
 * record which ones this shader uses in brw_wm_prog_data.
 */
void
fs_visitor::setup_fs_payload_gen6()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   /* The payload is laid out in SIMD8/SIMD16-sized slices; SIMD32 uses two
    * 16-wide slices.
    */
   const unsigned payload_width = MIN2(16, dispatch_width);
   assert(dispatch_width % payload_width == 0);
   assert(devinfo->gen >= 6);

   prog_data->uses_src_depth = prog_data->uses_src_w =
      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
   prog_data->uses_sample_mask =
      (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;

   /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
    *
    *    "MSDISPMODE_PERSAMPLE is required in order to select
    *    POSOFFSET_SAMPLE"
    *
    * So we can only really get sample positions if we are doing real
    * per-sample dispatch.  If we need gl_SamplePosition and we don't have
    * persample dispatch, we hard-code it to 0.5.
    */
   prog_data->uses_pos_offset = prog_data->persample_dispatch &&
      (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);

   /* R0: PS thread payload header. */
   payload.num_regs++;

   for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
      /* R1: masks, pixel X/Y coordinates. */
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
      /* R3-26: barycentric interpolation coordinates.  These appear in the
       * same order that they appear in the brw_barycentric_mode enum.  Each
       * set of coordinates occupies 2 registers if dispatch width == 8 and 4
       * registers if dispatch width == 16.  Coordinates only appear if they
       * were enabled using the "Barycentric Interpolation Mode" bits in
       * WM_STATE.
       */
      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R27-28: interpolated depth if uses source depth */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R29-30: interpolated W set if GEN6_WM_USES_SOURCE_W. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg[j] = payload.num_regs;
         payload.num_regs++;
      }

      /* R32-33: MSAA input coverage mask */
      if (prog_data->uses_sample_mask) {
         assert(devinfo->gen >= 7);
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }
   }

   /* If the shader writes gl_FragDepth, the depth output must be sent along
    * with the render target write.
    */
   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

/** Vertex shaders have a trivial fixed payload. */
void
fs_visitor::setup_vs_payload()
{
   /* R0: thread header, R1: urb handles */
   payload.num_regs = 2;
}

/**
 * Lay out the geometry-shader thread payload and decide between push and
 * pull model for the incoming vertex data.
 */
void
fs_visitor::setup_gs_payload()
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);

   /* R0: thread header, R1: output URB handles */
   payload.num_regs = 2;

   if (gs_prog_data->include_primitive_id) {
      /* R2: Primitive ID 0..7 */
      payload.num_regs++;
   }

   /* Always enable VUE handles so we can safely use pull model if needed.
    *
    * The push model for a GS uses a ton of register space even for trivial
    * scenarios with just a few inputs, so just make things easier and a bit
    * safer by always having pull model available.
    */
   gs_prog_data->base.include_vue_handles = true;

   /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
   payload.num_regs += nir->info.gs.vertices_in;

   /* Use a maximum of 24 registers for push-model inputs. */
   const unsigned max_push_components = 24;

   /* If pushing our inputs would take too many registers, reduce the URB read
    * length (which is in HWords, or 8 registers), and resort to pulling.
    *
    * Note that the GS reads <URB Read Length> HWords for every vertex - so we
    * have to multiply by VerticesIn to obtain the total storage requirement.
    */
   if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
       max_push_components) {
      vue_prog_data->urb_read_length =
         ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
   }
}

/** Compute shaders only receive the thread header in the fixed payload. */
void
fs_visitor::setup_cs_payload()
{
   assert(devinfo->gen >= 7);
   payload.num_regs = 1;
}

/**
 * Compute regs_live_at_ip[]: for every instruction pointer, the number of
 * virtual GRF registers simultaneously live there.  Used by
 * dump_instructions() for pressure annotations.
 */
void
fs_visitor::calculate_register_pressure()
{
   invalidate_live_intervals();
   calculate_live_intervals();

   unsigned num_instructions = 0;
   foreach_block(block, cfg)
      num_instructions += block->instructions.length();

   regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);

   /* Accumulate the size of every VGRF over its live range. */
   for (unsigned reg = 0; reg < alloc.count; reg++) {
      for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
         regs_live_at_ip[ip] += alloc.sizes[reg];
   }
}

/**
 * Run the full optimization and lowering pipeline over the IR, validating
 * after every pass.
 */
void
fs_visitor::optimize()
{
   /* Start by validating the shader we currently have. */
   validate();

   /* bld is the common builder object pointing at the end of the program we
    * used to translate it into i965 IR.  For the optimization and lowering
    * passes coming next, any code added after the end of the program without
    * having explicitly called fs_builder::at() clearly points at a mistake.
    * Ideally optimization passes wouldn't be part of the visitor so they
    * wouldn't have access to bld at all, but they do, so just in case some
    * pass forgets to ask for a location explicitly set it to NULL here to
    * make it trip.  The dispatch width is initialized to a bogus value to
    * make sure that optimizations set the execution controls explicitly to
    * match the code they are manipulating instead of relying on the defaults.
    */
   bld = fs_builder(this, 64);

   assign_constant_locations();
   lower_constant_loads();

   validate();

   split_virtual_grfs();
   validate();

   /* Run a pass, track whether it made progress, validate the IR afterwards
    * and optionally dump the shader when INTEL_DEBUG=optimizer is set.  The
    * statement expression evaluates to the pass's own progress flag.
    */
#define OPT(pass, args...) ({                                           \
      pass_num++;                                                       \
      bool this_progress = pass(args);                                  \
                                                                        \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
         char filename[64];                                             \
         snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass,             \
                  stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
                                                                        \
         backend_shader::dump_instructions(filename);                   \
      }                                                                 \
                                                                        \
      validate();                                                       \
                                                                        \
      progress = progress || this_progress;                             \
      this_progress;                                                    \
   })

   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s%d-%s-00-00-start",
               stage_abbrev, dispatch_width, nir->info.name);

      backend_shader::dump_instructions(filename);
   }

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice.  Once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(dead_code_eliminate);

   OPT(remove_extra_rounding_modes);

   /* Main optimization loop: iterate until a whole round makes no progress. */
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(remove_duplicate_mrf_writes);

      OPT(opt_algebraic);
      OPT(opt_cse);
      OPT(opt_copy_propagation);
      OPT(opt_predicated_break, this);
      OPT(opt_cmod_propagation);
      OPT(dead_code_eliminate);
      OPT(opt_peephole_sel);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_register_renaming);
      OPT(opt_saturate_propagation);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(eliminate_find_live_channel);

      OPT(compact_virtual_grfs);
   } while (progress);

   if (OPT(lower_linterp)) {
      OPT(opt_copy_propagation);
      OPT(dead_code_eliminate);
   }

   /* Do this after cmod propagation has had every possible opportunity to
    * propagate results into SEL instructions.
    */
   if (OPT(opt_peephole_csel))
      OPT(dead_code_eliminate);

   progress = false;
   pass_num = 0;

   if (OPT(lower_pack)) {
      OPT(register_coalesce);
      OPT(dead_code_eliminate);
   }

   OPT(lower_simd_width);

   /* After SIMD lowering just in case we had to unroll the EOT send. */
   OPT(opt_sampler_eot);

   OPT(lower_logical_sends);

   if (progress) {
      OPT(opt_copy_propagation);
      /* Only run after logical send lowering because it's easier to implement
       * in terms of physical sends.
       */
      if (OPT(opt_zero_samples))
         OPT(opt_copy_propagation);
      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(opt_cse);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
      OPT(remove_duplicate_mrf_writes);
      OPT(opt_peephole_sel);
   }

   OPT(opt_redundant_discard_jumps);

   if (OPT(lower_load_payload)) {
      split_virtual_grfs();
      OPT(register_coalesce);
      OPT(lower_simd_width);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
   }

   OPT(opt_combine_constants);
   OPT(lower_integer_multiplication);

   if (devinfo->gen <= 5 && OPT(lower_minmax)) {
      OPT(opt_cmod_propagation);
      OPT(opt_cse);
      OPT(opt_copy_propagation);
      OPT(dead_code_eliminate);
   }

   if (OPT(lower_regioning)) {
      OPT(opt_copy_propagation);
      OPT(dead_code_eliminate);
      OPT(lower_simd_width);
   }

   OPT(fixup_sends_duplicate_payload);

   lower_uniform_pull_constant_loads();

   validate();
}

/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 */
bool
fs_visitor::fixup_sends_duplicate_payload()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      /* Only split SENDs with a second payload (ex_mlen > 0) whose two
       * payload regions overlap need fixing.
       */
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         fs_reg tmp = fs_reg(VGRF, alloc.allocate(inst->ex_mlen),
                             BRW_REGISTER_TYPE_UD);
         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
         const fs_builder ibld = bld.at(block, inst).exec_all().group(16, 0);
         fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
         fs_reg copy_dst = tmp;
         /* Copy two registers per SIMD16 MOV; fall back to SIMD8 for an
          * odd trailing register.
          */
         for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
            if (inst->ex_mlen == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         /* Point the second payload at the fresh, non-overlapping copy. */
         inst->src[3] = tmp;
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Three source instruction must have a GRF/MRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
void
fs_visitor::fixup_3src_null_dest()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_3src(devinfo) && inst->dst.is_null()) {
         /* Replace the null destination with a scratch VGRF big enough for
          * the full dispatch width; the result is simply discarded.
          */
         inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                            inst->dst.type);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();
}

void
fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
{
   bool allocated_without_spills;

   static const enum instruction_scheduler_mode pre_modes[] = {
      SCHEDULE_PRE,
      SCHEDULE_PRE_NON_LIFO,
      SCHEDULE_PRE_LIFO,
   };

   /* INTEL_DEBUG=spill_fs forces spilling of everything for debugging. */
   bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);

   /* Try each scheduling heuristic to see if it can successfully register
    * allocate without spilling.  They should be ordered by decreasing
    * performance but increasing likelihood of allocating.
7265b8e80941Smrg */ 7266b8e80941Smrg for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { 7267b8e80941Smrg schedule_instructions(pre_modes[i]); 7268b8e80941Smrg 7269b8e80941Smrg if (0) { 7270b8e80941Smrg assign_regs_trivial(); 7271b8e80941Smrg allocated_without_spills = true; 7272b8e80941Smrg } else { 7273b8e80941Smrg allocated_without_spills = assign_regs(false, spill_all); 7274b8e80941Smrg } 7275b8e80941Smrg if (allocated_without_spills) 7276b8e80941Smrg break; 7277b8e80941Smrg } 7278b8e80941Smrg 7279b8e80941Smrg if (!allocated_without_spills) { 7280b8e80941Smrg if (!allow_spilling) 7281b8e80941Smrg fail("Failure to register allocate and spilling is not allowed."); 7282b8e80941Smrg 7283b8e80941Smrg /* We assume that any spilling is worse than just dropping back to 7284b8e80941Smrg * SIMD8. There's probably actually some intermediate point where 7285b8e80941Smrg * SIMD16 with a couple of spills is still better. 7286b8e80941Smrg */ 7287b8e80941Smrg if (dispatch_width > min_dispatch_width) { 7288b8e80941Smrg fail("Failure to register allocate. Reduce number of " 7289b8e80941Smrg "live scalar values to avoid this."); 7290b8e80941Smrg } else { 7291b8e80941Smrg compiler->shader_perf_log(log_data, 7292b8e80941Smrg "%s shader triggered register spilling. " 7293b8e80941Smrg "Try reducing the number of live scalar " 7294b8e80941Smrg "values to improve performance.\n", 7295b8e80941Smrg stage_name); 7296b8e80941Smrg } 7297b8e80941Smrg 7298b8e80941Smrg /* Since we're out of heuristics, just go spill registers until we 7299b8e80941Smrg * get an allocation. 
7300b8e80941Smrg */ 7301b8e80941Smrg while (!assign_regs(true, spill_all)) { 7302b8e80941Smrg if (failed) 7303b8e80941Smrg break; 7304b8e80941Smrg } 7305b8e80941Smrg } 7306b8e80941Smrg 7307b8e80941Smrg /* This must come after all optimization and register allocation, since 7308b8e80941Smrg * it inserts dead code that happens to have side effects, and it does 7309b8e80941Smrg * so based on the actual physical registers in use. 7310b8e80941Smrg */ 7311b8e80941Smrg insert_gen4_send_dependency_workarounds(); 7312b8e80941Smrg 7313b8e80941Smrg if (failed) 7314b8e80941Smrg return; 7315b8e80941Smrg 7316b8e80941Smrg opt_bank_conflicts(); 7317b8e80941Smrg 7318b8e80941Smrg schedule_instructions(SCHEDULE_POST); 7319b8e80941Smrg 7320b8e80941Smrg if (last_scratch > 0) { 7321b8e80941Smrg MAYBE_UNUSED unsigned max_scratch_size = 2 * 1024 * 1024; 7322b8e80941Smrg 7323b8e80941Smrg prog_data->total_scratch = brw_get_scratch_size(last_scratch); 7324b8e80941Smrg 7325b8e80941Smrg if (stage == MESA_SHADER_COMPUTE) { 7326b8e80941Smrg if (devinfo->is_haswell) { 7327b8e80941Smrg /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space" 7328b8e80941Smrg * field documentation, Haswell supports a minimum of 2kB of 7329b8e80941Smrg * scratch space for compute shaders, unlike every other stage 7330b8e80941Smrg * and platform. 7331b8e80941Smrg */ 7332b8e80941Smrg prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048); 7333b8e80941Smrg } else if (devinfo->gen <= 7) { 7334b8e80941Smrg /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space" 7335b8e80941Smrg * field documentation, platforms prior to Haswell measure scratch 7336b8e80941Smrg * size linearly with a range of [1kB, 12kB] and 1kB granularity. 7337b8e80941Smrg */ 7338b8e80941Smrg prog_data->total_scratch = ALIGN(last_scratch, 1024); 7339b8e80941Smrg max_scratch_size = 12 * 1024; 7340b8e80941Smrg } 7341b8e80941Smrg } 7342b8e80941Smrg 7343b8e80941Smrg /* We currently only support up to 2MB of scratch space. 
If we 7344b8e80941Smrg * need to support more eventually, the documentation suggests 7345b8e80941Smrg * that we could allocate a larger buffer, and partition it out 7346b8e80941Smrg * ourselves. We'd just have to undo the hardware's address 7347b8e80941Smrg * calculation by subtracting (FFTID * Per Thread Scratch Space) 7348b8e80941Smrg * and then add FFTID * (Larger Per Thread Scratch Space). 7349b8e80941Smrg * 7350b8e80941Smrg * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline > 7351b8e80941Smrg * Thread Group Tracking > Local Memory/Scratch Space. 7352b8e80941Smrg */ 7353b8e80941Smrg assert(prog_data->total_scratch < max_scratch_size); 7354b8e80941Smrg } 7355b8e80941Smrg} 7356b8e80941Smrg 7357b8e80941Smrgbool 7358b8e80941Smrgfs_visitor::run_vs() 7359b8e80941Smrg{ 7360b8e80941Smrg assert(stage == MESA_SHADER_VERTEX); 7361b8e80941Smrg 7362b8e80941Smrg setup_vs_payload(); 7363b8e80941Smrg 7364b8e80941Smrg if (shader_time_index >= 0) 7365b8e80941Smrg emit_shader_time_begin(); 7366b8e80941Smrg 7367b8e80941Smrg emit_nir_code(); 7368b8e80941Smrg 7369b8e80941Smrg if (failed) 7370b8e80941Smrg return false; 7371b8e80941Smrg 7372b8e80941Smrg compute_clip_distance(); 7373b8e80941Smrg 7374b8e80941Smrg emit_urb_writes(); 7375b8e80941Smrg 7376b8e80941Smrg if (shader_time_index >= 0) 7377b8e80941Smrg emit_shader_time_end(); 7378b8e80941Smrg 7379b8e80941Smrg calculate_cfg(); 7380b8e80941Smrg 7381b8e80941Smrg optimize(); 7382b8e80941Smrg 7383b8e80941Smrg assign_curb_setup(); 7384b8e80941Smrg assign_vs_urb_setup(); 7385b8e80941Smrg 7386b8e80941Smrg fixup_3src_null_dest(); 7387b8e80941Smrg allocate_registers(8, true); 7388b8e80941Smrg 7389b8e80941Smrg return !failed; 7390b8e80941Smrg} 7391b8e80941Smrg 7392b8e80941Smrgbool 7393b8e80941Smrgfs_visitor::run_tcs_single_patch() 7394b8e80941Smrg{ 7395b8e80941Smrg assert(stage == MESA_SHADER_TESS_CTRL); 7396b8e80941Smrg 7397b8e80941Smrg struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 7398b8e80941Smrg 
7399b8e80941Smrg /* r1-r4 contain the ICP handles. */ 7400b8e80941Smrg payload.num_regs = 5; 7401b8e80941Smrg 7402b8e80941Smrg if (shader_time_index >= 0) 7403b8e80941Smrg emit_shader_time_begin(); 7404b8e80941Smrg 7405b8e80941Smrg /* Initialize gl_InvocationID */ 7406b8e80941Smrg fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW); 7407b8e80941Smrg fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD); 7408b8e80941Smrg bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210))); 7409b8e80941Smrg bld.MOV(channels_ud, channels_uw); 7410b8e80941Smrg 7411b8e80941Smrg if (tcs_prog_data->instances == 1) { 7412b8e80941Smrg invocation_id = channels_ud; 7413b8e80941Smrg } else { 7414b8e80941Smrg const unsigned invocation_id_mask = devinfo->gen >= 11 ? 7415b8e80941Smrg INTEL_MASK(22, 16) : INTEL_MASK(23, 17); 7416b8e80941Smrg const unsigned invocation_id_shift = devinfo->gen >= 11 ? 16 : 17; 7417b8e80941Smrg 7418b8e80941Smrg invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD); 7419b8e80941Smrg 7420b8e80941Smrg /* Get instance number from g0.2 bits 23:17, and multiply it by 8. 
*/ 7421b8e80941Smrg fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD); 7422b8e80941Smrg fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD); 7423b8e80941Smrg bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)), 7424b8e80941Smrg brw_imm_ud(invocation_id_mask)); 7425b8e80941Smrg bld.SHR(instance_times_8, t, brw_imm_ud(invocation_id_shift - 3)); 7426b8e80941Smrg 7427b8e80941Smrg bld.ADD(invocation_id, instance_times_8, channels_ud); 7428b8e80941Smrg } 7429b8e80941Smrg 7430b8e80941Smrg /* Fix the disptach mask */ 7431b8e80941Smrg if (nir->info.tess.tcs_vertices_out % 8) { 7432b8e80941Smrg bld.CMP(bld.null_reg_ud(), invocation_id, 7433b8e80941Smrg brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L); 7434b8e80941Smrg bld.IF(BRW_PREDICATE_NORMAL); 7435b8e80941Smrg } 7436b8e80941Smrg 7437b8e80941Smrg emit_nir_code(); 7438b8e80941Smrg 7439b8e80941Smrg if (nir->info.tess.tcs_vertices_out % 8) { 7440b8e80941Smrg bld.emit(BRW_OPCODE_ENDIF); 7441b8e80941Smrg } 7442b8e80941Smrg 7443b8e80941Smrg /* Emit EOT write; set TR DS Cache bit */ 7444b8e80941Smrg fs_reg srcs[3] = { 7445b8e80941Smrg fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)), 7446b8e80941Smrg fs_reg(brw_imm_ud(WRITEMASK_X << 16)), 7447b8e80941Smrg fs_reg(brw_imm_ud(0)), 7448b8e80941Smrg }; 7449b8e80941Smrg fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); 7450b8e80941Smrg bld.LOAD_PAYLOAD(payload, srcs, 3, 2); 7451b8e80941Smrg 7452b8e80941Smrg fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED, 7453b8e80941Smrg bld.null_reg_ud(), payload); 7454b8e80941Smrg inst->mlen = 3; 7455b8e80941Smrg inst->eot = true; 7456b8e80941Smrg 7457b8e80941Smrg if (shader_time_index >= 0) 7458b8e80941Smrg emit_shader_time_end(); 7459b8e80941Smrg 7460b8e80941Smrg if (failed) 7461b8e80941Smrg return false; 7462b8e80941Smrg 7463b8e80941Smrg calculate_cfg(); 7464b8e80941Smrg 7465b8e80941Smrg optimize(); 7466b8e80941Smrg 7467b8e80941Smrg assign_curb_setup(); 7468b8e80941Smrg 
assign_tcs_single_patch_urb_setup(); 7469b8e80941Smrg 7470b8e80941Smrg fixup_3src_null_dest(); 7471b8e80941Smrg allocate_registers(8, true); 7472b8e80941Smrg 7473b8e80941Smrg return !failed; 7474b8e80941Smrg} 7475b8e80941Smrg 7476b8e80941Smrgbool 7477b8e80941Smrgfs_visitor::run_tes() 7478b8e80941Smrg{ 7479b8e80941Smrg assert(stage == MESA_SHADER_TESS_EVAL); 7480b8e80941Smrg 7481b8e80941Smrg /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */ 7482b8e80941Smrg payload.num_regs = 5; 7483b8e80941Smrg 7484b8e80941Smrg if (shader_time_index >= 0) 7485b8e80941Smrg emit_shader_time_begin(); 7486b8e80941Smrg 7487b8e80941Smrg emit_nir_code(); 7488b8e80941Smrg 7489b8e80941Smrg if (failed) 7490b8e80941Smrg return false; 7491b8e80941Smrg 7492b8e80941Smrg emit_urb_writes(); 7493b8e80941Smrg 7494b8e80941Smrg if (shader_time_index >= 0) 7495b8e80941Smrg emit_shader_time_end(); 7496b8e80941Smrg 7497b8e80941Smrg calculate_cfg(); 7498b8e80941Smrg 7499b8e80941Smrg optimize(); 7500b8e80941Smrg 7501b8e80941Smrg assign_curb_setup(); 7502b8e80941Smrg assign_tes_urb_setup(); 7503b8e80941Smrg 7504b8e80941Smrg fixup_3src_null_dest(); 7505b8e80941Smrg allocate_registers(8, true); 7506b8e80941Smrg 7507b8e80941Smrg return !failed; 7508b8e80941Smrg} 7509b8e80941Smrg 7510b8e80941Smrgbool 7511b8e80941Smrgfs_visitor::run_gs() 7512b8e80941Smrg{ 7513b8e80941Smrg assert(stage == MESA_SHADER_GEOMETRY); 7514b8e80941Smrg 7515b8e80941Smrg setup_gs_payload(); 7516b8e80941Smrg 7517b8e80941Smrg this->final_gs_vertex_count = vgrf(glsl_type::uint_type); 7518b8e80941Smrg 7519b8e80941Smrg if (gs_compile->control_data_header_size_bits > 0) { 7520b8e80941Smrg /* Create a VGRF to store accumulated control data bits. */ 7521b8e80941Smrg this->control_data_bits = vgrf(glsl_type::uint_type); 7522b8e80941Smrg 7523b8e80941Smrg /* If we're outputting more than 32 control data bits, then EmitVertex() 7524b8e80941Smrg * will set control_data_bits to 0 after emitting the first vertex. 
7525b8e80941Smrg * Otherwise, we need to initialize it to 0 here. 7526b8e80941Smrg */ 7527b8e80941Smrg if (gs_compile->control_data_header_size_bits <= 32) { 7528b8e80941Smrg const fs_builder abld = bld.annotate("initialize control data bits"); 7529b8e80941Smrg abld.MOV(this->control_data_bits, brw_imm_ud(0u)); 7530b8e80941Smrg } 7531b8e80941Smrg } 7532b8e80941Smrg 7533b8e80941Smrg if (shader_time_index >= 0) 7534b8e80941Smrg emit_shader_time_begin(); 7535b8e80941Smrg 7536b8e80941Smrg emit_nir_code(); 7537b8e80941Smrg 7538b8e80941Smrg emit_gs_thread_end(); 7539b8e80941Smrg 7540b8e80941Smrg if (shader_time_index >= 0) 7541b8e80941Smrg emit_shader_time_end(); 7542b8e80941Smrg 7543b8e80941Smrg if (failed) 7544b8e80941Smrg return false; 7545b8e80941Smrg 7546b8e80941Smrg calculate_cfg(); 7547b8e80941Smrg 7548b8e80941Smrg optimize(); 7549b8e80941Smrg 7550b8e80941Smrg assign_curb_setup(); 7551b8e80941Smrg assign_gs_urb_setup(); 7552b8e80941Smrg 7553b8e80941Smrg fixup_3src_null_dest(); 7554b8e80941Smrg allocate_registers(8, true); 7555b8e80941Smrg 7556b8e80941Smrg return !failed; 7557b8e80941Smrg} 7558b8e80941Smrg 7559b8e80941Smrg/* From the SKL PRM, Volume 16, Workarounds: 7560b8e80941Smrg * 7561b8e80941Smrg * 0877 3D Pixel Shader Hang possible when pixel shader dispatched with 7562b8e80941Smrg * only header phases (R0-R2) 7563b8e80941Smrg * 7564b8e80941Smrg * WA: Enable a non-header phase (e.g. push constant) when dispatch would 7565b8e80941Smrg * have been header only. 7566b8e80941Smrg * 7567b8e80941Smrg * Instead of enabling push constants one can alternatively enable one of the 7568b8e80941Smrg * inputs. Here one simply chooses "layer" which shouldn't impose much 7569b8e80941Smrg * overhead. 
 */
static void
gen9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
{
   /* Nothing to do if the shader already reads varyings or push constants. */
   if (wm_prog_data->num_varying_inputs)
      return;

   if (wm_prog_data->base.curb_read_length)
      return;

   /* Claim the "layer" input so the dispatch is no longer header-only. */
   wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
   wm_prog_data->num_varying_inputs = 1;
}

/**
 * Top-level driver for compiling a fragment shader at this visitor's
 * dispatch width.  do_rep_send selects the SIMD16 replicated-data
 * "repclear" fast path.  Returns false (with fail_msg set) on failure.
 */
bool
fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
{
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;

   assert(stage == MESA_SHADER_FRAGMENT);

   if (devinfo->gen >= 6)
      setup_fs_payload_gen6();
   else
      setup_fs_payload_gen4();

   if (0) {
      /* Debug-only dummy shader path; normally compiled out. */
      emit_dummy_fs();
   } else if (do_rep_send) {
      assert(dispatch_width == 16);
      emit_repclear_shader();
   } else {
      if (shader_time_index >= 0)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (nir->info.inputs_read > 0 ||
          (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
         if (devinfo->gen < 6)
            emit_interpolation_setup_gen4();
         else
            emit_interpolation_setup_gen6();
      }

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (wm_prog_data->uses_kill) {
         const fs_reg dispatch_mask =
            devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0);
         bld.exec_all().group(1, 0)
            .MOV(retype(brw_flag_reg(0, 1), BRW_REGISTER_TYPE_UW),
                 retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
      }

      emit_nir_code();

      if (failed)
         return false;

      /* Resolved to the actual jump target after the CFG is built. */
      if (wm_prog_data->uses_kill)
         bld.emit(FS_OPCODE_PLACEHOLDER_HALT);

      if (wm_key->alpha_test_func)
         emit_alpha_test();

      emit_fb_writes();

      if (shader_time_index >= 0)
         emit_shader_time_end();

      calculate_cfg();

      optimize();

      assign_curb_setup();

      if (devinfo->gen >= 9)
         gen9_ps_header_only_workaround(wm_prog_data);

      assign_urb_setup();

      fixup_3src_null_dest();
      allocate_registers(8, allow_spilling);

      if (failed)
         return false;
   }

   return !failed;
}

/** Top-level driver for compiling a compute shader. */
bool
fs_visitor::run_cs(unsigned min_dispatch_width)
{
   assert(stage == MESA_SHADER_COMPUTE);
   assert(dispatch_width >= min_dispatch_width);

   setup_cs_payload();

   if (shader_time_index >= 0)
      emit_shader_time_begin();

   if (devinfo->is_haswell && prog_data->total_shared > 0) {
      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
      const fs_builder abld = bld.exec_all().group(1, 0);
      abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
               suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
   }

   emit_nir_code();

   if (failed)
      return false;

   emit_cs_terminate();

   if (shader_time_index >= 0)
      emit_shader_time_end();

   calculate_cfg();

   optimize();

   assign_curb_setup();

   fixup_3src_null_dest();
   allocate_registers(min_dispatch_width, true);

   if (failed)
      return false;

   return !failed;
}

/**
 * Return a bitfield where bit n is set if barycentric interpolation mode n
 * (see enum brw_barycentric_mode) is needed by the fragment shader.
 *
 * We examine the load_barycentric intrinsics rather than looking at input
 * variables so that we catch interpolateAtCentroid() messages too, which
 * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
 */
static unsigned
brw_compute_barycentric_interp_modes(const struct gen_device_info *devinfo,
                                     const nir_shader *shader)
{
   unsigned barycentric_interp_modes = 0;

   nir_foreach_function(f, shader) {
      if (!f->impl)
         continue;

      nir_foreach_block(block, f->impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
               continue;

            /* Ignore WPOS; it doesn't require interpolation. */
            if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
               continue;

            /* src[0] of load_interpolated_input is the barycentric
             * coordinate intrinsic that determines the mode.
             */
            intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
            enum glsl_interp_mode interp = (enum glsl_interp_mode)
               nir_intrinsic_interp_mode(intrin);
            nir_intrinsic_op bary_op = intrin->intrinsic;
            enum brw_barycentric_mode bary =
               brw_barycentric_mode(interp, bary_op);

            barycentric_interp_modes |= 1 << bary;

            /* Some platforms also need the pixel-center mode set up for any
             * centroid mode used (unlit centroid workaround).
             */
            if (devinfo->needs_unlit_centroid_workaround &&
                bary_op == nir_intrinsic_load_barycentric_centroid)
               barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
         }
      }
   }

   return barycentric_interp_modes;
}

/**
 * Compute prog_data->flat_inputs: a bitfield with one bit per URB input
 * slot that uses flat (non-interpolated) shading.
 */
static void
brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
                        const nir_shader *shader)
{
   prog_data->flat_inputs = 0;

   nir_foreach_variable(var, &shader->inputs) {
      unsigned slots = glsl_count_attribute_slots(var->type, false);
      for (unsigned s = 0; s < slots; s++) {
         int input_index = prog_data->urb_setup[var->data.location + s];

         /* Slot not used by this shader. */
         if (input_index < 0)
            continue;

         /* flat shading */
         if (var->data.interpolation == INTERP_MODE_FLAT)
            prog_data->flat_inputs |= 1 << input_index;
      }
   }
}

/**
 * Map the shader's depth-output layout to the PS computed-depth mode
 * (BRW_PSCDEPTH_*) programmed into the hardware.
 */
static uint8_t
computed_depth_mode(const nir_shader *shader)
{
   if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      switch (shader->info.fs.depth_layout) {
      case FRAG_DEPTH_LAYOUT_NONE:
      case FRAG_DEPTH_LAYOUT_ANY:
         return BRW_PSCDEPTH_ON;
      case FRAG_DEPTH_LAYOUT_GREATER:
         return BRW_PSCDEPTH_ON_GE;
      case FRAG_DEPTH_LAYOUT_LESS:
         return BRW_PSCDEPTH_ON_LE;
      case FRAG_DEPTH_LAYOUT_UNCHANGED:
         return BRW_PSCDEPTH_OFF;
      }
   }
   return BRW_PSCDEPTH_OFF;
}

/**
 * Move load_interpolated_input with simple (payload-based) barycentric modes
 * to the top of the program so we don't emit multiple PLNs for the same input.
 *
 * This works around CSE not being able to handle non-dominating cases
 * such as:
 *
 *    if (...)
 *    {
 *       interpolate input
 *    } else {
 *       interpolate the same exact input
 *    }
 *
 * This should be replaced by global value numbering someday.
 */
static bool
move_interpolation_to_top(nir_shader *nir)
{
   bool progress = false;

   nir_foreach_function(f, nir) {
      if (!f->impl)
         continue;

      nir_block *top = nir_start_block(f->impl);
      exec_node *cursor_node = NULL;

      nir_foreach_block(block, f->impl) {
         if (block == top)
            continue;

         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
               continue;
            nir_intrinsic_instr *bary_intrinsic =
               nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
            nir_intrinsic_op op = bary_intrinsic->intrinsic;

            /* Leave interpolateAtSample/Offset() where they are. */
            if (op == nir_intrinsic_load_barycentric_at_sample ||
                op == nir_intrinsic_load_barycentric_at_offset)
               continue;

            /* Move the barycentric source, the offset source, and the load
             * itself, in that order, so defs stay ahead of their uses.
             */
            nir_instr *move[3] = {
               &bary_intrinsic->instr,
               intrin->src[1].ssa->parent_instr,
               instr
            };

            for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
               if (move[i]->block != top) {
                  move[i]->block = top;
                  exec_node_remove(&move[i]->node);
                  if (cursor_node) {
                     exec_node_insert_after(cursor_node, &move[i]->node);
                  } else {
                     exec_list_push_head(&top->instr_list, &move[i]->node);
                  }
                  /* Keep appending after the last moved instruction so the
                   * relative order of moved instructions is preserved.
                   */
                  cursor_node = &move[i]->node;
                  progress = true;
               }
            }
         }
      }
      /* Instructions changed blocks, so dominance/block-index metadata must
       * not be claimed beyond what we declare preserved here.
       */
      nir_metadata_preserve(f->impl, (nir_metadata)
                            ((unsigned) nir_metadata_block_index |
                             (unsigned) nir_metadata_dominance));
   }

   return progress;
}

/**
 * Demote per-sample barycentric intrinsics to centroid.
 *
 * Useful when rendering to a non-multisampled buffer.
7876b8e80941Smrg */ 7877b8e80941Smrgstatic bool 7878b8e80941Smrgdemote_sample_qualifiers(nir_shader *nir) 7879b8e80941Smrg{ 7880b8e80941Smrg bool progress = true; 7881b8e80941Smrg 7882b8e80941Smrg nir_foreach_function(f, nir) { 7883b8e80941Smrg if (!f->impl) 7884b8e80941Smrg continue; 7885b8e80941Smrg 7886b8e80941Smrg nir_builder b; 7887b8e80941Smrg nir_builder_init(&b, f->impl); 7888b8e80941Smrg 7889b8e80941Smrg nir_foreach_block(block, f->impl) { 7890b8e80941Smrg nir_foreach_instr_safe(instr, block) { 7891b8e80941Smrg if (instr->type != nir_instr_type_intrinsic) 7892b8e80941Smrg continue; 7893b8e80941Smrg 7894b8e80941Smrg nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 7895b8e80941Smrg if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample && 7896b8e80941Smrg intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample) 7897b8e80941Smrg continue; 7898b8e80941Smrg 7899b8e80941Smrg b.cursor = nir_before_instr(instr); 7900b8e80941Smrg nir_ssa_def *centroid = 7901b8e80941Smrg nir_load_barycentric(&b, nir_intrinsic_load_barycentric_centroid, 7902b8e80941Smrg nir_intrinsic_interp_mode(intrin)); 7903b8e80941Smrg nir_ssa_def_rewrite_uses(&intrin->dest.ssa, 7904b8e80941Smrg nir_src_for_ssa(centroid)); 7905b8e80941Smrg nir_instr_remove(instr); 7906b8e80941Smrg progress = true; 7907b8e80941Smrg } 7908b8e80941Smrg } 7909b8e80941Smrg 7910b8e80941Smrg nir_metadata_preserve(f->impl, (nir_metadata) 7911b8e80941Smrg ((unsigned) nir_metadata_block_index | 7912b8e80941Smrg (unsigned) nir_metadata_dominance)); 7913b8e80941Smrg } 7914b8e80941Smrg 7915b8e80941Smrg return progress; 7916b8e80941Smrg} 7917b8e80941Smrg 7918b8e80941Smrg/** 7919b8e80941Smrg * Pre-gen6, the register file of the EUs was shared between threads, 7920b8e80941Smrg * and each thread used some subset allocated on a 16-register block 7921b8e80941Smrg * granularity. The unit states wanted these block counts. 
 */
static inline int
brw_register_blocks(int reg_count)
{
   /* Round up to a 16-register block; the hardware field is biased by 1. */
   return ALIGN(reg_count, 16) / 16 - 1;
}

/**
 * Compile a fragment shader, attempting SIMD8/SIMD16/SIMD32 variants as
 * permitted by the hardware, debug flags and shader characteristics.
 *
 * Fills prog_data and returns the generated assembly (owned by mem_ctx),
 * or NULL on failure with *error_str set (if error_str is non-NULL).
 */
const unsigned *
brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_wm_prog_key *key,
               struct brw_wm_prog_data *prog_data,
               nir_shader *shader,
               struct gl_program *prog,
               int shader_time_index8, int shader_time_index16,
               int shader_time_index32, bool allow_spilling,
               bool use_rep_send, struct brw_vue_map *vue_map,
               char **error_str)
{
   const struct gen_device_info *devinfo = compiler->devinfo;

   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
   brw_nir_lower_fs_inputs(shader, devinfo, key);
   brw_nir_lower_fs_outputs(shader);

   if (devinfo->gen < 6)
      brw_setup_vue_interpolation(vue_map, shader, prog_data);

   if (!key->multisample_fbo)
      NIR_PASS_V(shader, demote_sample_qualifiers);
   NIR_PASS_V(shader, move_interpolation_to_top);
   shader = brw_postprocess_nir(shader, compiler, true);

   /* key->alpha_test_func means simulating alpha testing via discards,
    * so the shader definitely kills pixels.
    */
   prog_data->uses_kill = shader->info.fs.uses_discard ||
      key->alpha_test_func;
   prog_data->uses_omask = key->multisample_fbo &&
      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
   prog_data->computed_depth_mode = computed_depth_mode(shader);
   prog_data->computed_stencil =
      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);

   prog_data->persample_dispatch =
      key->multisample_fbo &&
      (key->persample_interp ||
       (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
                                           SYSTEM_BIT_SAMPLE_POS)) ||
       shader->info.fs.uses_sample_qualifier ||
       shader->info.outputs_read);

   prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;

   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
   prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
   prog_data->inner_coverage = shader->info.fs.inner_coverage;

   prog_data->barycentric_interp_modes =
      brw_compute_barycentric_interp_modes(compiler->devinfo, shader);

   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;

   /* SIMD8 must succeed for the compile to succeed at all; wider variants
    * are best-effort.
    */
   fs_visitor v8(compiler, log_data, mem_ctx, key,
                 &prog_data->base, prog, shader, 8,
                 shader_time_index8);
   if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
      if (error_str)
         *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);

      return NULL;
   } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
      simd8_cfg = v8.cfg;
      prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
      prog_data->reg_blocks_8 = brw_register_blocks(v8.grf_used);
   }

   if (v8.max_dispatch_width >= 16 &&
       likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
      /* Try a SIMD16 compile */
      fs_visitor v16(compiler, log_data, mem_ctx, key,
                     &prog_data->base, prog, shader, 16,
                     shader_time_index16);
      v16.import_uniforms(&v8);
      if (!v16.run_fs(allow_spilling, use_rep_send)) {
         compiler->shader_perf_log(log_data,
                                   "SIMD16 shader failed to compile: %s",
                                   v16.fail_msg);
      } else {
         simd16_cfg = v16.cfg;
         prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
         prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used);
      }
   }

   /* Currently, the compiler only supports SIMD32 on SNB+ */
   if (v8.max_dispatch_width >= 32 && !use_rep_send &&
       compiler->devinfo->gen >= 6 &&
       unlikely(INTEL_DEBUG & DEBUG_DO32)) {
      /* Try a SIMD32 compile */
      fs_visitor v32(compiler, log_data, mem_ctx, key,
                     &prog_data->base, prog, shader, 32,
                     shader_time_index32);
      v32.import_uniforms(&v8);
      if (!v32.run_fs(allow_spilling, false)) {
         compiler->shader_perf_log(log_data,
                                   "SIMD32 shader failed to compile: %s",
                                   v32.fail_msg);
      } else {
         simd32_cfg = v32.cfg;
         prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
         prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
      }
   }

   /* When the caller requests a repclear shader, they want SIMD16-only */
   if (use_rep_send)
      simd8_cfg = NULL;

   /* Prior to Iron Lake, the PS had a single shader offset with a jump table
    * at the top to select the shader.  We've never implemented that.
    * Instead, we just give them exactly one shader and we pick the widest one
    * available.
    */
   if (compiler->devinfo->gen < 5) {
      if (simd32_cfg || simd16_cfg)
         simd8_cfg = NULL;
      if (simd32_cfg)
         simd16_cfg = NULL;
   }

   /* If computed depth is enabled SNB only allows SIMD8. */
   if (compiler->devinfo->gen == 6 &&
       prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF)
      assert(simd16_cfg == NULL && simd32_cfg == NULL);

   if (compiler->devinfo->gen <= 5 && !simd8_cfg) {
      /* Iron lake and earlier only have one Dispatch GRF start field.  Make
       * the data available in the base prog data struct for convenience.
       */
      if (simd16_cfg) {
         prog_data->base.dispatch_grf_start_reg =
            prog_data->dispatch_grf_start_reg_16;
      } else if (simd32_cfg) {
         prog_data->base.dispatch_grf_start_reg =
            prog_data->dispatch_grf_start_reg_32;
      }
   }

   if (prog_data->persample_dispatch) {
      /* Starting with SandyBridge (where we first get MSAA), the different
       * pixel dispatch combinations are grouped into classifications A
       * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1).  On all hardware
       * generations, the only configurations supporting persample dispatch
       * are those in which only one dispatch width is enabled.
       */
      if (simd32_cfg || simd16_cfg)
         simd8_cfg = NULL;
      if (simd32_cfg)
         simd16_cfg = NULL;
   }

   /* We have to compute the flat inputs after the visitor is finished running
    * because it relies on prog_data->urb_setup which is computed in
    * fs_visitor::calculate_urb_setup().
    */
   brw_compute_flat_inputs(prog_data, shader);

   fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
                  v8.promoted_constants, v8.runtime_check_aads_emit,
                  MESA_SHADER_FRAGMENT);

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
                                     shader->info.label ?
                                        shader->info.label : "unnamed",
                                     shader->info.name));
   }

   if (simd8_cfg) {
      prog_data->dispatch_8 = true;
      g.generate_code(simd8_cfg, 8);
   }

   if (simd16_cfg) {
      prog_data->dispatch_16 = true;
      prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
   }

   if (simd32_cfg) {
      prog_data->dispatch_32 = true;
      prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32);
   }

   return g.get_assembly();
}

/**
 * Set up gl_WorkGroupID by copying its three components from the thread
 * payload: r0.1 (X), r0.6 (Y) and r0.7 (Z).
 */
fs_reg *
fs_visitor::emit_cs_work_group_id_setup()
{
   assert(stage == MESA_SHADER_COMPUTE);

   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));

   struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
   struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
   struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));

   bld.MOV(*reg, r0_1);
   bld.MOV(offset(*reg, bld, 1), r0_6);
   bld.MOV(offset(*reg, bld, 2), r0_7);

   return reg;
}

/**
 * Fill in a push-constant block description: the dword count, the number
 * of 8-dword registers that holds, and the total size in bytes.
 */
static void
fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
{
   block->dwords = dwords;
   block->regs = DIV_ROUND_UP(dwords, 8);
   block->size = block->regs * 32;
}

static void
cs_fill_push_const_info(const struct gen_device_info *devinfo,
                        struct
brw_cs_prog_data *cs_prog_data) 8148b8e80941Smrg{ 8149b8e80941Smrg const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; 8150b8e80941Smrg int subgroup_id_index = get_subgroup_id_param_index(prog_data); 8151b8e80941Smrg bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell; 8152b8e80941Smrg 8153b8e80941Smrg /* The thread ID should be stored in the last param dword */ 8154b8e80941Smrg assert(subgroup_id_index == -1 || 8155b8e80941Smrg subgroup_id_index == (int)prog_data->nr_params - 1); 8156b8e80941Smrg 8157b8e80941Smrg unsigned cross_thread_dwords, per_thread_dwords; 8158b8e80941Smrg if (!cross_thread_supported) { 8159b8e80941Smrg cross_thread_dwords = 0u; 8160b8e80941Smrg per_thread_dwords = prog_data->nr_params; 8161b8e80941Smrg } else if (subgroup_id_index >= 0) { 8162b8e80941Smrg /* Fill all but the last register with cross-thread payload */ 8163b8e80941Smrg cross_thread_dwords = 8 * (subgroup_id_index / 8); 8164b8e80941Smrg per_thread_dwords = prog_data->nr_params - cross_thread_dwords; 8165b8e80941Smrg assert(per_thread_dwords > 0 && per_thread_dwords <= 8); 8166b8e80941Smrg } else { 8167b8e80941Smrg /* Fill all data using cross-thread payload */ 8168b8e80941Smrg cross_thread_dwords = prog_data->nr_params; 8169b8e80941Smrg per_thread_dwords = 0u; 8170b8e80941Smrg } 8171b8e80941Smrg 8172b8e80941Smrg fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords); 8173b8e80941Smrg fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords); 8174b8e80941Smrg 8175b8e80941Smrg unsigned total_dwords = 8176b8e80941Smrg (cs_prog_data->push.per_thread.size * cs_prog_data->threads + 8177b8e80941Smrg cs_prog_data->push.cross_thread.size) / 4; 8178b8e80941Smrg fill_push_const_block_info(&cs_prog_data->push.total, total_dwords); 8179b8e80941Smrg 8180b8e80941Smrg assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 || 8181b8e80941Smrg cs_prog_data->push.per_thread.size == 0); 8182b8e80941Smrg 
assert(cs_prog_data->push.cross_thread.dwords + 8183b8e80941Smrg cs_prog_data->push.per_thread.dwords == 8184b8e80941Smrg prog_data->nr_params); 8185b8e80941Smrg} 8186b8e80941Smrg 8187b8e80941Smrgstatic void 8188b8e80941Smrgcs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size) 8189b8e80941Smrg{ 8190b8e80941Smrg cs_prog_data->simd_size = size; 8191b8e80941Smrg unsigned group_size = cs_prog_data->local_size[0] * 8192b8e80941Smrg cs_prog_data->local_size[1] * cs_prog_data->local_size[2]; 8193b8e80941Smrg cs_prog_data->threads = (group_size + size - 1) / size; 8194b8e80941Smrg} 8195b8e80941Smrg 8196b8e80941Smrgstatic nir_shader * 8197b8e80941Smrgcompile_cs_to_nir(const struct brw_compiler *compiler, 8198b8e80941Smrg void *mem_ctx, 8199b8e80941Smrg const struct brw_cs_prog_key *key, 8200b8e80941Smrg const nir_shader *src_shader, 8201b8e80941Smrg unsigned dispatch_width) 8202b8e80941Smrg{ 8203b8e80941Smrg nir_shader *shader = nir_shader_clone(mem_ctx, src_shader); 8204b8e80941Smrg shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true); 8205b8e80941Smrg 8206b8e80941Smrg NIR_PASS_V(shader, brw_nir_lower_cs_intrinsics, dispatch_width); 8207b8e80941Smrg 8208b8e80941Smrg /* Clean up after the local index and ID calculations. 
*/ 8209b8e80941Smrg NIR_PASS_V(shader, nir_opt_constant_folding); 8210b8e80941Smrg NIR_PASS_V(shader, nir_opt_dce); 8211b8e80941Smrg 8212b8e80941Smrg return brw_postprocess_nir(shader, compiler, true); 8213b8e80941Smrg} 8214b8e80941Smrg 8215b8e80941Smrgconst unsigned * 8216b8e80941Smrgbrw_compile_cs(const struct brw_compiler *compiler, void *log_data, 8217b8e80941Smrg void *mem_ctx, 8218b8e80941Smrg const struct brw_cs_prog_key *key, 8219b8e80941Smrg struct brw_cs_prog_data *prog_data, 8220b8e80941Smrg const nir_shader *src_shader, 8221b8e80941Smrg int shader_time_index, 8222b8e80941Smrg char **error_str) 8223b8e80941Smrg{ 8224b8e80941Smrg prog_data->local_size[0] = src_shader->info.cs.local_size[0]; 8225b8e80941Smrg prog_data->local_size[1] = src_shader->info.cs.local_size[1]; 8226b8e80941Smrg prog_data->local_size[2] = src_shader->info.cs.local_size[2]; 8227b8e80941Smrg unsigned local_workgroup_size = 8228b8e80941Smrg src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] * 8229b8e80941Smrg src_shader->info.cs.local_size[2]; 8230b8e80941Smrg 8231b8e80941Smrg unsigned min_dispatch_width = 8232b8e80941Smrg DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads); 8233b8e80941Smrg min_dispatch_width = MAX2(8, min_dispatch_width); 8234b8e80941Smrg min_dispatch_width = util_next_power_of_two(min_dispatch_width); 8235b8e80941Smrg assert(min_dispatch_width <= 32); 8236b8e80941Smrg 8237b8e80941Smrg fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL; 8238b8e80941Smrg cfg_t *cfg = NULL; 8239b8e80941Smrg const char *fail_msg = NULL; 8240b8e80941Smrg unsigned promoted_constants = 0; 8241b8e80941Smrg 8242b8e80941Smrg /* Now the main event: Visit the shader IR and generate our CS IR for it. 
8243b8e80941Smrg */ 8244b8e80941Smrg if (min_dispatch_width <= 8) { 8245b8e80941Smrg nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key, 8246b8e80941Smrg src_shader, 8); 8247b8e80941Smrg v8 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base, 8248b8e80941Smrg NULL, /* Never used in core profile */ 8249b8e80941Smrg nir8, 8, shader_time_index); 8250b8e80941Smrg if (!v8->run_cs(min_dispatch_width)) { 8251b8e80941Smrg fail_msg = v8->fail_msg; 8252b8e80941Smrg } else { 8253b8e80941Smrg /* We should always be able to do SIMD32 for compute shaders */ 8254b8e80941Smrg assert(v8->max_dispatch_width >= 32); 8255b8e80941Smrg 8256b8e80941Smrg cfg = v8->cfg; 8257b8e80941Smrg cs_set_simd_size(prog_data, 8); 8258b8e80941Smrg cs_fill_push_const_info(compiler->devinfo, prog_data); 8259b8e80941Smrg promoted_constants = v8->promoted_constants; 8260b8e80941Smrg } 8261b8e80941Smrg } 8262b8e80941Smrg 8263b8e80941Smrg if (likely(!(INTEL_DEBUG & DEBUG_NO16)) && 8264b8e80941Smrg !fail_msg && min_dispatch_width <= 16) { 8265b8e80941Smrg /* Try a SIMD16 compile */ 8266b8e80941Smrg nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key, 8267b8e80941Smrg src_shader, 16); 8268b8e80941Smrg v16 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base, 8269b8e80941Smrg NULL, /* Never used in core profile */ 8270b8e80941Smrg nir16, 16, shader_time_index); 8271b8e80941Smrg if (v8) 8272b8e80941Smrg v16->import_uniforms(v8); 8273b8e80941Smrg 8274b8e80941Smrg if (!v16->run_cs(min_dispatch_width)) { 8275b8e80941Smrg compiler->shader_perf_log(log_data, 8276b8e80941Smrg "SIMD16 shader failed to compile: %s", 8277b8e80941Smrg v16->fail_msg); 8278b8e80941Smrg if (!cfg) { 8279b8e80941Smrg fail_msg = 8280b8e80941Smrg "Couldn't generate SIMD16 program and not " 8281b8e80941Smrg "enough threads for SIMD8"; 8282b8e80941Smrg } 8283b8e80941Smrg } else { 8284b8e80941Smrg /* We should always be able to do SIMD32 for compute shaders */ 8285b8e80941Smrg 
assert(v16->max_dispatch_width >= 32); 8286b8e80941Smrg 8287b8e80941Smrg cfg = v16->cfg; 8288b8e80941Smrg cs_set_simd_size(prog_data, 16); 8289b8e80941Smrg cs_fill_push_const_info(compiler->devinfo, prog_data); 8290b8e80941Smrg promoted_constants = v16->promoted_constants; 8291b8e80941Smrg } 8292b8e80941Smrg } 8293b8e80941Smrg 8294b8e80941Smrg /* We should always be able to do SIMD32 for compute shaders */ 8295b8e80941Smrg assert(!v16 || v16->max_dispatch_width >= 32); 8296b8e80941Smrg 8297b8e80941Smrg if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) { 8298b8e80941Smrg /* Try a SIMD32 compile */ 8299b8e80941Smrg nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key, 8300b8e80941Smrg src_shader, 32); 8301b8e80941Smrg v32 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base, 8302b8e80941Smrg NULL, /* Never used in core profile */ 8303b8e80941Smrg nir32, 32, shader_time_index); 8304b8e80941Smrg if (v8) 8305b8e80941Smrg v32->import_uniforms(v8); 8306b8e80941Smrg else if (v16) 8307b8e80941Smrg v32->import_uniforms(v16); 8308b8e80941Smrg 8309b8e80941Smrg if (!v32->run_cs(min_dispatch_width)) { 8310b8e80941Smrg compiler->shader_perf_log(log_data, 8311b8e80941Smrg "SIMD32 shader failed to compile: %s", 8312b8e80941Smrg v32->fail_msg); 8313b8e80941Smrg if (!cfg) { 8314b8e80941Smrg fail_msg = 8315b8e80941Smrg "Couldn't generate SIMD32 program and not " 8316b8e80941Smrg "enough threads for SIMD16"; 8317b8e80941Smrg } 8318b8e80941Smrg } else { 8319b8e80941Smrg cfg = v32->cfg; 8320b8e80941Smrg cs_set_simd_size(prog_data, 32); 8321b8e80941Smrg cs_fill_push_const_info(compiler->devinfo, prog_data); 8322b8e80941Smrg promoted_constants = v32->promoted_constants; 8323b8e80941Smrg } 8324b8e80941Smrg } 8325b8e80941Smrg 8326b8e80941Smrg const unsigned *ret = NULL; 8327b8e80941Smrg if (unlikely(cfg == NULL)) { 8328b8e80941Smrg assert(fail_msg); 8329b8e80941Smrg if (error_str) 8330b8e80941Smrg *error_str = ralloc_strdup(mem_ctx, fail_msg); 
8331b8e80941Smrg } else { 8332b8e80941Smrg fs_generator g(compiler, log_data, mem_ctx, &prog_data->base, 8333b8e80941Smrg promoted_constants, false, MESA_SHADER_COMPUTE); 8334b8e80941Smrg if (INTEL_DEBUG & DEBUG_CS) { 8335b8e80941Smrg char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s", 8336b8e80941Smrg src_shader->info.label ? 8337b8e80941Smrg src_shader->info.label : "unnamed", 8338b8e80941Smrg src_shader->info.name); 8339b8e80941Smrg g.enable_debug(name); 8340b8e80941Smrg } 8341b8e80941Smrg 8342b8e80941Smrg g.generate_code(cfg, prog_data->simd_size); 8343b8e80941Smrg 8344b8e80941Smrg ret = g.get_assembly(); 8345b8e80941Smrg } 8346b8e80941Smrg 8347b8e80941Smrg delete v8; 8348b8e80941Smrg delete v16; 8349b8e80941Smrg delete v32; 8350b8e80941Smrg 8351b8e80941Smrg return ret; 8352b8e80941Smrg} 8353b8e80941Smrg 8354b8e80941Smrg/** 8355b8e80941Smrg * Test the dispatch mask packing assumptions of 8356b8e80941Smrg * brw_stage_has_packed_dispatch(). Call this from e.g. the top of 8357b8e80941Smrg * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is 8358b8e80941Smrg * executed with an unexpected dispatch mask. 8359b8e80941Smrg */ 8360b8e80941Smrgstatic UNUSED void 8361b8e80941Smrgbrw_fs_test_dispatch_packing(const fs_builder &bld) 8362b8e80941Smrg{ 8363b8e80941Smrg const gl_shader_stage stage = bld.shader->stage; 8364b8e80941Smrg 8365b8e80941Smrg if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage, 8366b8e80941Smrg bld.shader->stage_prog_data)) { 8367b8e80941Smrg const fs_builder ubld = bld.exec_all().group(1, 0); 8368b8e80941Smrg const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0); 8369b8e80941Smrg const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? 
brw_vmask_reg() : 8370b8e80941Smrg brw_dmask_reg()); 8371b8e80941Smrg 8372b8e80941Smrg ubld.ADD(tmp, mask, brw_imm_ud(1)); 8373b8e80941Smrg ubld.AND(tmp, mask, tmp); 8374b8e80941Smrg 8375b8e80941Smrg /* This will loop forever if the dispatch mask doesn't have the expected 8376b8e80941Smrg * form '2^n-1', in which case tmp will be non-zero. 8377b8e80941Smrg */ 8378b8e80941Smrg bld.emit(BRW_OPCODE_DO); 8379b8e80941Smrg bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ); 8380b8e80941Smrg set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE)); 8381b8e80941Smrg } 8382b8e80941Smrg} 8383