1b8e80941Smrg/*
2b8e80941Smrg * Copyright © 2010 Intel Corporation
3b8e80941Smrg *
4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a
5b8e80941Smrg * copy of this software and associated documentation files (the "Software"),
6b8e80941Smrg * to deal in the Software without restriction, including without limitation
7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the
9b8e80941Smrg * Software is furnished to do so, subject to the following conditions:
10b8e80941Smrg *
11b8e80941Smrg * The above copyright notice and this permission notice (including the next
12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the
13b8e80941Smrg * Software.
14b8e80941Smrg *
15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21b8e80941Smrg * IN THE SOFTWARE.
22b8e80941Smrg */
23b8e80941Smrg
24b8e80941Smrg/** @file brw_fs.cpp
25b8e80941Smrg *
26b8e80941Smrg * This file drives the GLSL IR -> LIR translation, contains the
27b8e80941Smrg * optimizations on the LIR, and drives the generation of native code
28b8e80941Smrg * from the LIR.
29b8e80941Smrg */
30b8e80941Smrg
31b8e80941Smrg#include "main/macros.h"
32b8e80941Smrg#include "brw_eu.h"
33b8e80941Smrg#include "brw_fs.h"
34b8e80941Smrg#include "brw_nir.h"
35b8e80941Smrg#include "brw_vec4_gs_visitor.h"
36b8e80941Smrg#include "brw_cfg.h"
37b8e80941Smrg#include "brw_dead_control_flow.h"
38b8e80941Smrg#include "dev/gen_debug.h"
39b8e80941Smrg#include "compiler/glsl_types.h"
40b8e80941Smrg#include "compiler/nir/nir_builder.h"
41b8e80941Smrg#include "program/prog_parameter.h"
42b8e80941Smrg#include "util/u_math.h"
43b8e80941Smrg
44b8e80941Smrgusing namespace brw;
45b8e80941Smrg
46b8e80941Smrgstatic unsigned get_lowered_simd_width(const struct gen_device_info *devinfo,
47b8e80941Smrg                                       const fs_inst *inst);
48b8e80941Smrg
/**
 * Common initializer shared by all fs_inst constructors.
 *
 * Zeroes the whole instruction, allocates the source array (always at least
 * 3 entries, matching the MAX2(sources, 3) sizing used by the copy
 * constructor and resize_sources()), copies in the caller's sources, and
 * derives size_written from the destination file.
 */
void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              const fs_reg *src, unsigned sources)
{
   /* Start from all-zeros; the fields that matter are filled in below. */
   memset((void*)this, 0, sizeof(*this));

   this->src = new fs_reg[MAX2(sources, 3)];
   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->sources = sources;
   this->exec_size = exec_size;
   this->base_mrf = -1;   /* presumably -1 means "no MRF base assigned" — TODO confirm */

   /* Immediates and uniforms can never be written. */
   assert(dst.file != IMM && dst.file != UNIFORM);

   assert(this->exec_size != 0);

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case VGRF:
   case ARF:
   case FIXED_GRF:
   case MRF:
   case ATTR:
      this->size_written = dst.component_size(exec_size);
      break;
   case BAD_FILE:
      /* No destination: nothing is written. */
      this->size_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   }

   this->writes_accumulator = false;
}
90b8e80941Smrg
/* Default-construct a NOP.  Note that "dst" here is the instruction's own
 * member: init() memsets the whole object first, so by the time the member
 * is copied back the referenced register reads as all-zeros (presumably
 * equivalent to an unset register — TODO confirm BAD_FILE's enum value).
 */
fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}
95b8e80941Smrg
/* Construct an instruction with no destination and no sources. */
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}
100b8e80941Smrg
/* Construct an instruction with a destination but no sources. */
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
{
   init(opcode, exec_size, dst, NULL, 0);
}
105b8e80941Smrg
106b8e80941Smrgfs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
107b8e80941Smrg                 const fs_reg &src0)
108b8e80941Smrg{
109b8e80941Smrg   const fs_reg src[1] = { src0 };
110b8e80941Smrg   init(opcode, exec_size, dst, src, 1);
111b8e80941Smrg}
112b8e80941Smrg
113b8e80941Smrgfs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
114b8e80941Smrg                 const fs_reg &src0, const fs_reg &src1)
115b8e80941Smrg{
116b8e80941Smrg   const fs_reg src[2] = { src0, src1 };
117b8e80941Smrg   init(opcode, exec_size, dst, src, 2);
118b8e80941Smrg}
119b8e80941Smrg
120b8e80941Smrgfs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
121b8e80941Smrg                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
122b8e80941Smrg{
123b8e80941Smrg   const fs_reg src[3] = { src0, src1, src2 };
124b8e80941Smrg   init(opcode, exec_size, dst, src, 3);
125b8e80941Smrg}
126b8e80941Smrg
/* Construct an instruction from an arbitrary-length source array. */
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}
132b8e80941Smrg
/* Copy constructor.  Bulk-copies every field, then gives this instruction
 * its own source array so the two instructions don't share (and later
 * double-delete) storage.
 */
fs_inst::fs_inst(const fs_inst &that)
{
   memcpy((void*)this, &that, sizeof(that));

   this->src = new fs_reg[MAX2(that.sources, 3)];

   for (unsigned i = 0; i < that.sources; i++)
      this->src[i] = that.src[i];
}
142b8e80941Smrg
fs_inst::~fs_inst()
{
   /* The source array is always owned by the instruction (allocated in
    * init(), the copy constructor, or resize_sources()).
    */
   delete[] this->src;
}
147b8e80941Smrg
148b8e80941Smrgvoid
149b8e80941Smrgfs_inst::resize_sources(uint8_t num_sources)
150b8e80941Smrg{
151b8e80941Smrg   if (this->sources != num_sources) {
152b8e80941Smrg      fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
153b8e80941Smrg
154b8e80941Smrg      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
155b8e80941Smrg         src[i] = this->src[i];
156b8e80941Smrg
157b8e80941Smrg      delete[] this->src;
158b8e80941Smrg      this->src = src;
159b8e80941Smrg      this->sources = num_sources;
160b8e80941Smrg   }
161b8e80941Smrg}
162b8e80941Smrg
/**
 * Emit a pull-constant load whose offset varies per channel.
 *
 * \param dst            destination register for the shuffled result
 * \param surf_index     surface (binding table) index to read from
 * \param varying_offset per-channel byte offset into the surface
 * \param const_offset   additional compile-time-constant byte offset
 */
void
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
                                       const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable offset
    * and a portion done using fs_reg::offset, which means that if you have
    * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
    * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
    * later notice that those loads are all the same and eliminate the
    * redundant ones.
    */
   fs_reg vec4_offset = vgrf(glsl_type::uint_type);
   /* Vec4-align the constant part; the low 4 bits are handled below. */
   bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));

   /* The pull load message will load a vec4 (16 bytes). If we are loading
    * a double this means we are only loading 2 elements worth of data.
    * We also want to use a 32-bit data type for the dst of the load operation
    * so other parts of the driver don't get confused about the size of the
    * result.
    */
   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
   fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                            vec4_result, surf_index, vec4_offset);
   inst->size_written = 4 * vec4_result.component_size(inst->exec_size);

   /* Pick out the requested component of the vec4 and convert it to the
    * destination's element size.
    */
   shuffle_from_32bit_read(bld, dst, vec4_result,
                           (const_offset & 0xf) / type_sz(dst.type), 1);
}
198b8e80941Smrg
199b8e80941Smrg/**
200b8e80941Smrg * A helper for MOV generation for fixing up broken hardware SEND dependency
201b8e80941Smrg * handling.
202b8e80941Smrg */
203b8e80941Smrgvoid
204b8e80941Smrgfs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
205b8e80941Smrg{
206b8e80941Smrg   /* The caller always wants uncompressed to emit the minimal extra
207b8e80941Smrg    * dependencies, and to avoid having to deal with aligning its regs to 2.
208b8e80941Smrg    */
209b8e80941Smrg   const fs_builder ubld = bld.annotate("send dependency resolve")
210b8e80941Smrg                              .half(0);
211b8e80941Smrg
212b8e80941Smrg   ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
213b8e80941Smrg}
214b8e80941Smrg
215b8e80941Smrgbool
216b8e80941Smrgfs_inst::is_send_from_grf() const
217b8e80941Smrg{
218b8e80941Smrg   switch (opcode) {
219b8e80941Smrg   case SHADER_OPCODE_SEND:
220b8e80941Smrg   case SHADER_OPCODE_SHADER_TIME_ADD:
221b8e80941Smrg   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
222b8e80941Smrg   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
223b8e80941Smrg   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
224b8e80941Smrg   case SHADER_OPCODE_URB_WRITE_SIMD8:
225b8e80941Smrg   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
226b8e80941Smrg   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
227b8e80941Smrg   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
228b8e80941Smrg   case SHADER_OPCODE_URB_READ_SIMD8:
229b8e80941Smrg   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
230b8e80941Smrg      return true;
231b8e80941Smrg   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
232b8e80941Smrg      return src[1].file == VGRF;
233b8e80941Smrg   case FS_OPCODE_FB_WRITE:
234b8e80941Smrg   case FS_OPCODE_FB_READ:
235b8e80941Smrg      return src[0].file == VGRF;
236b8e80941Smrg   default:
237b8e80941Smrg      if (is_tex())
238b8e80941Smrg         return src[0].file == VGRF;
239b8e80941Smrg
240b8e80941Smrg      return false;
241b8e80941Smrg   }
242b8e80941Smrg}
243b8e80941Smrg
/**
 * Report whether source \p arg of this instruction is consumed as control
 * data (presumably a surface/sampler index or message descriptor rather
 * than a per-channel payload — TODO confirm against the consumers).
 */
bool
fs_inst::is_control_source(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
      return arg == 0;

   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_GET_BUFFER_SIZE:
      return arg == 1;

   /* Texturing opcodes: both src[1] and src[2] are control operands. */
   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_LZ:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXL_LZ:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_SAMPLEINFO:
      return arg == 1 || arg == 2;

   /* SEND: src[0] and src[1] are the message descriptors. */
   case SHADER_OPCODE_SEND:
      return arg == 0 || arg == 1;

   default:
      return false;
   }
}
289b8e80941Smrg
/**
 * Returns true if this instruction's sources and destinations cannot
 * safely be the same register.
 *
 * In most cases, a register can be written over safely by the same
 * instruction that is its last use.  For a single instruction, the
 * sources are dereferenced before writing of the destination starts
 * (naturally).
 *
 * However, there are a few cases where this can be problematic:
 *
 * - Virtual opcodes that translate to multiple instructions in the
 *   code generator: if src == dst and one instruction writes the
 *   destination before a later instruction reads the source, then
 *   src will have been clobbered.
 *
 * - SIMD16 compressed instructions with certain regioning (see below).
 *
 * The register allocator uses this information to set up conflicts between
 * GRF sources and the destination.
 */
bool
fs_inst::has_source_and_destination_hazard() const
{
   switch (opcode) {
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
      /* Multiple partial writes to the destination */
      return true;
   case SHADER_OPCODE_SHUFFLE:
      /* This instruction returns an arbitrary channel from the source and
       * gets split into smaller instructions in the generator.  It's possible
       * that one of the instructions will read from a channel corresponding
       * to an earlier instruction.
       */
      /* fallthrough — shares the "return true" below with SEL_EXEC. */
   case SHADER_OPCODE_SEL_EXEC:
      /* This is implemented as
       *
       * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
       * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
       *
       * Because the source is only read in the second instruction, the first
       * may stomp all over it.
       */
      return true;
   case SHADER_OPCODE_QUAD_SWIZZLE:
      /* src[1] holds the swizzle immediate. */
      switch (src[1].ud) {
      case BRW_SWIZZLE_XXXX:
      case BRW_SWIZZLE_YYYY:
      case BRW_SWIZZLE_ZZZZ:
      case BRW_SWIZZLE_WWWW:
      case BRW_SWIZZLE_XXZZ:
      case BRW_SWIZZLE_YYWW:
      case BRW_SWIZZLE_XYXY:
      case BRW_SWIZZLE_ZWZW:
         /* These can be implemented as a single Align1 region on all
          * platforms, so there's never a hazard between source and
          * destination.  C.f. fs_generator::generate_quad_swizzle().
          */
         return false;
      default:
         return !is_uniform(src[0]);
      }
   default:
      /* The SIMD16 compressed instruction
       *
       * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
       *
       * is actually decoded in hardware as:
       *
       * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
       * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
       *
       * Which is safe.  However, if we have uniform accesses
       * happening, we get into trouble:
       *
       * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
       * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
       *
       * Now our destination for the first instruction overwrote the
       * second instruction's src0, and we get garbage for those 8
       * pixels.  There's a similar issue for the pre-gen6
       * pixel_x/pixel_y, which are registers of 16-bit values and thus
       * would get stomped by the first decode as well.
       */
      if (exec_size == 16) {
         for (int i = 0; i < sources; i++) {
            /* Stride-0 (uniform) or sub-dword-typed VGRF sources hit the
             * hazard described above.
             */
            if (src[i].file == VGRF && (src[i].stride == 0 ||
                                        src[i].type == BRW_REGISTER_TYPE_UW ||
                                        src[i].type == BRW_REGISTER_TYPE_W ||
                                        src[i].type == BRW_REGISTER_TYPE_UB ||
                                        src[i].type == BRW_REGISTER_TYPE_B)) {
               return true;
            }
         }
      }
      return false;
   }
}
388b8e80941Smrg
/**
 * Report whether this LOAD_PAYLOAD simply copies one contiguous VGRF
 * allocation: src[0] must start at offset 0 with stride 1, the allocation
 * must exactly match size_written, and every further source must follow
 * the previous one contiguously.
 */
bool
fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
{
   if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
      return false;

   /* "reg" acts as a cursor walking through the expected source layout. */
   fs_reg reg = this->src[0];
   if (reg.file != VGRF || reg.offset != 0 || reg.stride != 1)
      return false;

   /* The whole allocation must be consumed by this payload. */
   if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written)
      return false;

   for (int i = 0; i < this->sources; i++) {
      reg.type = this->src[i].type;
      if (!this->src[i].equals(reg))
         return false;

      if (i < this->header_size) {
         /* Header sources advance a full register at a time. */
         reg.offset += REG_SIZE;
      } else {
         /* Data sources advance by one full execution-width row. */
         reg = horiz_offset(reg, this->exec_size);
      }
   }

   return true;
}
416b8e80941Smrg
417b8e80941Smrgbool
418b8e80941Smrgfs_inst::can_do_source_mods(const struct gen_device_info *devinfo) const
419b8e80941Smrg{
420b8e80941Smrg   if (devinfo->gen == 6 && is_math())
421b8e80941Smrg      return false;
422b8e80941Smrg
423b8e80941Smrg   if (is_send_from_grf())
424b8e80941Smrg      return false;
425b8e80941Smrg
426b8e80941Smrg   if (!backend_instruction::can_do_source_mods())
427b8e80941Smrg      return false;
428b8e80941Smrg
429b8e80941Smrg   return true;
430b8e80941Smrg}
431b8e80941Smrg
432b8e80941Smrgbool
433b8e80941Smrgfs_inst::can_do_cmod()
434b8e80941Smrg{
435b8e80941Smrg   if (!backend_instruction::can_do_cmod())
436b8e80941Smrg      return false;
437b8e80941Smrg
438b8e80941Smrg   /* The accumulator result appears to get used for the conditional modifier
439b8e80941Smrg    * generation.  When negating a UD value, there is a 33rd bit generated for
440b8e80941Smrg    * the sign in the accumulator value, so now you can't check, for example,
441b8e80941Smrg    * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
442b8e80941Smrg    */
443b8e80941Smrg   for (unsigned i = 0; i < sources; i++) {
444b8e80941Smrg      if (type_is_unsigned_int(src[i].type) && src[i].negate)
445b8e80941Smrg         return false;
446b8e80941Smrg   }
447b8e80941Smrg
448b8e80941Smrg   return true;
449b8e80941Smrg}
450b8e80941Smrg
451b8e80941Smrgbool
452b8e80941Smrgfs_inst::can_change_types() const
453b8e80941Smrg{
454b8e80941Smrg   return dst.type == src[0].type &&
455b8e80941Smrg          !src[0].abs && !src[0].negate && !saturate &&
456b8e80941Smrg          (opcode == BRW_OPCODE_MOV ||
457b8e80941Smrg           (opcode == BRW_OPCODE_SEL &&
458b8e80941Smrg            dst.type == src[1].type &&
459b8e80941Smrg            predicate != BRW_PREDICATE_NONE &&
460b8e80941Smrg            !src[1].abs && !src[1].negate));
461b8e80941Smrg}
462b8e80941Smrg
void
fs_reg::init()
{
   /* Zero everything, then establish the defaults: UD type, unit stride. */
   memset((void*)this, 0, sizeof(*this));
   type = BRW_REGISTER_TYPE_UD;
   stride = 1;
}
470b8e80941Smrg
/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}
477b8e80941Smrg
478b8e80941Smrgfs_reg::fs_reg(struct ::brw_reg reg) :
479b8e80941Smrg   backend_reg(reg)
480b8e80941Smrg{
481b8e80941Smrg   this->offset = 0;
482b8e80941Smrg   this->stride = 1;
483b8e80941Smrg   if (this->file == IMM &&
484b8e80941Smrg       (this->type != BRW_REGISTER_TYPE_V &&
485b8e80941Smrg        this->type != BRW_REGISTER_TYPE_UV &&
486b8e80941Smrg        this->type != BRW_REGISTER_TYPE_VF)) {
487b8e80941Smrg      this->stride = 0;
488b8e80941Smrg   }
489b8e80941Smrg}
490b8e80941Smrg
491b8e80941Smrgbool
492b8e80941Smrgfs_reg::equals(const fs_reg &r) const
493b8e80941Smrg{
494b8e80941Smrg   return (this->backend_reg::equals(r) &&
495b8e80941Smrg           stride == r.stride);
496b8e80941Smrg}
497b8e80941Smrg
498b8e80941Smrgbool
499b8e80941Smrgfs_reg::negative_equals(const fs_reg &r) const
500b8e80941Smrg{
501b8e80941Smrg   return (this->backend_reg::negative_equals(r) &&
502b8e80941Smrg           stride == r.stride);
503b8e80941Smrg}
504b8e80941Smrg
/** A region is contiguous when consecutive channels are adjacent (stride 1). */
bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}
510b8e80941Smrg
/**
 * Size in bytes of one logical component spanning \p width channels.
 *
 * For ARF/FIXED_GRF registers the hardware hstride encoding is decoded
 * (0 means stride 0, otherwise 2^(hstride-1)); other files use the
 * software stride field directly.  The MAX2(..., 1) keeps stride-0
 * (scalar) regions counting as one element.
 */
unsigned
fs_reg::component_size(unsigned width) const
{
   const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
                            hstride == 0 ? 0 :
                            1 << (hstride - 1));
   return MAX2(width * stride, 1) * type_sz(type);
}
519b8e80941Smrg
/**
 * Returns the number of scalar slots occupied by a GLSL type.
 *
 * 16-bit and 8-bit types are packed (2 resp. 4 per slot); 64-bit types
 * take two slots per component.  \p bindless controls whether samplers
 * and images occupy storage (as 64-bit handles) or none at all.
 */
extern "C" int
type_size_scalar(const struct glsl_type *type, bool bindless)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_UINT16:
   case GLSL_TYPE_INT16:
   case GLSL_TYPE_FLOAT16:
      /* Two 16-bit values pack into one 32-bit slot. */
      return DIV_ROUND_UP(type->components(), 2);
   case GLSL_TYPE_UINT8:
   case GLSL_TYPE_INT8:
      /* Four 8-bit values pack into one 32-bit slot. */
      return DIV_ROUND_UP(type->components(), 4);
   case GLSL_TYPE_DOUBLE:
   case GLSL_TYPE_UINT64:
   case GLSL_TYPE_INT64:
      return type->components() * 2;
   case GLSL_TYPE_ARRAY:
      return type_size_scalar(type->fields.array, bindless) * type->length;
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_INTERFACE:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size_scalar(type->fields.structure[i].type, bindless);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
   case GLSL_TYPE_IMAGE:
      if (bindless)
         return type->components() * 2;
      /* fallthrough: non-bindless samplers/images take no space. */
   case GLSL_TYPE_ATOMIC_UINT:
      /* Samplers, atomics, and images take up no register space, since
       * they're baked in at link time.
       */
      return 0;
   case GLSL_TYPE_SUBROUTINE:
      return 1;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_FUNCTION:
      unreachable("not reached");
   }

   return 0;
}
570b8e80941Smrg
/**
 * Create a MOV to read the timestamp register.
 *
 * The caller is responsible for emitting the MOV.  The return value is
 * the destination of the MOV, with extra parameters set.
 */
fs_reg
fs_visitor::get_timestamp(const fs_builder &bld)
{
   assert(devinfo->gen >= 7);

   /* Read the architectural timestamp register as unsigned dwords. */
   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);

   /* We want to read the 3 fields we care about even if it's not enabled in
    * the dispatch.
    */
   bld.group(4, 0).exec_all().MOV(dst, ts);

   return dst;
}
596b8e80941Smrg
/** Record the start-of-shader timestamp for shader-time profiling. */
void
fs_visitor::emit_shader_time_begin()
{
   /* We want only the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    */
   shader_start_time = component(
      get_timestamp(bld.annotate("shader time start")), 0);
}
609b8e80941Smrg
610b8e80941Smrgvoid
611b8e80941Smrgfs_visitor::emit_shader_time_end()
612b8e80941Smrg{
613b8e80941Smrg   /* Insert our code just before the final SEND with EOT. */
614b8e80941Smrg   exec_node *end = this->instructions.get_tail();
615b8e80941Smrg   assert(end && ((fs_inst *) end)->eot);
616b8e80941Smrg   const fs_builder ibld = bld.annotate("shader time end")
617b8e80941Smrg                              .exec_all().at(NULL, end);
618b8e80941Smrg   const fs_reg timestamp = get_timestamp(ibld);
619b8e80941Smrg
620b8e80941Smrg   /* We only use the low 32 bits of the timestamp - see
621b8e80941Smrg    * emit_shader_time_begin()).
622b8e80941Smrg    *
623b8e80941Smrg    * We could also check if render P-states have changed (or anything
624b8e80941Smrg    * else that might disrupt timing) by setting smear to 2 and checking if
625b8e80941Smrg    * that field is != 0.
626b8e80941Smrg    */
627b8e80941Smrg   const fs_reg shader_end_time = component(timestamp, 0);
628b8e80941Smrg
629b8e80941Smrg   /* Check that there weren't any timestamp reset events (assuming these
630b8e80941Smrg    * were the only two timestamp reads that happened).
631b8e80941Smrg    */
632b8e80941Smrg   const fs_reg reset = component(timestamp, 2);
633b8e80941Smrg   set_condmod(BRW_CONDITIONAL_Z,
634b8e80941Smrg               ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
635b8e80941Smrg   ibld.IF(BRW_PREDICATE_NORMAL);
636b8e80941Smrg
637b8e80941Smrg   fs_reg start = shader_start_time;
638b8e80941Smrg   start.negate = true;
639b8e80941Smrg   const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
640b8e80941Smrg                                        BRW_REGISTER_TYPE_UD),
641b8e80941Smrg                                 0);
642b8e80941Smrg   const fs_builder cbld = ibld.group(1, 0);
643b8e80941Smrg   cbld.group(1, 0).ADD(diff, start, shader_end_time);
644b8e80941Smrg
645b8e80941Smrg   /* If there were no instructions between the two timestamp gets, the diff
646b8e80941Smrg    * is 2 cycles.  Remove that overhead, so I can forget about that when
647b8e80941Smrg    * trying to determine the time taken for single instructions.
648b8e80941Smrg    */
649b8e80941Smrg   cbld.ADD(diff, diff, brw_imm_ud(-2u));
650b8e80941Smrg   SHADER_TIME_ADD(cbld, 0, diff);
651b8e80941Smrg   SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
652b8e80941Smrg   ibld.emit(BRW_OPCODE_ELSE);
653b8e80941Smrg   SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
654b8e80941Smrg   ibld.emit(BRW_OPCODE_ENDIF);
655b8e80941Smrg}
656b8e80941Smrg
/**
 * Emit a SHADER_TIME_ADD accumulating \p value into the shader-time
 * buffer slot (shader_time_index * 3 + shader_time_subindex).
 */
void
fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
                            int shader_time_subindex,
                            fs_reg value)
{
   int index = shader_time_index * 3 + shader_time_subindex;
   struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE);

   /* The payload footprint depends on dispatch width. */
   fs_reg payload;
   if (dispatch_width == 8)
      payload = vgrf(glsl_type::uvec2_type);
   else
      payload = vgrf(glsl_type::uint_type);

   bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
}
673b8e80941Smrg
674b8e80941Smrgvoid
675b8e80941Smrgfs_visitor::vfail(const char *format, va_list va)
676b8e80941Smrg{
677b8e80941Smrg   char *msg;
678b8e80941Smrg
679b8e80941Smrg   if (failed)
680b8e80941Smrg      return;
681b8e80941Smrg
682b8e80941Smrg   failed = true;
683b8e80941Smrg
684b8e80941Smrg   msg = ralloc_vasprintf(mem_ctx, format, va);
685b8e80941Smrg   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
686b8e80941Smrg
687b8e80941Smrg   this->fail_msg = msg;
688b8e80941Smrg
689b8e80941Smrg   if (debug_enabled) {
690b8e80941Smrg      fprintf(stderr, "%s",  msg);
691b8e80941Smrg   }
692b8e80941Smrg}
693b8e80941Smrg
/** Record a compile failure (printf-style variadic form); see vfail(). */
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}
703b8e80941Smrg
704b8e80941Smrg/**
705b8e80941Smrg * Mark this program as impossible to compile with dispatch width greater
706b8e80941Smrg * than n.
707b8e80941Smrg *
708b8e80941Smrg * During the SIMD8 compile (which happens first), we can detect and flag
709b8e80941Smrg * things that are unsupported in SIMD16+ mode, so the compiler can skip the
710b8e80941Smrg * SIMD16+ compile altogether.
711b8e80941Smrg *
712b8e80941Smrg * During a compile of dispatch width greater than n (if one happens anyway),
713b8e80941Smrg * this just calls fail().
714b8e80941Smrg */
715b8e80941Smrgvoid
716b8e80941Smrgfs_visitor::limit_dispatch_width(unsigned n, const char *msg)
717b8e80941Smrg{
718b8e80941Smrg   if (dispatch_width > n) {
719b8e80941Smrg      fail("%s", msg);
720b8e80941Smrg   } else {
721b8e80941Smrg      max_dispatch_width = n;
722b8e80941Smrg      compiler->shader_perf_log(log_data,
723b8e80941Smrg                                "Shader dispatch width limited to SIMD%d: %s",
724b8e80941Smrg                                n, msg);
725b8e80941Smrg   }
726b8e80941Smrg}
727b8e80941Smrg
728b8e80941Smrg/**
729b8e80941Smrg * Returns true if the instruction has a flag that means it won't
730b8e80941Smrg * update an entire destination register.
731b8e80941Smrg *
732b8e80941Smrg * For example, dead code elimination and live variable analysis want to know
733b8e80941Smrg * when a write to a variable screens off any preceding values that were in
734b8e80941Smrg * it.
735b8e80941Smrg */
736b8e80941Smrgbool
737b8e80941Smrgfs_inst::is_partial_write() const
738b8e80941Smrg{
739b8e80941Smrg   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
740b8e80941Smrg           (this->exec_size * type_sz(this->dst.type)) < 32 ||
741b8e80941Smrg           !this->dst.is_contiguous() ||
742b8e80941Smrg           this->dst.offset % REG_SIZE != 0);
743b8e80941Smrg}
744b8e80941Smrg
745b8e80941Smrgunsigned
746b8e80941Smrgfs_inst::components_read(unsigned i) const
747b8e80941Smrg{
748b8e80941Smrg   /* Return zero if the source is not present. */
749b8e80941Smrg   if (src[i].file == BAD_FILE)
750b8e80941Smrg      return 0;
751b8e80941Smrg
752b8e80941Smrg   switch (opcode) {
753b8e80941Smrg   case FS_OPCODE_LINTERP:
754b8e80941Smrg      if (i == 0)
755b8e80941Smrg         return 2;
756b8e80941Smrg      else
757b8e80941Smrg         return 1;
758b8e80941Smrg
759b8e80941Smrg   case FS_OPCODE_PIXEL_X:
760b8e80941Smrg   case FS_OPCODE_PIXEL_Y:
761b8e80941Smrg      assert(i == 0);
762b8e80941Smrg      return 2;
763b8e80941Smrg
764b8e80941Smrg   case FS_OPCODE_FB_WRITE_LOGICAL:
765b8e80941Smrg      assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
766b8e80941Smrg      /* First/second FB write color. */
767b8e80941Smrg      if (i < 2)
768b8e80941Smrg         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
769b8e80941Smrg      else
770b8e80941Smrg         return 1;
771b8e80941Smrg
772b8e80941Smrg   case SHADER_OPCODE_TEX_LOGICAL:
773b8e80941Smrg   case SHADER_OPCODE_TXD_LOGICAL:
774b8e80941Smrg   case SHADER_OPCODE_TXF_LOGICAL:
775b8e80941Smrg   case SHADER_OPCODE_TXL_LOGICAL:
776b8e80941Smrg   case SHADER_OPCODE_TXS_LOGICAL:
777b8e80941Smrg   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
778b8e80941Smrg   case FS_OPCODE_TXB_LOGICAL:
779b8e80941Smrg   case SHADER_OPCODE_TXF_CMS_LOGICAL:
780b8e80941Smrg   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
781b8e80941Smrg   case SHADER_OPCODE_TXF_UMS_LOGICAL:
782b8e80941Smrg   case SHADER_OPCODE_TXF_MCS_LOGICAL:
783b8e80941Smrg   case SHADER_OPCODE_LOD_LOGICAL:
784b8e80941Smrg   case SHADER_OPCODE_TG4_LOGICAL:
785b8e80941Smrg   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
786b8e80941Smrg   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
787b8e80941Smrg      assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
788b8e80941Smrg             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
789b8e80941Smrg      /* Texture coordinates. */
790b8e80941Smrg      if (i == TEX_LOGICAL_SRC_COORDINATE)
791b8e80941Smrg         return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
792b8e80941Smrg      /* Texture derivatives. */
793b8e80941Smrg      else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
794b8e80941Smrg               opcode == SHADER_OPCODE_TXD_LOGICAL)
795b8e80941Smrg         return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
796b8e80941Smrg      /* Texture offset. */
797b8e80941Smrg      else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
798b8e80941Smrg         return 2;
799b8e80941Smrg      /* MCS */
800b8e80941Smrg      else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
801b8e80941Smrg         return 2;
802b8e80941Smrg      else
803b8e80941Smrg         return 1;
804b8e80941Smrg
805b8e80941Smrg   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
806b8e80941Smrg   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
807b8e80941Smrg      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
808b8e80941Smrg      /* Surface coordinates. */
809b8e80941Smrg      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
810b8e80941Smrg         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
811b8e80941Smrg      /* Surface operation source (ignored for reads). */
812b8e80941Smrg      else if (i == SURFACE_LOGICAL_SRC_DATA)
813b8e80941Smrg         return 0;
814b8e80941Smrg      else
815b8e80941Smrg         return 1;
816b8e80941Smrg
817b8e80941Smrg   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
818b8e80941Smrg   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
819b8e80941Smrg      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
820b8e80941Smrg             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
821b8e80941Smrg      /* Surface coordinates. */
822b8e80941Smrg      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
823b8e80941Smrg         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
824b8e80941Smrg      /* Surface operation source. */
825b8e80941Smrg      else if (i == SURFACE_LOGICAL_SRC_DATA)
826b8e80941Smrg         return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
827b8e80941Smrg      else
828b8e80941Smrg         return 1;
829b8e80941Smrg
830b8e80941Smrg   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
831b8e80941Smrg      assert(src[2].file == IMM);
832b8e80941Smrg      return 1;
833b8e80941Smrg
834b8e80941Smrg   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
835b8e80941Smrg      assert(src[2].file == IMM);
836b8e80941Smrg      return i == 1 ? src[2].ud : 1;
837b8e80941Smrg
838b8e80941Smrg   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
839b8e80941Smrg   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
840b8e80941Smrg      assert(src[2].file == IMM);
841b8e80941Smrg      if (i == 1) {
842b8e80941Smrg         /* Data source */
843b8e80941Smrg         const unsigned op = src[2].ud;
844b8e80941Smrg         switch (op) {
845b8e80941Smrg         case BRW_AOP_INC:
846b8e80941Smrg         case BRW_AOP_DEC:
847b8e80941Smrg         case BRW_AOP_PREDEC:
848b8e80941Smrg            return 0;
849b8e80941Smrg         case BRW_AOP_CMPWR:
850b8e80941Smrg            return 2;
851b8e80941Smrg         default:
852b8e80941Smrg            return 1;
853b8e80941Smrg         }
854b8e80941Smrg      } else {
855b8e80941Smrg         return 1;
856b8e80941Smrg      }
857b8e80941Smrg
858b8e80941Smrg   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL:
859b8e80941Smrg      assert(src[2].file == IMM);
860b8e80941Smrg      if (i == 1) {
861b8e80941Smrg         /* Data source */
862b8e80941Smrg         const unsigned op = src[2].ud;
863b8e80941Smrg         return op == BRW_AOP_FCMPWR ? 2 : 1;
864b8e80941Smrg      } else {
865b8e80941Smrg         return 1;
866b8e80941Smrg      }
867b8e80941Smrg
868b8e80941Smrg   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
869b8e80941Smrg      /* Scattered logical opcodes use the following params:
870b8e80941Smrg       * src[0] Surface coordinates
871b8e80941Smrg       * src[1] Surface operation source (ignored for reads)
872b8e80941Smrg       * src[2] Surface
873b8e80941Smrg       * src[3] IMM with always 1 dimension.
874b8e80941Smrg       * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32
875b8e80941Smrg       */
876b8e80941Smrg      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
877b8e80941Smrg             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
878b8e80941Smrg      return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;
879b8e80941Smrg
880b8e80941Smrg   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
881b8e80941Smrg      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
882b8e80941Smrg             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
883b8e80941Smrg      return 1;
884b8e80941Smrg
885b8e80941Smrg   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
886b8e80941Smrg   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
887b8e80941Smrg      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
888b8e80941Smrg             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
889b8e80941Smrg      const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
890b8e80941Smrg      /* Surface coordinates. */
891b8e80941Smrg      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
892b8e80941Smrg         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
893b8e80941Smrg      /* Surface operation source. */
894b8e80941Smrg      else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_CMPWR)
895b8e80941Smrg         return 2;
896b8e80941Smrg      else if (i == SURFACE_LOGICAL_SRC_DATA &&
897b8e80941Smrg               (op == BRW_AOP_INC || op == BRW_AOP_DEC || op == BRW_AOP_PREDEC))
898b8e80941Smrg         return 0;
899b8e80941Smrg      else
900b8e80941Smrg         return 1;
901b8e80941Smrg   }
902b8e80941Smrg   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
903b8e80941Smrg      return (i == 0 ? 2 : 1);
904b8e80941Smrg
905b8e80941Smrg   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
906b8e80941Smrg      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
907b8e80941Smrg             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
908b8e80941Smrg      const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
909b8e80941Smrg      /* Surface coordinates. */
910b8e80941Smrg      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
911b8e80941Smrg         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
912b8e80941Smrg      /* Surface operation source. */
913b8e80941Smrg      else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_FCMPWR)
914b8e80941Smrg         return 2;
915b8e80941Smrg      else
916b8e80941Smrg         return 1;
917b8e80941Smrg   }
918b8e80941Smrg
919b8e80941Smrg   default:
920b8e80941Smrg      return 1;
921b8e80941Smrg   }
922b8e80941Smrg}
923b8e80941Smrg
924b8e80941Smrgunsigned
925b8e80941Smrgfs_inst::size_read(int arg) const
926b8e80941Smrg{
927b8e80941Smrg   switch (opcode) {
928b8e80941Smrg   case SHADER_OPCODE_SEND:
929b8e80941Smrg      if (arg == 2) {
930b8e80941Smrg         return mlen * REG_SIZE;
931b8e80941Smrg      } else if (arg == 3) {
932b8e80941Smrg         return ex_mlen * REG_SIZE;
933b8e80941Smrg      }
934b8e80941Smrg      break;
935b8e80941Smrg
936b8e80941Smrg   case FS_OPCODE_FB_WRITE:
937b8e80941Smrg   case FS_OPCODE_REP_FB_WRITE:
938b8e80941Smrg      if (arg == 0) {
939b8e80941Smrg         if (base_mrf >= 0)
940b8e80941Smrg            return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
941b8e80941Smrg         else
942b8e80941Smrg            return mlen * REG_SIZE;
943b8e80941Smrg      }
944b8e80941Smrg      break;
945b8e80941Smrg
946b8e80941Smrg   case FS_OPCODE_FB_READ:
947b8e80941Smrg   case SHADER_OPCODE_URB_WRITE_SIMD8:
948b8e80941Smrg   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
949b8e80941Smrg   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
950b8e80941Smrg   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
951b8e80941Smrg   case SHADER_OPCODE_URB_READ_SIMD8:
952b8e80941Smrg   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
953b8e80941Smrg   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
954b8e80941Smrg   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
955b8e80941Smrg      if (arg == 0)
956b8e80941Smrg         return mlen * REG_SIZE;
957b8e80941Smrg      break;
958b8e80941Smrg
959b8e80941Smrg   case FS_OPCODE_SET_SAMPLE_ID:
960b8e80941Smrg      if (arg == 1)
961b8e80941Smrg         return 1;
962b8e80941Smrg      break;
963b8e80941Smrg
964b8e80941Smrg   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
965b8e80941Smrg      /* The payload is actually stored in src1 */
966b8e80941Smrg      if (arg == 1)
967b8e80941Smrg         return mlen * REG_SIZE;
968b8e80941Smrg      break;
969b8e80941Smrg
970b8e80941Smrg   case FS_OPCODE_LINTERP:
971b8e80941Smrg      if (arg == 1)
972b8e80941Smrg         return 16;
973b8e80941Smrg      break;
974b8e80941Smrg
975b8e80941Smrg   case SHADER_OPCODE_LOAD_PAYLOAD:
976b8e80941Smrg      if (arg < this->header_size)
977b8e80941Smrg         return REG_SIZE;
978b8e80941Smrg      break;
979b8e80941Smrg
980b8e80941Smrg   case CS_OPCODE_CS_TERMINATE:
981b8e80941Smrg   case SHADER_OPCODE_BARRIER:
982b8e80941Smrg      return REG_SIZE;
983b8e80941Smrg
984b8e80941Smrg   case SHADER_OPCODE_MOV_INDIRECT:
985b8e80941Smrg      if (arg == 0) {
986b8e80941Smrg         assert(src[2].file == IMM);
987b8e80941Smrg         return src[2].ud;
988b8e80941Smrg      }
989b8e80941Smrg      break;
990b8e80941Smrg
991b8e80941Smrg   default:
992b8e80941Smrg      if (is_tex() && arg == 0 && src[0].file == VGRF)
993b8e80941Smrg         return mlen * REG_SIZE;
994b8e80941Smrg      break;
995b8e80941Smrg   }
996b8e80941Smrg
997b8e80941Smrg   switch (src[arg].file) {
998b8e80941Smrg   case UNIFORM:
999b8e80941Smrg   case IMM:
1000b8e80941Smrg      return components_read(arg) * type_sz(src[arg].type);
1001b8e80941Smrg   case BAD_FILE:
1002b8e80941Smrg   case ARF:
1003b8e80941Smrg   case FIXED_GRF:
1004b8e80941Smrg   case VGRF:
1005b8e80941Smrg   case ATTR:
1006b8e80941Smrg      return components_read(arg) * src[arg].component_size(exec_size);
1007b8e80941Smrg   case MRF:
1008b8e80941Smrg      unreachable("MRF registers are not allowed as sources");
1009b8e80941Smrg   }
1010b8e80941Smrg   return 0;
1011b8e80941Smrg}
1012b8e80941Smrg
1013b8e80941Smrgnamespace {
1014b8e80941Smrg   /* Return the subset of flag registers that an instruction could
1015b8e80941Smrg    * potentially read or write based on the execution controls and flag
1016b8e80941Smrg    * subregister number of the instruction.
1017b8e80941Smrg    */
1018b8e80941Smrg   unsigned
1019b8e80941Smrg   flag_mask(const fs_inst *inst)
1020b8e80941Smrg   {
1021b8e80941Smrg      const unsigned start = inst->flag_subreg * 16 + inst->group;
1022b8e80941Smrg      const unsigned end = start + inst->exec_size;
1023b8e80941Smrg      return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
1024b8e80941Smrg   }
1025b8e80941Smrg
1026b8e80941Smrg   unsigned
1027b8e80941Smrg   bit_mask(unsigned n)
1028b8e80941Smrg   {
1029b8e80941Smrg      return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
1030b8e80941Smrg   }
1031b8e80941Smrg
1032b8e80941Smrg   unsigned
1033b8e80941Smrg   flag_mask(const fs_reg &r, unsigned sz)
1034b8e80941Smrg   {
1035b8e80941Smrg      if (r.file == ARF) {
1036b8e80941Smrg         const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
1037b8e80941Smrg         const unsigned end = start + sz;
1038b8e80941Smrg         return bit_mask(end) & ~bit_mask(start);
1039b8e80941Smrg      } else {
1040b8e80941Smrg         return 0;
1041b8e80941Smrg      }
1042b8e80941Smrg   }
1043b8e80941Smrg}
1044b8e80941Smrg
1045b8e80941Smrgunsigned
1046b8e80941Smrgfs_inst::flags_read(const gen_device_info *devinfo) const
1047b8e80941Smrg{
1048b8e80941Smrg   if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
1049b8e80941Smrg       predicate == BRW_PREDICATE_ALIGN1_ALLV) {
1050b8e80941Smrg      /* The vertical predication modes combine corresponding bits from
1051b8e80941Smrg       * f0.0 and f1.0 on Gen7+, and f0.0 and f0.1 on older hardware.
1052b8e80941Smrg       */
1053b8e80941Smrg      const unsigned shift = devinfo->gen >= 7 ? 4 : 2;
1054b8e80941Smrg      return flag_mask(this) << shift | flag_mask(this);
1055b8e80941Smrg   } else if (predicate) {
1056b8e80941Smrg      return flag_mask(this);
1057b8e80941Smrg   } else {
1058b8e80941Smrg      unsigned mask = 0;
1059b8e80941Smrg      for (int i = 0; i < sources; i++) {
1060b8e80941Smrg         mask |= flag_mask(src[i], size_read(i));
1061b8e80941Smrg      }
1062b8e80941Smrg      return mask;
1063b8e80941Smrg   }
1064b8e80941Smrg}
1065b8e80941Smrg
1066b8e80941Smrgunsigned
1067b8e80941Smrgfs_inst::flags_written() const
1068b8e80941Smrg{
1069b8e80941Smrg   if ((conditional_mod && (opcode != BRW_OPCODE_SEL &&
1070b8e80941Smrg                            opcode != BRW_OPCODE_CSEL &&
1071b8e80941Smrg                            opcode != BRW_OPCODE_IF &&
1072b8e80941Smrg                            opcode != BRW_OPCODE_WHILE)) ||
1073b8e80941Smrg       opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1074b8e80941Smrg       opcode == FS_OPCODE_FB_WRITE) {
1075b8e80941Smrg      return flag_mask(this);
1076b8e80941Smrg   } else {
1077b8e80941Smrg      return flag_mask(dst, size_written);
1078b8e80941Smrg   }
1079b8e80941Smrg}
1080b8e80941Smrg
1081b8e80941Smrg/**
1082b8e80941Smrg * Returns how many MRFs an FS opcode will write over.
1083b8e80941Smrg *
1084b8e80941Smrg * Note that this is not the 0 or 1 implied writes in an actual gen
1085b8e80941Smrg * instruction -- the FS opcodes often generate MOVs in addition.
1086b8e80941Smrg */
1087b8e80941Smrgint
1088b8e80941Smrgfs_visitor::implied_mrf_writes(fs_inst *inst) const
1089b8e80941Smrg{
1090b8e80941Smrg   if (inst->mlen == 0)
1091b8e80941Smrg      return 0;
1092b8e80941Smrg
1093b8e80941Smrg   if (inst->base_mrf == -1)
1094b8e80941Smrg      return 0;
1095b8e80941Smrg
1096b8e80941Smrg   switch (inst->opcode) {
1097b8e80941Smrg   case SHADER_OPCODE_RCP:
1098b8e80941Smrg   case SHADER_OPCODE_RSQ:
1099b8e80941Smrg   case SHADER_OPCODE_SQRT:
1100b8e80941Smrg   case SHADER_OPCODE_EXP2:
1101b8e80941Smrg   case SHADER_OPCODE_LOG2:
1102b8e80941Smrg   case SHADER_OPCODE_SIN:
1103b8e80941Smrg   case SHADER_OPCODE_COS:
1104b8e80941Smrg      return 1 * dispatch_width / 8;
1105b8e80941Smrg   case SHADER_OPCODE_POW:
1106b8e80941Smrg   case SHADER_OPCODE_INT_QUOTIENT:
1107b8e80941Smrg   case SHADER_OPCODE_INT_REMAINDER:
1108b8e80941Smrg      return 2 * dispatch_width / 8;
1109b8e80941Smrg   case SHADER_OPCODE_TEX:
1110b8e80941Smrg   case FS_OPCODE_TXB:
1111b8e80941Smrg   case SHADER_OPCODE_TXD:
1112b8e80941Smrg   case SHADER_OPCODE_TXF:
1113b8e80941Smrg   case SHADER_OPCODE_TXF_CMS:
1114b8e80941Smrg   case SHADER_OPCODE_TXF_MCS:
1115b8e80941Smrg   case SHADER_OPCODE_TG4:
1116b8e80941Smrg   case SHADER_OPCODE_TG4_OFFSET:
1117b8e80941Smrg   case SHADER_OPCODE_TXL:
1118b8e80941Smrg   case SHADER_OPCODE_TXS:
1119b8e80941Smrg   case SHADER_OPCODE_LOD:
1120b8e80941Smrg   case SHADER_OPCODE_SAMPLEINFO:
1121b8e80941Smrg      return 1;
1122b8e80941Smrg   case FS_OPCODE_FB_WRITE:
1123b8e80941Smrg   case FS_OPCODE_REP_FB_WRITE:
1124b8e80941Smrg      return inst->src[0].file == BAD_FILE ? 0 : 2;
1125b8e80941Smrg   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1126b8e80941Smrg   case SHADER_OPCODE_GEN4_SCRATCH_READ:
1127b8e80941Smrg      return 1;
1128b8e80941Smrg   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
1129b8e80941Smrg      return inst->mlen;
1130b8e80941Smrg   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1131b8e80941Smrg      return inst->mlen;
1132b8e80941Smrg   default:
1133b8e80941Smrg      unreachable("not reached");
1134b8e80941Smrg   }
1135b8e80941Smrg}
1136b8e80941Smrg
1137b8e80941Smrgfs_reg
1138b8e80941Smrgfs_visitor::vgrf(const glsl_type *const type)
1139b8e80941Smrg{
1140b8e80941Smrg   int reg_width = dispatch_width / 8;
1141b8e80941Smrg   return fs_reg(VGRF,
1142b8e80941Smrg                 alloc.allocate(type_size_scalar(type, false) * reg_width),
1143b8e80941Smrg                 brw_type_for_base_type(type));
1144b8e80941Smrg}
1145b8e80941Smrg
1146b8e80941Smrgfs_reg::fs_reg(enum brw_reg_file file, int nr)
1147b8e80941Smrg{
1148b8e80941Smrg   init();
1149b8e80941Smrg   this->file = file;
1150b8e80941Smrg   this->nr = nr;
1151b8e80941Smrg   this->type = BRW_REGISTER_TYPE_F;
1152b8e80941Smrg   this->stride = (file == UNIFORM ? 0 : 1);
1153b8e80941Smrg}
1154b8e80941Smrg
1155b8e80941Smrgfs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
1156b8e80941Smrg{
1157b8e80941Smrg   init();
1158b8e80941Smrg   this->file = file;
1159b8e80941Smrg   this->nr = nr;
1160b8e80941Smrg   this->type = type;
1161b8e80941Smrg   this->stride = (file == UNIFORM ? 0 : 1);
1162b8e80941Smrg}
1163b8e80941Smrg
1164b8e80941Smrg/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1165b8e80941Smrg * This brings in those uniform definitions
1166b8e80941Smrg */
1167b8e80941Smrgvoid
1168b8e80941Smrgfs_visitor::import_uniforms(fs_visitor *v)
1169b8e80941Smrg{
1170b8e80941Smrg   this->push_constant_loc = v->push_constant_loc;
1171b8e80941Smrg   this->pull_constant_loc = v->pull_constant_loc;
1172b8e80941Smrg   this->uniforms = v->uniforms;
1173b8e80941Smrg   this->subgroup_id = v->subgroup_id;
1174b8e80941Smrg}
1175b8e80941Smrg
1176b8e80941Smrgvoid
1177b8e80941Smrgfs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
1178b8e80941Smrg{
1179b8e80941Smrg   assert(stage == MESA_SHADER_FRAGMENT);
1180b8e80941Smrg
1181b8e80941Smrg   /* gl_FragCoord.x */
1182b8e80941Smrg   bld.MOV(wpos, this->pixel_x);
1183b8e80941Smrg   wpos = offset(wpos, bld, 1);
1184b8e80941Smrg
1185b8e80941Smrg   /* gl_FragCoord.y */
1186b8e80941Smrg   bld.MOV(wpos, this->pixel_y);
1187b8e80941Smrg   wpos = offset(wpos, bld, 1);
1188b8e80941Smrg
1189b8e80941Smrg   /* gl_FragCoord.z */
1190b8e80941Smrg   if (devinfo->gen >= 6) {
1191b8e80941Smrg      bld.MOV(wpos, fetch_payload_reg(bld, payload.source_depth_reg));
1192b8e80941Smrg   } else {
1193b8e80941Smrg      bld.emit(FS_OPCODE_LINTERP, wpos,
1194b8e80941Smrg               this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
1195b8e80941Smrg               component(interp_reg(VARYING_SLOT_POS, 2), 0));
1196b8e80941Smrg   }
1197b8e80941Smrg   wpos = offset(wpos, bld, 1);
1198b8e80941Smrg
1199b8e80941Smrg   /* gl_FragCoord.w: Already set up in emit_interpolation */
1200b8e80941Smrg   bld.MOV(wpos, this->wpos_w);
1201b8e80941Smrg}
1202b8e80941Smrg
1203b8e80941Smrgenum brw_barycentric_mode
1204b8e80941Smrgbrw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
1205b8e80941Smrg{
1206b8e80941Smrg   /* Barycentric modes don't make sense for flat inputs. */
1207b8e80941Smrg   assert(mode != INTERP_MODE_FLAT);
1208b8e80941Smrg
1209b8e80941Smrg   unsigned bary;
1210b8e80941Smrg   switch (op) {
1211b8e80941Smrg   case nir_intrinsic_load_barycentric_pixel:
1212b8e80941Smrg   case nir_intrinsic_load_barycentric_at_offset:
1213b8e80941Smrg      bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
1214b8e80941Smrg      break;
1215b8e80941Smrg   case nir_intrinsic_load_barycentric_centroid:
1216b8e80941Smrg      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
1217b8e80941Smrg      break;
1218b8e80941Smrg   case nir_intrinsic_load_barycentric_sample:
1219b8e80941Smrg   case nir_intrinsic_load_barycentric_at_sample:
1220b8e80941Smrg      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1221b8e80941Smrg      break;
1222b8e80941Smrg   default:
1223b8e80941Smrg      unreachable("invalid intrinsic");
1224b8e80941Smrg   }
1225b8e80941Smrg
1226b8e80941Smrg   if (mode == INTERP_MODE_NOPERSPECTIVE)
1227b8e80941Smrg      bary += 3;
1228b8e80941Smrg
1229b8e80941Smrg   return (enum brw_barycentric_mode) bary;
1230b8e80941Smrg}
1231b8e80941Smrg
1232b8e80941Smrg/**
1233b8e80941Smrg * Turn one of the two CENTROID barycentric modes into PIXEL mode.
1234b8e80941Smrg */
1235b8e80941Smrgstatic enum brw_barycentric_mode
1236b8e80941Smrgcentroid_to_pixel(enum brw_barycentric_mode bary)
1237b8e80941Smrg{
1238b8e80941Smrg   assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1239b8e80941Smrg          bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1240b8e80941Smrg   return (enum brw_barycentric_mode) ((unsigned) bary - 1);
1241b8e80941Smrg}
1242b8e80941Smrg
1243b8e80941Smrgfs_reg *
1244b8e80941Smrgfs_visitor::emit_frontfacing_interpolation()
1245b8e80941Smrg{
1246b8e80941Smrg   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1247b8e80941Smrg
1248b8e80941Smrg   if (devinfo->gen >= 6) {
1249b8e80941Smrg      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1250b8e80941Smrg       * a boolean result from this (~0/true or 0/false).
1251b8e80941Smrg       *
1252b8e80941Smrg       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1253b8e80941Smrg       * this task in only one instruction:
1254b8e80941Smrg       *    - a negation source modifier will flip the bit; and
1255b8e80941Smrg       *    - a W -> D type conversion will sign extend the bit into the high
1256b8e80941Smrg       *      word of the destination.
1257b8e80941Smrg       *
1258b8e80941Smrg       * An ASR 15 fills the low word of the destination.
1259b8e80941Smrg       */
1260b8e80941Smrg      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1261b8e80941Smrg      g0.negate = true;
1262b8e80941Smrg
1263b8e80941Smrg      bld.ASR(*reg, g0, brw_imm_d(15));
1264b8e80941Smrg   } else {
1265b8e80941Smrg      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1266b8e80941Smrg       * a boolean result from this (1/true or 0/false).
1267b8e80941Smrg       *
1268b8e80941Smrg       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1269b8e80941Smrg       * the negation source modifier to flip it. Unfortunately the SHR
1270b8e80941Smrg       * instruction only operates on UD (or D with an abs source modifier)
1271b8e80941Smrg       * sources without negation.
1272b8e80941Smrg       *
1273b8e80941Smrg       * Instead, use ASR (which will give ~0/true or 0/false).
1274b8e80941Smrg       */
1275b8e80941Smrg      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1276b8e80941Smrg      g1_6.negate = true;
1277b8e80941Smrg
1278b8e80941Smrg      bld.ASR(*reg, g1_6, brw_imm_d(31));
1279b8e80941Smrg   }
1280b8e80941Smrg
1281b8e80941Smrg   return reg;
1282b8e80941Smrg}
1283b8e80941Smrg
1284b8e80941Smrgvoid
1285b8e80941Smrgfs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1286b8e80941Smrg{
1287b8e80941Smrg   assert(stage == MESA_SHADER_FRAGMENT);
1288b8e80941Smrg   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
1289b8e80941Smrg   assert(dst.type == BRW_REGISTER_TYPE_F);
1290b8e80941Smrg
1291b8e80941Smrg   if (wm_prog_data->persample_dispatch) {
1292b8e80941Smrg      /* Convert int_sample_pos to floating point */
1293b8e80941Smrg      bld.MOV(dst, int_sample_pos);
1294b8e80941Smrg      /* Scale to the range [0, 1] */
1295b8e80941Smrg      bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
1296b8e80941Smrg   }
1297b8e80941Smrg   else {
1298b8e80941Smrg      /* From ARB_sample_shading specification:
1299b8e80941Smrg       * "When rendering to a non-multisample buffer, or if multisample
1300b8e80941Smrg       *  rasterization is disabled, gl_SamplePosition will always be
1301b8e80941Smrg       *  (0.5, 0.5).
1302b8e80941Smrg       */
1303b8e80941Smrg      bld.MOV(dst, brw_imm_f(0.5f));
1304b8e80941Smrg   }
1305b8e80941Smrg}
1306b8e80941Smrg
1307b8e80941Smrgfs_reg *
1308b8e80941Smrgfs_visitor::emit_samplepos_setup()
1309b8e80941Smrg{
1310b8e80941Smrg   assert(devinfo->gen >= 6);
1311b8e80941Smrg
1312b8e80941Smrg   const fs_builder abld = bld.annotate("compute sample position");
1313b8e80941Smrg   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1314b8e80941Smrg   fs_reg pos = *reg;
1315b8e80941Smrg   fs_reg int_sample_x = vgrf(glsl_type::int_type);
1316b8e80941Smrg   fs_reg int_sample_y = vgrf(glsl_type::int_type);
1317b8e80941Smrg
1318b8e80941Smrg   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1319b8e80941Smrg    * mode will be enabled.
1320b8e80941Smrg    *
1321b8e80941Smrg    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1322b8e80941Smrg    * R31.1:0         Position Offset X/Y for Slot[3:0]
1323b8e80941Smrg    * R31.3:2         Position Offset X/Y for Slot[7:4]
1324b8e80941Smrg    * .....
1325b8e80941Smrg    *
1326b8e80941Smrg    * The X, Y sample positions come in as bytes in  thread payload. So, read
1327b8e80941Smrg    * the positions using vstride=16, width=8, hstride=2.
1328b8e80941Smrg    */
1329b8e80941Smrg   const fs_reg sample_pos_reg =
1330b8e80941Smrg      fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W);
1331b8e80941Smrg
1332b8e80941Smrg   /* Compute gl_SamplePosition.x */
1333b8e80941Smrg   abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0));
1334b8e80941Smrg   compute_sample_position(offset(pos, abld, 0), int_sample_x);
1335b8e80941Smrg
1336b8e80941Smrg   /* Compute gl_SamplePosition.y */
1337b8e80941Smrg   abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1));
1338b8e80941Smrg   compute_sample_position(offset(pos, abld, 1), int_sample_y);
1339b8e80941Smrg   return reg;
1340b8e80941Smrg}
1341b8e80941Smrg
1342b8e80941Smrgfs_reg *
1343b8e80941Smrgfs_visitor::emit_sampleid_setup()
1344b8e80941Smrg{
1345b8e80941Smrg   assert(stage == MESA_SHADER_FRAGMENT);
1346b8e80941Smrg   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1347b8e80941Smrg   assert(devinfo->gen >= 6);
1348b8e80941Smrg
1349b8e80941Smrg   const fs_builder abld = bld.annotate("compute sample id");
1350b8e80941Smrg   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type));
1351b8e80941Smrg
1352b8e80941Smrg   if (!key->multisample_fbo) {
1353b8e80941Smrg      /* As per GL_ARB_sample_shading specification:
1354b8e80941Smrg       * "When rendering to a non-multisample buffer, or if multisample
1355b8e80941Smrg       *  rasterization is disabled, gl_SampleID will always be zero."
1356b8e80941Smrg       */
1357b8e80941Smrg      abld.MOV(*reg, brw_imm_d(0));
1358b8e80941Smrg   } else if (devinfo->gen >= 8) {
1359b8e80941Smrg      /* Sample ID comes in as 4-bit numbers in g1.0:
1360b8e80941Smrg       *
1361b8e80941Smrg       *    15:12 Slot 3 SampleID (only used in SIMD16)
1362b8e80941Smrg       *     11:8 Slot 2 SampleID (only used in SIMD16)
1363b8e80941Smrg       *      7:4 Slot 1 SampleID
1364b8e80941Smrg       *      3:0 Slot 0 SampleID
1365b8e80941Smrg       *
1366b8e80941Smrg       * Each slot corresponds to four channels, so we want to replicate each
1367b8e80941Smrg       * half-byte value to 4 channels in a row:
1368b8e80941Smrg       *
1369b8e80941Smrg       *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
1370b8e80941Smrg       *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
1371b8e80941Smrg       *
1372b8e80941Smrg       *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
1373b8e80941Smrg       *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
1374b8e80941Smrg       *
1375b8e80941Smrg       * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
1376b8e80941Smrg       * channels to read the first byte (7:0), and the second group of 8
1377b8e80941Smrg       * channels to read the second byte (15:8).  Then, we shift right by
1378b8e80941Smrg       * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
1379b8e80941Smrg       * values into place.  Finally, we AND with 0xf to keep the low nibble.
1380b8e80941Smrg       *
1381b8e80941Smrg       *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
1382b8e80941Smrg       *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
1383b8e80941Smrg       *
1384b8e80941Smrg       * TODO: These payload bits exist on Gen7 too, but they appear to always
1385b8e80941Smrg       *       be zero, so this code fails to work.  We should find out why.
1386b8e80941Smrg       */
1387b8e80941Smrg      const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW);
1388b8e80941Smrg
1389b8e80941Smrg      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
1390b8e80941Smrg         const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
1391b8e80941Smrg         hbld.SHR(offset(tmp, hbld, i),
1392b8e80941Smrg                  stride(retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UB),
1393b8e80941Smrg                         1, 8, 0),
1394b8e80941Smrg                  brw_imm_v(0x44440000));
1395b8e80941Smrg      }
1396b8e80941Smrg
1397b8e80941Smrg      abld.AND(*reg, tmp, brw_imm_w(0xf));
1398b8e80941Smrg   } else {
1399b8e80941Smrg      const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0);
1400b8e80941Smrg      const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW);
1401b8e80941Smrg
1402b8e80941Smrg      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1403b8e80941Smrg       * 8x multisampling, subspan 0 will represent sample N (where N
1404b8e80941Smrg       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1405b8e80941Smrg       * 7. We can find the value of N by looking at R0.0 bits 7:6
1406b8e80941Smrg       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1407b8e80941Smrg       * (since samples are always delivered in pairs). That is, we
1408b8e80941Smrg       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1409b8e80941Smrg       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1410b8e80941Smrg       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1411b8e80941Smrg       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1412b8e80941Smrg       * populating a temporary variable with the sequence (0, 1, 2, 3),
1413b8e80941Smrg       * and then reading from it using vstride=1, width=4, hstride=0.
1414b8e80941Smrg       * These computations hold good for 4x multisampling as well.
1415b8e80941Smrg       *
1416b8e80941Smrg       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1417b8e80941Smrg       * the first four slots are sample 0 of subspan 0; the next four
1418b8e80941Smrg       * are sample 1 of subspan 0; the third group is sample 0 of
1419b8e80941Smrg       * subspan 1, and finally sample 1 of subspan 1.
1420b8e80941Smrg       */
1421b8e80941Smrg
1422b8e80941Smrg      /* SKL+ has an extra bit for the Starting Sample Pair Index to
1423b8e80941Smrg       * accomodate 16x MSAA.
1424b8e80941Smrg       */
1425b8e80941Smrg      abld.exec_all().group(1, 0)
1426b8e80941Smrg          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1427b8e80941Smrg               brw_imm_ud(0xc0));
1428b8e80941Smrg      abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
1429b8e80941Smrg
1430b8e80941Smrg      /* This works for SIMD8-SIMD16.  It also works for SIMD32 but only if we
1431b8e80941Smrg       * can assume 4x MSAA.  Disallow it on IVB+
1432b8e80941Smrg       *
1433b8e80941Smrg       * FINISHME: One day, we could come up with a way to do this that
1434b8e80941Smrg       * actually works on gen7.
1435b8e80941Smrg       */
1436b8e80941Smrg      if (devinfo->gen >= 7)
1437b8e80941Smrg         limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gen7");
1438b8e80941Smrg      abld.exec_all().group(8, 0).MOV(t2, brw_imm_v(0x32103210));
1439b8e80941Smrg
1440b8e80941Smrg      /* This special instruction takes care of setting vstride=1,
1441b8e80941Smrg       * width=4, hstride=0 of t2 during an ADD instruction.
1442b8e80941Smrg       */
1443b8e80941Smrg      abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1444b8e80941Smrg   }
1445b8e80941Smrg
1446b8e80941Smrg   return reg;
1447b8e80941Smrg}
1448b8e80941Smrg
1449b8e80941Smrgfs_reg *
1450b8e80941Smrgfs_visitor::emit_samplemaskin_setup()
1451b8e80941Smrg{
1452b8e80941Smrg   assert(stage == MESA_SHADER_FRAGMENT);
1453b8e80941Smrg   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
1454b8e80941Smrg   assert(devinfo->gen >= 6);
1455b8e80941Smrg
1456b8e80941Smrg   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1457b8e80941Smrg
1458b8e80941Smrg   fs_reg coverage_mask =
1459b8e80941Smrg      fetch_payload_reg(bld, payload.sample_mask_in_reg, BRW_REGISTER_TYPE_D);
1460b8e80941Smrg
1461b8e80941Smrg   if (wm_prog_data->persample_dispatch) {
1462b8e80941Smrg      /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
1463b8e80941Smrg       * and a mask representing which sample is being processed by the
1464b8e80941Smrg       * current shader invocation.
1465b8e80941Smrg       *
1466b8e80941Smrg       * From the OES_sample_variables specification:
1467b8e80941Smrg       * "When per-sample shading is active due to the use of a fragment input
1468b8e80941Smrg       *  qualified by "sample" or due to the use of the gl_SampleID or
1469b8e80941Smrg       *  gl_SamplePosition variables, only the bit for the current sample is
1470b8e80941Smrg       *  set in gl_SampleMaskIn."
1471b8e80941Smrg       */
1472b8e80941Smrg      const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
1473b8e80941Smrg
1474b8e80941Smrg      if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
1475b8e80941Smrg         nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
1476b8e80941Smrg
1477b8e80941Smrg      fs_reg one = vgrf(glsl_type::int_type);
1478b8e80941Smrg      fs_reg enabled_mask = vgrf(glsl_type::int_type);
1479b8e80941Smrg      abld.MOV(one, brw_imm_d(1));
1480b8e80941Smrg      abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
1481b8e80941Smrg      abld.AND(*reg, enabled_mask, coverage_mask);
1482b8e80941Smrg   } else {
1483b8e80941Smrg      /* In per-pixel mode, the coverage mask is sufficient. */
1484b8e80941Smrg      *reg = coverage_mask;
1485b8e80941Smrg   }
1486b8e80941Smrg   return reg;
1487b8e80941Smrg}
1488b8e80941Smrg
1489b8e80941Smrgfs_reg
1490b8e80941Smrgfs_visitor::resolve_source_modifiers(const fs_reg &src)
1491b8e80941Smrg{
1492b8e80941Smrg   if (!src.abs && !src.negate)
1493b8e80941Smrg      return src;
1494b8e80941Smrg
1495b8e80941Smrg   fs_reg temp = bld.vgrf(src.type);
1496b8e80941Smrg   bld.MOV(temp, src);
1497b8e80941Smrg
1498b8e80941Smrg   return temp;
1499b8e80941Smrg}
1500b8e80941Smrg
1501b8e80941Smrgvoid
1502b8e80941Smrgfs_visitor::emit_discard_jump()
1503b8e80941Smrg{
1504b8e80941Smrg   assert(brw_wm_prog_data(this->prog_data)->uses_kill);
1505b8e80941Smrg
1506b8e80941Smrg   /* For performance, after a discard, jump to the end of the
1507b8e80941Smrg    * shader if all relevant channels have been discarded.
1508b8e80941Smrg    */
1509b8e80941Smrg   fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
1510b8e80941Smrg   discard_jump->flag_subreg = 1;
1511b8e80941Smrg
1512b8e80941Smrg   discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
1513b8e80941Smrg   discard_jump->predicate_inverse = true;
1514b8e80941Smrg}
1515b8e80941Smrg
/* Emit the geometry-shader thread-end sequence: flush any pending control
 * data bits and terminate the thread with an EOT URB write.
 */
void
fs_visitor::emit_gs_thread_end()
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);

   /* Flush any buffered control data bits before ending the thread. */
   if (gs_compile->control_data_header_size_bits > 0) {
      emit_gs_control_data_bits(this->final_gs_vertex_count);
   }

   const fs_builder abld = bld.annotate("thread end");
   fs_inst *inst;

   if (gs_prog_data->static_vertex_count != -1) {
      /* The vertex count is known at compile time, so the message does not
       * need to deliver it.  Walk backwards through the instruction stream
       * and try to set the EOT bit on the last URB write instead of
       * emitting a dedicated thread-end write.
       */
      foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
         if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
            prev->eot = true;

            /* Delete now dead instructions. */
            foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
               if (dead == prev)
                  break;
               dead->remove();
            }
            return;
         } else if (prev->is_control_flow() || prev->has_side_effects()) {
            /* We can't safely reuse a URB write from before control flow or
             * other side effects; fall through and emit a dedicated write.
             */
            break;
         }
      }
      /* No reusable URB write was found: emit an empty write whose only
       * purpose is to carry the EOT bit.  Its payload is just the message
       * header copied from g1.
       */
      fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
      abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
      inst->mlen = 1;
   } else {
      /* The vertex count is only known at run time, so deliver it in the
       * second register of the message payload.
       */
      fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
      fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
      sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
      sources[1] = this->final_gs_vertex_count;
      abld.LOAD_PAYLOAD(payload, sources, 2, 2);
      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
      inst->mlen = 2;
   }
   inst->eot = true;
   inst->offset = 0;
}
1565b8e80941Smrg
1566b8e80941Smrgvoid
1567b8e80941Smrgfs_visitor::assign_curb_setup()
1568b8e80941Smrg{
1569b8e80941Smrg   unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
1570b8e80941Smrg
1571b8e80941Smrg   unsigned ubo_push_length = 0;
1572b8e80941Smrg   unsigned ubo_push_start[4];
1573b8e80941Smrg   for (int i = 0; i < 4; i++) {
1574b8e80941Smrg      ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
1575b8e80941Smrg      ubo_push_length += stage_prog_data->ubo_ranges[i].length;
1576b8e80941Smrg   }
1577b8e80941Smrg
1578b8e80941Smrg   prog_data->curb_read_length = uniform_push_length + ubo_push_length;
1579b8e80941Smrg
1580b8e80941Smrg   /* Map the offsets in the UNIFORM file to fixed HW regs. */
1581b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
1582b8e80941Smrg      for (unsigned int i = 0; i < inst->sources; i++) {
1583b8e80941Smrg	 if (inst->src[i].file == UNIFORM) {
1584b8e80941Smrg            int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
1585b8e80941Smrg            int constant_nr;
1586b8e80941Smrg            if (inst->src[i].nr >= UBO_START) {
1587b8e80941Smrg               /* constant_nr is in 32-bit units, the rest are in bytes */
1588b8e80941Smrg               constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
1589b8e80941Smrg                             inst->src[i].offset / 4;
1590b8e80941Smrg            } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1591b8e80941Smrg               constant_nr = push_constant_loc[uniform_nr];
1592b8e80941Smrg            } else {
1593b8e80941Smrg               /* Section 5.11 of the OpenGL 4.1 spec says:
1594b8e80941Smrg                * "Out-of-bounds reads return undefined values, which include
1595b8e80941Smrg                *  values from other variables of the active program or zero."
1596b8e80941Smrg                * Just return the first push constant.
1597b8e80941Smrg                */
1598b8e80941Smrg               constant_nr = 0;
1599b8e80941Smrg            }
1600b8e80941Smrg
1601b8e80941Smrg	    struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1602b8e80941Smrg						  constant_nr / 8,
1603b8e80941Smrg						  constant_nr % 8);
1604b8e80941Smrg            brw_reg.abs = inst->src[i].abs;
1605b8e80941Smrg            brw_reg.negate = inst->src[i].negate;
1606b8e80941Smrg
1607b8e80941Smrg            assert(inst->src[i].stride == 0);
1608b8e80941Smrg            inst->src[i] = byte_offset(
1609b8e80941Smrg               retype(brw_reg, inst->src[i].type),
1610b8e80941Smrg               inst->src[i].offset % 4);
1611b8e80941Smrg	 }
1612b8e80941Smrg      }
1613b8e80941Smrg   }
1614b8e80941Smrg
1615b8e80941Smrg   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
1616b8e80941Smrg   this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
1617b8e80941Smrg}
1618b8e80941Smrg
/* Decide which URB slot each fragment-shader varying input is read from,
 * filling prog_data->urb_setup[] (indexed by gl_varying_slot, -1 meaning
 * "not present") and prog_data->num_varying_inputs.
 */
void
fs_visitor::calculate_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   memset(prog_data->urb_setup, -1,
          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (devinfo->gen >= 6) {
      if (util_bitcount64(nir->info.inputs_read &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                             key->input_slots_valid,
                             nir->info.separate_shader);

         int first_slot =
            brw_compute_first_urb_slot_required(nir->info.inputs_read,
                                                &prev_stage_vue_map);

         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         /* Mirror the previous stage's layout, rebased so that first_slot
          * becomes our slot 0.  Padding slots and varyings the FS doesn't
          * read are left at -1.
          */
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            if (varying != BRW_VARYING_SLOT_PAD &&
                (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

	 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
	    /* The back color slot is skipped when the front color is
	     * also written to.  In addition, some slots can be
	     * written in the vertex shader and not read in the
	     * fragment shader.  So the register number must always be
	     * incremented, mapped or not.
	     */
	    if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
	       prog_data->urb_setup[i] = urb_next;
            urb_next++;
	 }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   prog_data->num_varying_inputs = urb_next;
}
1708b8e80941Smrg
/* Rewrite every fragment-shader ATTR source to the fixed hardware GRF it
 * lands in, now that push-constant layout (and hence the start of the
 * varying payload) is known.
 */
void
fs_visitor::assign_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   /* The varying payload begins right after the push constants. */
   int urb_start = payload.num_regs + prog_data->base.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == ATTR) {
            /* ATTR regs in the FS are in units of logical scalar inputs each
             * of which consumes half of a GRF register.
             */
            assert(inst->src[i].offset < REG_SIZE / 2);
            const unsigned grf = urb_start + inst->src[i].nr / 2;
            /* Odd-numbered inputs occupy the second half of their GRF. */
            const unsigned offset = (inst->src[i].nr % 2) * (REG_SIZE / 2) +
                                    inst->src[i].offset;
            /* stride == 0 is a scalar broadcast, so the region width is 1;
             * otherwise clamp the width to 8 per hardware region rules.
             */
            const unsigned width = inst->src[i].stride == 0 ?
                                   1 : MIN2(inst->exec_size, 8);
            struct brw_reg reg = stride(
               byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                           offset),
               width * inst->src[i].stride,
               width, inst->src[i].stride);
            reg.abs = inst->src[i].abs;
            reg.negate = inst->src[i].negate;
            inst->src[i] = reg;
         }
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
}
1747b8e80941Smrg
/* Rewrite the ATTR file sources of \p inst into fixed hardware GRFs,
 * assuming attributes are laid out contiguously after the push constants
 * in the thread payload (used by the VS/TCS/TES/GS URB-setup paths).
 */
void
fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
{
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file == ATTR) {
         /* Attribute registers start after the fixed payload and the push
          * constants; offset is in bytes, so whole-register parts fold
          * into the GRF number and the remainder is applied below.
          */
         int grf = payload.num_regs +
                   prog_data->curb_read_length +
                   inst->src[i].nr +
                   inst->src[i].offset / REG_SIZE;

         /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
          *
          * VertStride must be used to cross GRF register boundaries. This
          * rule implies that elements within a 'Width' cannot cross GRF
          * boundaries.
          *
          * So, for registers that are large enough, we have to split the exec
          * size in two and trust the compression state to sort it out.
          */
         unsigned total_size = inst->exec_size *
                               inst->src[i].stride *
                               type_sz(inst->src[i].type);

         assert(total_size <= 2 * REG_SIZE);
         const unsigned exec_size =
            (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;

         /* stride == 0 is a scalar broadcast: region width must be 1. */
         unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
         struct brw_reg reg =
            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                               inst->src[i].offset % REG_SIZE),
                   exec_size * inst->src[i].stride,
                   width, inst->src[i].stride);
         reg.abs = inst->src[i].abs;
         reg.negate = inst->src[i].negate;

         inst->src[i] = reg;
      }
   }
}
1788b8e80941Smrg
1789b8e80941Smrgvoid
1790b8e80941Smrgfs_visitor::assign_vs_urb_setup()
1791b8e80941Smrg{
1792b8e80941Smrg   struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
1793b8e80941Smrg
1794b8e80941Smrg   assert(stage == MESA_SHADER_VERTEX);
1795b8e80941Smrg
1796b8e80941Smrg   /* Each attribute is 4 regs. */
1797b8e80941Smrg   this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
1798b8e80941Smrg
1799b8e80941Smrg   assert(vs_prog_data->base.urb_read_length <= 15);
1800b8e80941Smrg
1801b8e80941Smrg   /* Rewrite all ATTR file references to the hw grf that they land in. */
1802b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
1803b8e80941Smrg      convert_attr_sources_to_hw_regs(inst);
1804b8e80941Smrg   }
1805b8e80941Smrg}
1806b8e80941Smrg
1807b8e80941Smrgvoid
1808b8e80941Smrgfs_visitor::assign_tcs_single_patch_urb_setup()
1809b8e80941Smrg{
1810b8e80941Smrg   assert(stage == MESA_SHADER_TESS_CTRL);
1811b8e80941Smrg
1812b8e80941Smrg   /* Rewrite all ATTR file references to HW_REGs. */
1813b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
1814b8e80941Smrg      convert_attr_sources_to_hw_regs(inst);
1815b8e80941Smrg   }
1816b8e80941Smrg}
1817b8e80941Smrg
1818b8e80941Smrgvoid
1819b8e80941Smrgfs_visitor::assign_tes_urb_setup()
1820b8e80941Smrg{
1821b8e80941Smrg   assert(stage == MESA_SHADER_TESS_EVAL);
1822b8e80941Smrg
1823b8e80941Smrg   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
1824b8e80941Smrg
1825b8e80941Smrg   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
1826b8e80941Smrg
1827b8e80941Smrg   /* Rewrite all ATTR file references to HW_REGs. */
1828b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
1829b8e80941Smrg      convert_attr_sources_to_hw_regs(inst);
1830b8e80941Smrg   }
1831b8e80941Smrg}
1832b8e80941Smrg
1833b8e80941Smrgvoid
1834b8e80941Smrgfs_visitor::assign_gs_urb_setup()
1835b8e80941Smrg{
1836b8e80941Smrg   assert(stage == MESA_SHADER_GEOMETRY);
1837b8e80941Smrg
1838b8e80941Smrg   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
1839b8e80941Smrg
1840b8e80941Smrg   first_non_payload_grf +=
1841b8e80941Smrg      8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
1842b8e80941Smrg
1843b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
1844b8e80941Smrg      /* Rewrite all ATTR file references to GRFs. */
1845b8e80941Smrg      convert_attr_sources_to_hw_regs(inst);
1846b8e80941Smrg   }
1847b8e80941Smrg}
1848b8e80941Smrg
1849b8e80941Smrg
1850b8e80941Smrg/**
1851b8e80941Smrg * Split large virtual GRFs into separate components if we can.
1852b8e80941Smrg *
1853b8e80941Smrg * This is mostly duplicated with what brw_fs_vector_splitting does,
1854b8e80941Smrg * but that's really conservative because it's afraid of doing
1855b8e80941Smrg * splitting that doesn't result in real progress after the rest of
1856b8e80941Smrg * the optimization phases, which would cause infinite looping in
1857b8e80941Smrg * optimization.  We can do it once here, safely.  This also has the
1858b8e80941Smrg * opportunity to split interpolated values, or maybe even uniforms,
1859b8e80941Smrg * which we don't have at the IR level.
1860b8e80941Smrg *
1861b8e80941Smrg * We want to split, because virtual GRFs are what we register
1862b8e80941Smrg * allocate and spill (due to contiguousness requirements for some
1863b8e80941Smrg * instructions), and they're what we naturally generate in the
1864b8e80941Smrg * codegen process, but most virtual GRFs don't actually need to be
1865b8e80941Smrg * contiguous sets of GRFs.  If we split, we'll end up with reduced
1866b8e80941Smrg * live intervals and better dead code elimination and coalescing.
1867b8e80941Smrg */
1868b8e80941Smrgvoid
1869b8e80941Smrgfs_visitor::split_virtual_grfs()
1870b8e80941Smrg{
1871b8e80941Smrg   /* Compact the register file so we eliminate dead vgrfs.  This
1872b8e80941Smrg    * only defines split points for live registers, so if we have
1873b8e80941Smrg    * too large dead registers they will hit assertions later.
1874b8e80941Smrg    */
1875b8e80941Smrg   compact_virtual_grfs();
1876b8e80941Smrg
1877b8e80941Smrg   int num_vars = this->alloc.count;
1878b8e80941Smrg
1879b8e80941Smrg   /* Count the total number of registers */
1880b8e80941Smrg   int reg_count = 0;
1881b8e80941Smrg   int vgrf_to_reg[num_vars];
1882b8e80941Smrg   for (int i = 0; i < num_vars; i++) {
1883b8e80941Smrg      vgrf_to_reg[i] = reg_count;
1884b8e80941Smrg      reg_count += alloc.sizes[i];
1885b8e80941Smrg   }
1886b8e80941Smrg
1887b8e80941Smrg   /* An array of "split points".  For each register slot, this indicates
1888b8e80941Smrg    * if this slot can be separated from the previous slot.  Every time an
1889b8e80941Smrg    * instruction uses multiple elements of a register (as a source or
1890b8e80941Smrg    * destination), we mark the used slots as inseparable.  Then we go
1891b8e80941Smrg    * through and split the registers into the smallest pieces we can.
1892b8e80941Smrg    */
1893b8e80941Smrg   bool *split_points = new bool[reg_count];
1894b8e80941Smrg   memset(split_points, 0, reg_count * sizeof(*split_points));
1895b8e80941Smrg
1896b8e80941Smrg   /* Mark all used registers as fully splittable */
1897b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
1898b8e80941Smrg      if (inst->dst.file == VGRF) {
1899b8e80941Smrg         int reg = vgrf_to_reg[inst->dst.nr];
1900b8e80941Smrg         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
1901b8e80941Smrg            split_points[reg + j] = true;
1902b8e80941Smrg      }
1903b8e80941Smrg
1904b8e80941Smrg      for (int i = 0; i < inst->sources; i++) {
1905b8e80941Smrg         if (inst->src[i].file == VGRF) {
1906b8e80941Smrg            int reg = vgrf_to_reg[inst->src[i].nr];
1907b8e80941Smrg            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
1908b8e80941Smrg               split_points[reg + j] = true;
1909b8e80941Smrg         }
1910b8e80941Smrg      }
1911b8e80941Smrg   }
1912b8e80941Smrg
1913b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
1914b8e80941Smrg      if (inst->dst.file == VGRF) {
1915b8e80941Smrg         int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1916b8e80941Smrg         for (unsigned j = 1; j < regs_written(inst); j++)
1917b8e80941Smrg            split_points[reg + j] = false;
1918b8e80941Smrg      }
1919b8e80941Smrg      for (int i = 0; i < inst->sources; i++) {
1920b8e80941Smrg         if (inst->src[i].file == VGRF) {
1921b8e80941Smrg            int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1922b8e80941Smrg            for (unsigned j = 1; j < regs_read(inst, i); j++)
1923b8e80941Smrg               split_points[reg + j] = false;
1924b8e80941Smrg         }
1925b8e80941Smrg      }
1926b8e80941Smrg   }
1927b8e80941Smrg
1928b8e80941Smrg   int *new_virtual_grf = new int[reg_count];
1929b8e80941Smrg   int *new_reg_offset = new int[reg_count];
1930b8e80941Smrg
1931b8e80941Smrg   int reg = 0;
1932b8e80941Smrg   for (int i = 0; i < num_vars; i++) {
1933b8e80941Smrg      /* The first one should always be 0 as a quick sanity check. */
1934b8e80941Smrg      assert(split_points[reg] == false);
1935b8e80941Smrg
1936b8e80941Smrg      /* j = 0 case */
1937b8e80941Smrg      new_reg_offset[reg] = 0;
1938b8e80941Smrg      reg++;
1939b8e80941Smrg      int offset = 1;
1940b8e80941Smrg
1941b8e80941Smrg      /* j > 0 case */
1942b8e80941Smrg      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
1943b8e80941Smrg         /* If this is a split point, reset the offset to 0 and allocate a
1944b8e80941Smrg          * new virtual GRF for the previous offset many registers
1945b8e80941Smrg          */
1946b8e80941Smrg         if (split_points[reg]) {
1947b8e80941Smrg            assert(offset <= MAX_VGRF_SIZE);
1948b8e80941Smrg            int grf = alloc.allocate(offset);
1949b8e80941Smrg            for (int k = reg - offset; k < reg; k++)
1950b8e80941Smrg               new_virtual_grf[k] = grf;
1951b8e80941Smrg            offset = 0;
1952b8e80941Smrg         }
1953b8e80941Smrg         new_reg_offset[reg] = offset;
1954b8e80941Smrg         offset++;
1955b8e80941Smrg         reg++;
1956b8e80941Smrg      }
1957b8e80941Smrg
1958b8e80941Smrg      /* The last one gets the original register number */
1959b8e80941Smrg      assert(offset <= MAX_VGRF_SIZE);
1960b8e80941Smrg      alloc.sizes[i] = offset;
1961b8e80941Smrg      for (int k = reg - offset; k < reg; k++)
1962b8e80941Smrg         new_virtual_grf[k] = i;
1963b8e80941Smrg   }
1964b8e80941Smrg   assert(reg == reg_count);
1965b8e80941Smrg
1966b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
1967b8e80941Smrg      if (inst->dst.file == VGRF) {
1968b8e80941Smrg         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
1969b8e80941Smrg         inst->dst.nr = new_virtual_grf[reg];
1970b8e80941Smrg         inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
1971b8e80941Smrg                            inst->dst.offset % REG_SIZE;
1972b8e80941Smrg         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1973b8e80941Smrg      }
1974b8e80941Smrg      for (int i = 0; i < inst->sources; i++) {
1975b8e80941Smrg	 if (inst->src[i].file == VGRF) {
1976b8e80941Smrg            reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
1977b8e80941Smrg            inst->src[i].nr = new_virtual_grf[reg];
1978b8e80941Smrg            inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
1979b8e80941Smrg                                  inst->src[i].offset % REG_SIZE;
1980b8e80941Smrg            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
1981b8e80941Smrg         }
1982b8e80941Smrg      }
1983b8e80941Smrg   }
1984b8e80941Smrg   invalidate_live_intervals();
1985b8e80941Smrg
1986b8e80941Smrg   delete[] split_points;
1987b8e80941Smrg   delete[] new_virtual_grf;
1988b8e80941Smrg   delete[] new_reg_offset;
1989b8e80941Smrg}
1990b8e80941Smrg
1991b8e80941Smrg/**
1992b8e80941Smrg * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1993b8e80941Smrg *
1994b8e80941Smrg * During code generation, we create tons of temporary variables, many of
1995b8e80941Smrg * which get immediately killed and are never used again.  Yet, in later
1996b8e80941Smrg * optimization and analysis passes, such as compute_live_intervals, we need
1997b8e80941Smrg * to loop over all the virtual GRFs.  Compacting them can save a lot of
1998b8e80941Smrg * overhead.
1999b8e80941Smrg */
2000b8e80941Smrgbool
2001b8e80941Smrgfs_visitor::compact_virtual_grfs()
2002b8e80941Smrg{
2003b8e80941Smrg   bool progress = false;
2004b8e80941Smrg   int *remap_table = new int[this->alloc.count];
2005b8e80941Smrg   memset(remap_table, -1, this->alloc.count * sizeof(int));
2006b8e80941Smrg
2007b8e80941Smrg   /* Mark which virtual GRFs are used. */
2008b8e80941Smrg   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2009b8e80941Smrg      if (inst->dst.file == VGRF)
2010b8e80941Smrg         remap_table[inst->dst.nr] = 0;
2011b8e80941Smrg
2012b8e80941Smrg      for (int i = 0; i < inst->sources; i++) {
2013b8e80941Smrg         if (inst->src[i].file == VGRF)
2014b8e80941Smrg            remap_table[inst->src[i].nr] = 0;
2015b8e80941Smrg      }
2016b8e80941Smrg   }
2017b8e80941Smrg
2018b8e80941Smrg   /* Compact the GRF arrays. */
2019b8e80941Smrg   int new_index = 0;
2020b8e80941Smrg   for (unsigned i = 0; i < this->alloc.count; i++) {
2021b8e80941Smrg      if (remap_table[i] == -1) {
2022b8e80941Smrg         /* We just found an unused register.  This means that we are
2023b8e80941Smrg          * actually going to compact something.
2024b8e80941Smrg          */
2025b8e80941Smrg         progress = true;
2026b8e80941Smrg      } else {
2027b8e80941Smrg         remap_table[i] = new_index;
2028b8e80941Smrg         alloc.sizes[new_index] = alloc.sizes[i];
2029b8e80941Smrg         invalidate_live_intervals();
2030b8e80941Smrg         ++new_index;
2031b8e80941Smrg      }
2032b8e80941Smrg   }
2033b8e80941Smrg
2034b8e80941Smrg   this->alloc.count = new_index;
2035b8e80941Smrg
2036b8e80941Smrg   /* Patch all the instructions to use the newly renumbered registers */
2037b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
2038b8e80941Smrg      if (inst->dst.file == VGRF)
2039b8e80941Smrg         inst->dst.nr = remap_table[inst->dst.nr];
2040b8e80941Smrg
2041b8e80941Smrg      for (int i = 0; i < inst->sources; i++) {
2042b8e80941Smrg         if (inst->src[i].file == VGRF)
2043b8e80941Smrg            inst->src[i].nr = remap_table[inst->src[i].nr];
2044b8e80941Smrg      }
2045b8e80941Smrg   }
2046b8e80941Smrg
2047b8e80941Smrg   /* Patch all the references to delta_xy, since they're used in register
2048b8e80941Smrg    * allocation.  If they're unused, switch them to BAD_FILE so we don't
2049b8e80941Smrg    * think some random VGRF is delta_xy.
2050b8e80941Smrg    */
2051b8e80941Smrg   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2052b8e80941Smrg      if (delta_xy[i].file == VGRF) {
2053b8e80941Smrg         if (remap_table[delta_xy[i].nr] != -1) {
2054b8e80941Smrg            delta_xy[i].nr = remap_table[delta_xy[i].nr];
2055b8e80941Smrg         } else {
2056b8e80941Smrg            delta_xy[i].file = BAD_FILE;
2057b8e80941Smrg         }
2058b8e80941Smrg      }
2059b8e80941Smrg   }
2060b8e80941Smrg
2061b8e80941Smrg   delete[] remap_table;
2062b8e80941Smrg
2063b8e80941Smrg   return progress;
2064b8e80941Smrg}
2065b8e80941Smrg
2066b8e80941Smrgstatic int
2067b8e80941Smrgget_subgroup_id_param_index(const brw_stage_prog_data *prog_data)
2068b8e80941Smrg{
2069b8e80941Smrg   if (prog_data->nr_params == 0)
2070b8e80941Smrg      return -1;
2071b8e80941Smrg
2072b8e80941Smrg   /* The local thread id is always the last parameter in the list */
2073b8e80941Smrg   uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
2074b8e80941Smrg   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
2075b8e80941Smrg      return prog_data->nr_params - 1;
2076b8e80941Smrg
2077b8e80941Smrg   return -1;
2078b8e80941Smrg}
2079b8e80941Smrg
2080b8e80941Smrg/**
2081b8e80941Smrg * Struct for handling complex alignments.
2082b8e80941Smrg *
2083b8e80941Smrg * A complex alignment is stored as multiplier and an offset.  A value is
2084b8e80941Smrg * considered to be aligned if it is {offset} larger than a multiple of {mul}.
2085b8e80941Smrg * For instance, with an alignment of {8, 2}, cplx_align_apply would do the
2086b8e80941Smrg * following:
2087b8e80941Smrg *
2088b8e80941Smrg *  N  | cplx_align_apply({8, 2}, N)
2089b8e80941Smrg * ----+-----------------------------
2090b8e80941Smrg *  4  | 6
2091b8e80941Smrg *  6  | 6
2092b8e80941Smrg *  8  | 14
2093b8e80941Smrg *  10 | 14
2094b8e80941Smrg *  12 | 14
2095b8e80941Smrg *  14 | 14
2096b8e80941Smrg *  16 | 22
2097b8e80941Smrg */
2098b8e80941Smrgstruct cplx_align {
2099b8e80941Smrg   unsigned mul:4;
2100b8e80941Smrg   unsigned offset:4;
2101b8e80941Smrg};
2102b8e80941Smrg
2103b8e80941Smrg#define CPLX_ALIGN_MAX_MUL 8
2104b8e80941Smrg
2105b8e80941Smrgstatic void
2106b8e80941Smrgcplx_align_assert_sane(struct cplx_align a)
2107b8e80941Smrg{
2108b8e80941Smrg   assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
2109b8e80941Smrg   assert(a.offset < a.mul);
2110b8e80941Smrg}
2111b8e80941Smrg
2112b8e80941Smrg/**
2113b8e80941Smrg * Combines two alignments to produce a least multiple of sorts.
2114b8e80941Smrg *
2115b8e80941Smrg * The returned alignment is the smallest (in terms of multiplier) such that
2116b8e80941Smrg * anything aligned to both a and b will be aligned to the new alignment.
2117b8e80941Smrg * This function will assert-fail if a and b are not compatible, i.e. if the
2118b8e80941Smrg * offset parameters are such that no common alignment is possible.
2119b8e80941Smrg */
2120b8e80941Smrgstatic struct cplx_align
2121b8e80941Smrgcplx_align_combine(struct cplx_align a, struct cplx_align b)
2122b8e80941Smrg{
2123b8e80941Smrg   cplx_align_assert_sane(a);
2124b8e80941Smrg   cplx_align_assert_sane(b);
2125b8e80941Smrg
2126b8e80941Smrg   /* Assert that the alignments agree. */
2127b8e80941Smrg   assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));
2128b8e80941Smrg
2129b8e80941Smrg   return a.mul > b.mul ? a : b;
2130b8e80941Smrg}
2131b8e80941Smrg
2132b8e80941Smrg/**
2133b8e80941Smrg * Apply a complex alignment
2134b8e80941Smrg *
2135b8e80941Smrg * This function will return the smallest number greater than or equal to
2136b8e80941Smrg * offset that is aligned to align.
2137b8e80941Smrg */
2138b8e80941Smrgstatic unsigned
2139b8e80941Smrgcplx_align_apply(struct cplx_align align, unsigned offset)
2140b8e80941Smrg{
2141b8e80941Smrg   return ALIGN(offset - align.offset, align.mul) + align.offset;
2142b8e80941Smrg}
2143b8e80941Smrg
2144b8e80941Smrg#define UNIFORM_SLOT_SIZE 4
2145b8e80941Smrg
2146b8e80941Smrgstruct uniform_slot_info {
2147b8e80941Smrg   /** True if the given uniform slot is live */
2148b8e80941Smrg   unsigned is_live:1;
2149b8e80941Smrg
2150b8e80941Smrg   /** True if this slot and the next slot must remain contiguous */
2151b8e80941Smrg   unsigned contiguous:1;
2152b8e80941Smrg
2153b8e80941Smrg   struct cplx_align align;
2154b8e80941Smrg};
2155b8e80941Smrg
2156b8e80941Smrgstatic void
2157b8e80941Smrgmark_uniform_slots_read(struct uniform_slot_info *slots,
2158b8e80941Smrg                        unsigned num_slots, unsigned alignment)
2159b8e80941Smrg{
2160b8e80941Smrg   assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));
2161b8e80941Smrg   assert(alignment <= CPLX_ALIGN_MAX_MUL);
2162b8e80941Smrg
2163b8e80941Smrg   /* We can't align a slot to anything less than the slot size */
2164b8e80941Smrg   alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);
2165b8e80941Smrg
2166b8e80941Smrg   struct cplx_align align = {alignment, 0};
2167b8e80941Smrg   cplx_align_assert_sane(align);
2168b8e80941Smrg
2169b8e80941Smrg   for (unsigned i = 0; i < num_slots; i++) {
2170b8e80941Smrg      slots[i].is_live = true;
2171b8e80941Smrg      if (i < num_slots - 1)
2172b8e80941Smrg         slots[i].contiguous = true;
2173b8e80941Smrg
2174b8e80941Smrg      align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);
2175b8e80941Smrg      if (slots[i].align.mul == 0) {
2176b8e80941Smrg         slots[i].align = align;
2177b8e80941Smrg      } else {
2178b8e80941Smrg         slots[i].align = cplx_align_combine(slots[i].align, align);
2179b8e80941Smrg      }
2180b8e80941Smrg   }
2181b8e80941Smrg}
2182b8e80941Smrg
2183b8e80941Smrg/**
2184b8e80941Smrg * Assign UNIFORM file registers to either push constants or pull constants.
2185b8e80941Smrg *
2186b8e80941Smrg * We allow a fragment shader to have more than the specified minimum
2187b8e80941Smrg * maximum number of fragment shader uniform components (64).  If
2188b8e80941Smrg * there are too many of these, they'd fill up all of register space.
2189b8e80941Smrg * So, this will push some of them out to the pull constant buffer and
2190b8e80941Smrg * update the program to load them.
2191b8e80941Smrg */
2192b8e80941Smrgvoid
2193b8e80941Smrgfs_visitor::assign_constant_locations()
2194b8e80941Smrg{
2195b8e80941Smrg   /* Only the first compile gets to decide on locations. */
2196b8e80941Smrg   if (push_constant_loc) {
2197b8e80941Smrg      assert(pull_constant_loc);
2198b8e80941Smrg      return;
2199b8e80941Smrg   }
2200b8e80941Smrg
2201b8e80941Smrg   struct uniform_slot_info slots[uniforms];
2202b8e80941Smrg   memset(slots, 0, sizeof(slots));
2203b8e80941Smrg
2204b8e80941Smrg   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2205b8e80941Smrg      for (int i = 0 ; i < inst->sources; i++) {
2206b8e80941Smrg         if (inst->src[i].file != UNIFORM)
2207b8e80941Smrg            continue;
2208b8e80941Smrg
2209b8e80941Smrg         /* NIR tightly packs things so the uniform number might not be
2210b8e80941Smrg          * aligned (if we have a double right after a float, for instance).
2211b8e80941Smrg          * This is fine because the process of re-arranging them will ensure
2212b8e80941Smrg          * that things are properly aligned.  The offset into that uniform,
2213b8e80941Smrg          * however, must be aligned.
2214b8e80941Smrg          *
2215b8e80941Smrg          * In Vulkan, we have explicit offsets but everything is crammed
2216b8e80941Smrg          * into a single "variable" so inst->src[i].nr will always be 0.
2217b8e80941Smrg          * Everything will be properly aligned relative to that one base.
2218b8e80941Smrg          */
2219b8e80941Smrg         assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
2220b8e80941Smrg
2221b8e80941Smrg         unsigned u = inst->src[i].nr +
2222b8e80941Smrg                      inst->src[i].offset / UNIFORM_SLOT_SIZE;
2223b8e80941Smrg
2224b8e80941Smrg         if (u >= uniforms)
2225b8e80941Smrg            continue;
2226b8e80941Smrg
2227b8e80941Smrg         unsigned slots_read;
2228b8e80941Smrg         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
2229b8e80941Smrg            slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
2230b8e80941Smrg         } else {
2231b8e80941Smrg            unsigned bytes_read = inst->components_read(i) *
2232b8e80941Smrg                                  type_sz(inst->src[i].type);
2233b8e80941Smrg            slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
2234b8e80941Smrg         }
2235b8e80941Smrg
2236b8e80941Smrg         assert(u + slots_read <= uniforms);
2237b8e80941Smrg         mark_uniform_slots_read(&slots[u], slots_read,
2238b8e80941Smrg                                 type_sz(inst->src[i].type));
2239b8e80941Smrg      }
2240b8e80941Smrg   }
2241b8e80941Smrg
2242b8e80941Smrg   int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
2243b8e80941Smrg
2244b8e80941Smrg   /* Only allow 16 registers (128 uniform components) as push constants.
2245b8e80941Smrg    *
2246b8e80941Smrg    * Just demote the end of the list.  We could probably do better
2247b8e80941Smrg    * here, demoting things that are rarely used in the program first.
2248b8e80941Smrg    *
2249b8e80941Smrg    * If changing this value, note the limitation about total_regs in
2250b8e80941Smrg    * brw_curbe.c.
2251b8e80941Smrg    */
2252b8e80941Smrg   unsigned int max_push_components = 16 * 8;
2253b8e80941Smrg   if (subgroup_id_index >= 0)
2254b8e80941Smrg      max_push_components--; /* Save a slot for the thread ID */
2255b8e80941Smrg
2256b8e80941Smrg   /* We push small arrays, but no bigger than 16 floats.  This is big enough
2257b8e80941Smrg    * for a vec4 but hopefully not large enough to push out other stuff.  We
2258b8e80941Smrg    * should probably use a better heuristic at some point.
2259b8e80941Smrg    */
2260b8e80941Smrg   const unsigned int max_chunk_size = 16;
2261b8e80941Smrg
2262b8e80941Smrg   unsigned int num_push_constants = 0;
2263b8e80941Smrg   unsigned int num_pull_constants = 0;
2264b8e80941Smrg
2265b8e80941Smrg   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2266b8e80941Smrg   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2267b8e80941Smrg
2268b8e80941Smrg   /* Default to -1 meaning no location */
2269b8e80941Smrg   memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
2270b8e80941Smrg   memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
2271b8e80941Smrg
2272b8e80941Smrg   int chunk_start = -1;
2273b8e80941Smrg   struct cplx_align align;
2274b8e80941Smrg   for (unsigned u = 0; u < uniforms; u++) {
2275b8e80941Smrg      if (!slots[u].is_live) {
2276b8e80941Smrg         assert(chunk_start == -1);
2277b8e80941Smrg         continue;
2278b8e80941Smrg      }
2279b8e80941Smrg
2280b8e80941Smrg      /* Skip subgroup_id_index to put it in the last push register. */
2281b8e80941Smrg      if (subgroup_id_index == (int)u)
2282b8e80941Smrg         continue;
2283b8e80941Smrg
2284b8e80941Smrg      if (chunk_start == -1) {
2285b8e80941Smrg         chunk_start = u;
2286b8e80941Smrg         align = slots[u].align;
2287b8e80941Smrg      } else {
2288b8e80941Smrg         /* Offset into the chunk */
2289b8e80941Smrg         unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
2290b8e80941Smrg
2291b8e80941Smrg         /* Shift the slot alignment down by the chunk offset so it is
2292b8e80941Smrg          * comparable with the base chunk alignment.
2293b8e80941Smrg          */
2294b8e80941Smrg         struct cplx_align slot_align = slots[u].align;
2295b8e80941Smrg         slot_align.offset =
2296b8e80941Smrg            (slot_align.offset - chunk_offset) & (align.mul - 1);
2297b8e80941Smrg
2298b8e80941Smrg         align = cplx_align_combine(align, slot_align);
2299b8e80941Smrg      }
2300b8e80941Smrg
2301b8e80941Smrg      /* Sanity check the alignment */
2302b8e80941Smrg      cplx_align_assert_sane(align);
2303b8e80941Smrg
2304b8e80941Smrg      if (slots[u].contiguous)
2305b8e80941Smrg         continue;
2306b8e80941Smrg
2307b8e80941Smrg      /* Adjust the alignment to be in terms of slots, not bytes */
2308b8e80941Smrg      assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
2309b8e80941Smrg      assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
2310b8e80941Smrg      align.mul /= UNIFORM_SLOT_SIZE;
2311b8e80941Smrg      align.offset /= UNIFORM_SLOT_SIZE;
2312b8e80941Smrg
2313b8e80941Smrg      unsigned push_start_align = cplx_align_apply(align, num_push_constants);
2314b8e80941Smrg      unsigned chunk_size = u - chunk_start + 1;
2315b8e80941Smrg      if ((!compiler->supports_pull_constants && u < UBO_START) ||
2316b8e80941Smrg          (chunk_size < max_chunk_size &&
2317b8e80941Smrg           push_start_align + chunk_size <= max_push_components)) {
2318b8e80941Smrg         /* Align up the number of push constants */
2319b8e80941Smrg         num_push_constants = push_start_align;
2320b8e80941Smrg         for (unsigned i = 0; i < chunk_size; i++)
2321b8e80941Smrg            push_constant_loc[chunk_start + i] = num_push_constants++;
2322b8e80941Smrg      } else {
2323b8e80941Smrg         /* We need to pull this one */
2324b8e80941Smrg         num_pull_constants = cplx_align_apply(align, num_pull_constants);
2325b8e80941Smrg         for (unsigned i = 0; i < chunk_size; i++)
2326b8e80941Smrg            pull_constant_loc[chunk_start + i] = num_pull_constants++;
2327b8e80941Smrg      }
2328b8e80941Smrg
2329b8e80941Smrg      /* Reset the chunk and start again */
2330b8e80941Smrg      chunk_start = -1;
2331b8e80941Smrg   }
2332b8e80941Smrg
2333b8e80941Smrg   /* Add the CS local thread ID uniform at the end of the push constants */
2334b8e80941Smrg   if (subgroup_id_index >= 0)
2335b8e80941Smrg      push_constant_loc[subgroup_id_index] = num_push_constants++;
2336b8e80941Smrg
2337b8e80941Smrg   /* As the uniforms are going to be reordered, stash the old array and
2338b8e80941Smrg    * create two new arrays for push/pull params.
2339b8e80941Smrg    */
2340b8e80941Smrg   uint32_t *param = stage_prog_data->param;
2341b8e80941Smrg   stage_prog_data->nr_params = num_push_constants;
2342b8e80941Smrg   if (num_push_constants) {
2343b8e80941Smrg      stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
2344b8e80941Smrg                                             num_push_constants);
2345b8e80941Smrg   } else {
2346b8e80941Smrg      stage_prog_data->param = NULL;
2347b8e80941Smrg   }
2348b8e80941Smrg   assert(stage_prog_data->nr_pull_params == 0);
2349b8e80941Smrg   assert(stage_prog_data->pull_param == NULL);
2350b8e80941Smrg   if (num_pull_constants > 0) {
2351b8e80941Smrg      stage_prog_data->nr_pull_params = num_pull_constants;
2352b8e80941Smrg      stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
2353b8e80941Smrg                                                  num_pull_constants);
2354b8e80941Smrg   }
2355b8e80941Smrg
2356b8e80941Smrg   /* Now that we know how many regular uniforms we'll push, reduce the
2357b8e80941Smrg    * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
2358b8e80941Smrg    */
2359b8e80941Smrg   unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
2360b8e80941Smrg   for (int i = 0; i < 4; i++) {
2361b8e80941Smrg      struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
2362b8e80941Smrg
2363b8e80941Smrg      if (push_length + range->length > 64)
2364b8e80941Smrg         range->length = 64 - push_length;
2365b8e80941Smrg
2366b8e80941Smrg      push_length += range->length;
2367b8e80941Smrg   }
2368b8e80941Smrg   assert(push_length <= 64);
2369b8e80941Smrg
2370b8e80941Smrg   /* Up until now, the param[] array has been indexed by reg + offset
2371b8e80941Smrg    * of UNIFORM registers.  Move pull constants into pull_param[] and
2372b8e80941Smrg    * condense param[] to only contain the uniforms we chose to push.
2373b8e80941Smrg    *
2374b8e80941Smrg    * NOTE: Because we are condensing the params[] array, we know that
2375b8e80941Smrg    * push_constant_loc[i] <= i and we can do it in one smooth loop without
2376b8e80941Smrg    * having to make a copy.
2377b8e80941Smrg    */
2378b8e80941Smrg   for (unsigned int i = 0; i < uniforms; i++) {
2379b8e80941Smrg      uint32_t value = param[i];
2380b8e80941Smrg      if (pull_constant_loc[i] != -1) {
2381b8e80941Smrg         stage_prog_data->pull_param[pull_constant_loc[i]] = value;
2382b8e80941Smrg      } else if (push_constant_loc[i] != -1) {
2383b8e80941Smrg         stage_prog_data->param[push_constant_loc[i]] = value;
2384b8e80941Smrg      }
2385b8e80941Smrg   }
2386b8e80941Smrg   ralloc_free(param);
2387b8e80941Smrg}
2388b8e80941Smrg
2389b8e80941Smrgbool
2390b8e80941Smrgfs_visitor::get_pull_locs(const fs_reg &src,
2391b8e80941Smrg                          unsigned *out_surf_index,
2392b8e80941Smrg                          unsigned *out_pull_index)
2393b8e80941Smrg{
2394b8e80941Smrg   assert(src.file == UNIFORM);
2395b8e80941Smrg
2396b8e80941Smrg   if (src.nr >= UBO_START) {
2397b8e80941Smrg      const struct brw_ubo_range *range =
2398b8e80941Smrg         &prog_data->ubo_ranges[src.nr - UBO_START];
2399b8e80941Smrg
2400b8e80941Smrg      /* If this access is in our (reduced) range, use the push data. */
2401b8e80941Smrg      if (src.offset / 32 < range->length)
2402b8e80941Smrg         return false;
2403b8e80941Smrg
2404b8e80941Smrg      *out_surf_index = prog_data->binding_table.ubo_start + range->block;
2405b8e80941Smrg      *out_pull_index = (32 * range->start + src.offset) / 4;
2406b8e80941Smrg      return true;
2407b8e80941Smrg   }
2408b8e80941Smrg
2409b8e80941Smrg   const unsigned location = src.nr + src.offset / 4;
2410b8e80941Smrg
2411b8e80941Smrg   if (location < uniforms && pull_constant_loc[location] != -1) {
2412b8e80941Smrg      /* A regular uniform push constant */
2413b8e80941Smrg      *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
2414b8e80941Smrg      *out_pull_index = pull_constant_loc[location];
2415b8e80941Smrg      return true;
2416b8e80941Smrg   }
2417b8e80941Smrg
2418b8e80941Smrg   return false;
2419b8e80941Smrg}
2420b8e80941Smrg
2421b8e80941Smrg/**
2422b8e80941Smrg * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2423b8e80941Smrg * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2424b8e80941Smrg */
2425b8e80941Smrgvoid
2426b8e80941Smrgfs_visitor::lower_constant_loads()
2427b8e80941Smrg{
2428b8e80941Smrg   unsigned index, pull_index;
2429b8e80941Smrg
2430b8e80941Smrg   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2431b8e80941Smrg      /* Set up the annotation tracking for new generated instructions. */
2432b8e80941Smrg      const fs_builder ibld(this, block, inst);
2433b8e80941Smrg
2434b8e80941Smrg      for (int i = 0; i < inst->sources; i++) {
2435b8e80941Smrg	 if (inst->src[i].file != UNIFORM)
2436b8e80941Smrg	    continue;
2437b8e80941Smrg
2438b8e80941Smrg         /* We'll handle this case later */
2439b8e80941Smrg         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
2440b8e80941Smrg            continue;
2441b8e80941Smrg
2442b8e80941Smrg         if (!get_pull_locs(inst->src[i], &index, &pull_index))
2443b8e80941Smrg	    continue;
2444b8e80941Smrg
2445b8e80941Smrg         assert(inst->src[i].stride == 0);
2446b8e80941Smrg
2447b8e80941Smrg         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
2448b8e80941Smrg         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
2449b8e80941Smrg         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
2450b8e80941Smrg         const unsigned base = pull_index * 4;
2451b8e80941Smrg
2452b8e80941Smrg         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
2453b8e80941Smrg                   dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
2454b8e80941Smrg
2455b8e80941Smrg         /* Rewrite the instruction to use the temporary VGRF. */
2456b8e80941Smrg         inst->src[i].file = VGRF;
2457b8e80941Smrg         inst->src[i].nr = dst.nr;
2458b8e80941Smrg         inst->src[i].offset = (base & (block_sz - 1)) +
2459b8e80941Smrg                               inst->src[i].offset % 4;
2460b8e80941Smrg      }
2461b8e80941Smrg
2462b8e80941Smrg      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
2463b8e80941Smrg          inst->src[0].file == UNIFORM) {
2464b8e80941Smrg
2465b8e80941Smrg         if (!get_pull_locs(inst->src[0], &index, &pull_index))
2466b8e80941Smrg            continue;
2467b8e80941Smrg
2468b8e80941Smrg         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
2469b8e80941Smrg                                    brw_imm_ud(index),
2470b8e80941Smrg                                    inst->src[1],
2471b8e80941Smrg                                    pull_index * 4);
2472b8e80941Smrg         inst->remove(block);
2473b8e80941Smrg      }
2474b8e80941Smrg   }
2475b8e80941Smrg   invalidate_live_intervals();
2476b8e80941Smrg}
2477b8e80941Smrg
2478b8e80941Smrgbool
2479b8e80941Smrgfs_visitor::opt_algebraic()
2480b8e80941Smrg{
2481b8e80941Smrg   bool progress = false;
2482b8e80941Smrg
2483b8e80941Smrg   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2484b8e80941Smrg      switch (inst->opcode) {
2485b8e80941Smrg      case BRW_OPCODE_MOV:
2486b8e80941Smrg         if (!devinfo->has_64bit_types &&
2487b8e80941Smrg             (inst->dst.type == BRW_REGISTER_TYPE_DF ||
2488b8e80941Smrg              inst->dst.type == BRW_REGISTER_TYPE_UQ ||
2489b8e80941Smrg              inst->dst.type == BRW_REGISTER_TYPE_Q)) {
2490b8e80941Smrg            assert(inst->dst.type == inst->src[0].type);
2491b8e80941Smrg            assert(!inst->saturate);
2492b8e80941Smrg            assert(!inst->src[0].abs);
2493b8e80941Smrg            assert(!inst->src[0].negate);
2494b8e80941Smrg            const brw::fs_builder ibld(this, block, inst);
2495b8e80941Smrg
2496b8e80941Smrg            if (inst->src[0].file == IMM) {
2497b8e80941Smrg               ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
2498b8e80941Smrg                        brw_imm_ud(inst->src[0].u64 >> 32));
2499b8e80941Smrg               ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
2500b8e80941Smrg                        brw_imm_ud(inst->src[0].u64));
2501b8e80941Smrg            } else {
2502b8e80941Smrg               ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
2503b8e80941Smrg                        subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1));
2504b8e80941Smrg               ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
2505b8e80941Smrg                        subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0));
2506b8e80941Smrg            }
2507b8e80941Smrg
2508b8e80941Smrg            inst->remove(block);
2509b8e80941Smrg            progress = true;
2510b8e80941Smrg         }
2511b8e80941Smrg
2512b8e80941Smrg         if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
2513b8e80941Smrg              inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
2514b8e80941Smrg             inst->dst.is_null() &&
2515b8e80941Smrg             (inst->src[0].abs || inst->src[0].negate)) {
2516b8e80941Smrg            inst->src[0].abs = false;
2517b8e80941Smrg            inst->src[0].negate = false;
2518b8e80941Smrg            progress = true;
2519b8e80941Smrg            break;
2520b8e80941Smrg         }
2521b8e80941Smrg
2522b8e80941Smrg         if (inst->src[0].file != IMM)
2523b8e80941Smrg            break;
2524b8e80941Smrg
2525b8e80941Smrg         if (inst->saturate) {
2526b8e80941Smrg            /* Full mixed-type saturates don't happen.  However, we can end up
2527b8e80941Smrg             * with things like:
2528b8e80941Smrg             *
2529b8e80941Smrg             *    mov.sat(8) g21<1>DF       -1F
2530b8e80941Smrg             *
2531b8e80941Smrg             * Other mixed-size-but-same-base-type cases may also be possible.
2532b8e80941Smrg             */
2533b8e80941Smrg            if (inst->dst.type != inst->src[0].type &&
2534b8e80941Smrg                inst->dst.type != BRW_REGISTER_TYPE_DF &&
2535b8e80941Smrg                inst->src[0].type != BRW_REGISTER_TYPE_F)
2536b8e80941Smrg               assert(!"unimplemented: saturate mixed types");
2537b8e80941Smrg
2538b8e80941Smrg            if (brw_saturate_immediate(inst->src[0].type,
2539b8e80941Smrg                                       &inst->src[0].as_brw_reg())) {
2540b8e80941Smrg               inst->saturate = false;
2541b8e80941Smrg               progress = true;
2542b8e80941Smrg            }
2543b8e80941Smrg         }
2544b8e80941Smrg         break;
2545b8e80941Smrg
2546b8e80941Smrg      case BRW_OPCODE_MUL:
2547b8e80941Smrg         if (inst->src[1].file != IMM)
2548b8e80941Smrg            continue;
2549b8e80941Smrg
2550b8e80941Smrg         /* a * 1.0 = a */
2551b8e80941Smrg         if (inst->src[1].is_one()) {
2552b8e80941Smrg            inst->opcode = BRW_OPCODE_MOV;
2553b8e80941Smrg            inst->src[1] = reg_undef;
2554b8e80941Smrg            progress = true;
2555b8e80941Smrg            break;
2556b8e80941Smrg         }
2557b8e80941Smrg
2558b8e80941Smrg         /* a * -1.0 = -a */
2559b8e80941Smrg         if (inst->src[1].is_negative_one()) {
2560b8e80941Smrg            inst->opcode = BRW_OPCODE_MOV;
2561b8e80941Smrg            inst->src[0].negate = !inst->src[0].negate;
2562b8e80941Smrg            inst->src[1] = reg_undef;
2563b8e80941Smrg            progress = true;
2564b8e80941Smrg            break;
2565b8e80941Smrg         }
2566b8e80941Smrg
2567b8e80941Smrg         if (inst->src[0].file == IMM) {
2568b8e80941Smrg            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2569b8e80941Smrg            inst->opcode = BRW_OPCODE_MOV;
2570b8e80941Smrg            inst->src[0].f *= inst->src[1].f;
2571b8e80941Smrg            inst->src[1] = reg_undef;
2572b8e80941Smrg            progress = true;
2573b8e80941Smrg            break;
2574b8e80941Smrg         }
2575b8e80941Smrg         break;
2576b8e80941Smrg      case BRW_OPCODE_ADD:
2577b8e80941Smrg         if (inst->src[1].file != IMM)
2578b8e80941Smrg            continue;
2579b8e80941Smrg
2580b8e80941Smrg         if (inst->src[0].file == IMM) {
2581b8e80941Smrg            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
2582b8e80941Smrg            inst->opcode = BRW_OPCODE_MOV;
2583b8e80941Smrg            inst->src[0].f += inst->src[1].f;
2584b8e80941Smrg            inst->src[1] = reg_undef;
2585b8e80941Smrg            progress = true;
2586b8e80941Smrg            break;
2587b8e80941Smrg         }
2588b8e80941Smrg         break;
2589b8e80941Smrg      case BRW_OPCODE_OR:
2590b8e80941Smrg         if (inst->src[0].equals(inst->src[1]) ||
2591b8e80941Smrg             inst->src[1].is_zero()) {
2592b8e80941Smrg            /* On Gen8+, the OR instruction can have a source modifier that
2593b8e80941Smrg             * performs logical not on the operand.  Cases of 'OR r0, ~r1, 0'
2594b8e80941Smrg             * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
2595b8e80941Smrg             */
2596b8e80941Smrg            if (inst->src[0].negate) {
2597b8e80941Smrg               inst->opcode = BRW_OPCODE_NOT;
2598b8e80941Smrg               inst->src[0].negate = false;
2599b8e80941Smrg            } else {
2600b8e80941Smrg               inst->opcode = BRW_OPCODE_MOV;
2601b8e80941Smrg            }
2602b8e80941Smrg            inst->src[1] = reg_undef;
2603b8e80941Smrg            progress = true;
2604b8e80941Smrg            break;
2605b8e80941Smrg         }
2606b8e80941Smrg         break;
2607b8e80941Smrg      case BRW_OPCODE_CMP:
2608b8e80941Smrg         if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
2609b8e80941Smrg              inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
2610b8e80941Smrg             inst->src[1].is_zero() &&
2611b8e80941Smrg             (inst->src[0].abs || inst->src[0].negate)) {
2612b8e80941Smrg            inst->src[0].abs = false;
2613b8e80941Smrg            inst->src[0].negate = false;
2614b8e80941Smrg            progress = true;
2615b8e80941Smrg            break;
2616b8e80941Smrg         }
2617b8e80941Smrg         break;
2618b8e80941Smrg      case BRW_OPCODE_SEL:
2619b8e80941Smrg         if (!devinfo->has_64bit_types &&
2620b8e80941Smrg             (inst->dst.type == BRW_REGISTER_TYPE_DF ||
2621b8e80941Smrg              inst->dst.type == BRW_REGISTER_TYPE_UQ ||
2622b8e80941Smrg              inst->dst.type == BRW_REGISTER_TYPE_Q)) {
2623b8e80941Smrg            assert(inst->dst.type == inst->src[0].type);
2624b8e80941Smrg            assert(!inst->saturate);
2625b8e80941Smrg            assert(!inst->src[0].abs && !inst->src[0].negate);
2626b8e80941Smrg            assert(!inst->src[1].abs && !inst->src[1].negate);
2627b8e80941Smrg            const brw::fs_builder ibld(this, block, inst);
2628b8e80941Smrg
2629b8e80941Smrg            set_predicate(inst->predicate,
2630b8e80941Smrg                          ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
2631b8e80941Smrg                                   subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
2632b8e80941Smrg                                   subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)));
2633b8e80941Smrg            set_predicate(inst->predicate,
2634b8e80941Smrg                          ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
2635b8e80941Smrg                                   subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
2636b8e80941Smrg                                   subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1)));
2637b8e80941Smrg
2638b8e80941Smrg            inst->remove(block);
2639b8e80941Smrg            progress = true;
2640b8e80941Smrg         }
2641b8e80941Smrg         if (inst->src[0].equals(inst->src[1])) {
2642b8e80941Smrg            inst->opcode = BRW_OPCODE_MOV;
2643b8e80941Smrg            inst->src[1] = reg_undef;
2644b8e80941Smrg            inst->predicate = BRW_PREDICATE_NONE;
2645b8e80941Smrg            inst->predicate_inverse = false;
2646b8e80941Smrg            progress = true;
2647b8e80941Smrg         } else if (inst->saturate && inst->src[1].file == IMM) {
2648b8e80941Smrg            switch (inst->conditional_mod) {
2649b8e80941Smrg            case BRW_CONDITIONAL_LE:
2650b8e80941Smrg            case BRW_CONDITIONAL_L:
2651b8e80941Smrg               switch (inst->src[1].type) {
2652b8e80941Smrg               case BRW_REGISTER_TYPE_F:
2653b8e80941Smrg                  if (inst->src[1].f >= 1.0f) {
2654b8e80941Smrg                     inst->opcode = BRW_OPCODE_MOV;
2655b8e80941Smrg                     inst->src[1] = reg_undef;
2656b8e80941Smrg                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
2657b8e80941Smrg                     progress = true;
2658b8e80941Smrg                  }
2659b8e80941Smrg                  break;
2660b8e80941Smrg               default:
2661b8e80941Smrg                  break;
2662b8e80941Smrg               }
2663b8e80941Smrg               break;
2664b8e80941Smrg            case BRW_CONDITIONAL_GE:
2665b8e80941Smrg            case BRW_CONDITIONAL_G:
2666b8e80941Smrg               switch (inst->src[1].type) {
2667b8e80941Smrg               case BRW_REGISTER_TYPE_F:
2668b8e80941Smrg                  if (inst->src[1].f <= 0.0f) {
2669b8e80941Smrg                     inst->opcode = BRW_OPCODE_MOV;
2670b8e80941Smrg                     inst->src[1] = reg_undef;
2671b8e80941Smrg                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
2672b8e80941Smrg                     progress = true;
2673b8e80941Smrg                  }
2674b8e80941Smrg                  break;
2675b8e80941Smrg               default:
2676b8e80941Smrg                  break;
2677b8e80941Smrg               }
2678b8e80941Smrg            default:
2679b8e80941Smrg               break;
2680b8e80941Smrg            }
2681b8e80941Smrg         }
2682b8e80941Smrg         break;
2683b8e80941Smrg      case BRW_OPCODE_MAD:
2684b8e80941Smrg         if (inst->src[0].type != BRW_REGISTER_TYPE_F ||
2685b8e80941Smrg             inst->src[1].type != BRW_REGISTER_TYPE_F ||
2686b8e80941Smrg             inst->src[2].type != BRW_REGISTER_TYPE_F)
2687b8e80941Smrg            break;
2688b8e80941Smrg         if (inst->src[1].is_one()) {
2689b8e80941Smrg            inst->opcode = BRW_OPCODE_ADD;
2690b8e80941Smrg            inst->src[1] = inst->src[2];
2691b8e80941Smrg            inst->src[2] = reg_undef;
2692b8e80941Smrg            progress = true;
2693b8e80941Smrg         } else if (inst->src[2].is_one()) {
2694b8e80941Smrg            inst->opcode = BRW_OPCODE_ADD;
2695b8e80941Smrg            inst->src[2] = reg_undef;
2696b8e80941Smrg            progress = true;
2697b8e80941Smrg         }
2698b8e80941Smrg         break;
2699b8e80941Smrg      case SHADER_OPCODE_BROADCAST:
2700b8e80941Smrg         if (is_uniform(inst->src[0])) {
2701b8e80941Smrg            inst->opcode = BRW_OPCODE_MOV;
2702b8e80941Smrg            inst->sources = 1;
2703b8e80941Smrg            inst->force_writemask_all = true;
2704b8e80941Smrg            progress = true;
2705b8e80941Smrg         } else if (inst->src[1].file == IMM) {
2706b8e80941Smrg            inst->opcode = BRW_OPCODE_MOV;
2707b8e80941Smrg            /* It's possible that the selected component will be too large and
2708b8e80941Smrg             * overflow the register.  This can happen if someone does a
2709b8e80941Smrg             * readInvocation() from GLSL or SPIR-V and provides an OOB
2710b8e80941Smrg             * invocationIndex.  If this happens and we some how manage
2711b8e80941Smrg             * to constant fold it in and get here, then component() may cause
2712b8e80941Smrg             * us to start reading outside of the VGRF which will lead to an
2713b8e80941Smrg             * assert later.  Instead, just let it wrap around if it goes over
2714b8e80941Smrg             * exec_size.
2715b8e80941Smrg             */
2716b8e80941Smrg            const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
2717b8e80941Smrg            inst->src[0] = component(inst->src[0], comp);
2718b8e80941Smrg            inst->sources = 1;
2719b8e80941Smrg            inst->force_writemask_all = true;
2720b8e80941Smrg            progress = true;
2721b8e80941Smrg         }
2722b8e80941Smrg         break;
2723b8e80941Smrg
2724b8e80941Smrg      case SHADER_OPCODE_SHUFFLE:
2725b8e80941Smrg         if (is_uniform(inst->src[0])) {
2726b8e80941Smrg            inst->opcode = BRW_OPCODE_MOV;
2727b8e80941Smrg            inst->sources = 1;
2728b8e80941Smrg            progress = true;
2729b8e80941Smrg         } else if (inst->src[1].file == IMM) {
2730b8e80941Smrg            inst->opcode = BRW_OPCODE_MOV;
2731b8e80941Smrg            inst->src[0] = component(inst->src[0],
2732b8e80941Smrg                                     inst->src[1].ud);
2733b8e80941Smrg            inst->sources = 1;
2734b8e80941Smrg            progress = true;
2735b8e80941Smrg         }
2736b8e80941Smrg         break;
2737b8e80941Smrg
2738b8e80941Smrg      default:
2739b8e80941Smrg	 break;
2740b8e80941Smrg      }
2741b8e80941Smrg
2742b8e80941Smrg      /* Swap if src[0] is immediate. */
2743b8e80941Smrg      if (progress && inst->is_commutative()) {
2744b8e80941Smrg         if (inst->src[0].file == IMM) {
2745b8e80941Smrg            fs_reg tmp = inst->src[1];
2746b8e80941Smrg            inst->src[1] = inst->src[0];
2747b8e80941Smrg            inst->src[0] = tmp;
2748b8e80941Smrg         }
2749b8e80941Smrg      }
2750b8e80941Smrg   }
2751b8e80941Smrg   return progress;
2752b8e80941Smrg}
2753b8e80941Smrg
2754b8e80941Smrg/**
2755b8e80941Smrg * Optimize sample messages that have constant zero values for the trailing
2756b8e80941Smrg * texture coordinates. We can just reduce the message length for these
2757b8e80941Smrg * instructions instead of reserving a register for it. Trailing parameters
2758b8e80941Smrg * that aren't sent default to zero anyway. This will cause the dead code
2759b8e80941Smrg * eliminator to remove the MOV instruction that would otherwise be emitted to
2760b8e80941Smrg * set up the zero value.
2761b8e80941Smrg */
2762b8e80941Smrgbool
2763b8e80941Smrgfs_visitor::opt_zero_samples()
2764b8e80941Smrg{
2765b8e80941Smrg   /* Gen4 infers the texturing opcode based on the message length so we can't
2766b8e80941Smrg    * change it.
2767b8e80941Smrg    */
2768b8e80941Smrg   if (devinfo->gen < 5)
2769b8e80941Smrg      return false;
2770b8e80941Smrg
2771b8e80941Smrg   bool progress = false;
2772b8e80941Smrg
2773b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
2774b8e80941Smrg      if (!inst->is_tex())
2775b8e80941Smrg         continue;
2776b8e80941Smrg
2777b8e80941Smrg      fs_inst *load_payload = (fs_inst *) inst->prev;
2778b8e80941Smrg
2779b8e80941Smrg      if (load_payload->is_head_sentinel() ||
2780b8e80941Smrg          load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
2781b8e80941Smrg         continue;
2782b8e80941Smrg
2783b8e80941Smrg      /* We don't want to remove the message header or the first parameter.
2784b8e80941Smrg       * Removing the first parameter is not allowed, see the Haswell PRM
2785b8e80941Smrg       * volume 7, page 149:
2786b8e80941Smrg       *
2787b8e80941Smrg       *     "Parameter 0 is required except for the sampleinfo message, which
2788b8e80941Smrg       *      has no parameter 0"
2789b8e80941Smrg       */
2790b8e80941Smrg      while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
2791b8e80941Smrg             load_payload->src[(inst->mlen - inst->header_size) /
2792b8e80941Smrg                               (inst->exec_size / 8) +
2793b8e80941Smrg                               inst->header_size - 1].is_zero()) {
2794b8e80941Smrg         inst->mlen -= inst->exec_size / 8;
2795b8e80941Smrg         progress = true;
2796b8e80941Smrg      }
2797b8e80941Smrg   }
2798b8e80941Smrg
2799b8e80941Smrg   if (progress)
2800b8e80941Smrg      invalidate_live_intervals();
2801b8e80941Smrg
2802b8e80941Smrg   return progress;
2803b8e80941Smrg}
2804b8e80941Smrg
/**
 * Optimize sample messages which are followed by the final RT write.
 *
 * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
 * results sent directly to the framebuffer, bypassing the EU.  Recognize the
 * final texturing results copied to the framebuffer write payload and modify
 * them to write to the framebuffer directly.
 */
bool
fs_visitor::opt_sampler_eot()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   /* Only fragment shaders emit RT writes, and only SIMD8/SIMD16 dispatch
    * is handled here.
    */
   if (stage != MESA_SHADER_FRAGMENT || dispatch_width > 16)
      return false;

   /* Only Cherryview and Gen9 hardware is handled (see header comment). */
   if (devinfo->gen != 9 && !devinfo->is_cherryview)
      return false;

   /* FINISHME: It should be possible to implement this optimization when there
    * are multiple drawbuffers.
    */
   if (key->nr_color_regions != 1)
      return false;

   /* Requires emitting a bunch of saturating MOV instructions during logical
    * send lowering to clamp the color payload, which the sampler unit isn't
    * going to do for us.
    */
   if (key->clamp_fragment_color)
      return false;

   /* Look for a texturing instruction immediately before the final FB_WRITE. */
   bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
   fs_inst *fb_write = (fs_inst *)block->end();
   assert(fb_write->eot);
   assert(fb_write->opcode == FS_OPCODE_FB_WRITE_LOGICAL);

   /* There wasn't one; nothing to do. */
   if (unlikely(fb_write->prev->is_head_sentinel()))
      return false;

   fs_inst *tex_inst = (fs_inst *) fb_write->prev;

   /* 3D Sampler » Messages » Message Format
    *
    * “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
    *  messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*”
    */
   if (tex_inst->opcode != SHADER_OPCODE_TEX_LOGICAL &&
       tex_inst->opcode != SHADER_OPCODE_TXD_LOGICAL &&
       tex_inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
       tex_inst->opcode != SHADER_OPCODE_TXL_LOGICAL &&
       tex_inst->opcode != FS_OPCODE_TXB_LOGICAL &&
       tex_inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL &&
       tex_inst->opcode != SHADER_OPCODE_TXF_CMS_W_LOGICAL &&
       tex_inst->opcode != SHADER_OPCODE_TXF_UMS_LOGICAL)
      return false;

   /* XXX - This shouldn't be necessary. */
   if (tex_inst->prev->is_head_sentinel())
      return false;

   /* Check that the FB write sources are fully initialized by the single
    * texturing instruction.
    */
   for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) {
      if (i == FB_WRITE_LOGICAL_SRC_COLOR0) {
         /* The color payload must be exactly the texturing result. */
         if (!fb_write->src[i].equals(tex_inst->dst) ||
             fb_write->size_read(i) != tex_inst->size_written)
            return false;
      } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) {
         /* All other FB write sources must be unused. */
         if (fb_write->src[i].file != BAD_FILE)
            return false;
      }
   }

   assert(!tex_inst->eot); /* We can't get here twice */
   assert((tex_inst->offset & (0xff << 24)) == 0);

   const fs_builder ibld(this, block, tex_inst);

   /* Pack the render-target index into the top byte of the message offset,
    * mark the sampler message EOT, and discard its EU-visible destination
    * since the result now goes straight to the framebuffer.
    */
   tex_inst->offset |= fb_write->target << 24;
   tex_inst->eot = true;
   tex_inst->dst = ibld.null_reg_ud();
   tex_inst->size_written = 0;
   fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);

   /* Marking EOT is sufficient, lower_logical_sends() will notice the EOT
    * flag and submit a header together with the sampler message as required
    * by the hardware.
    */
   invalidate_live_intervals();
   return true;
}
2900b8e80941Smrg
2901b8e80941Smrgbool
2902b8e80941Smrgfs_visitor::opt_register_renaming()
2903b8e80941Smrg{
2904b8e80941Smrg   bool progress = false;
2905b8e80941Smrg   int depth = 0;
2906b8e80941Smrg
2907b8e80941Smrg   unsigned remap[alloc.count];
2908b8e80941Smrg   memset(remap, ~0u, sizeof(unsigned) * alloc.count);
2909b8e80941Smrg
2910b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
2911b8e80941Smrg      if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2912b8e80941Smrg         depth++;
2913b8e80941Smrg      } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2914b8e80941Smrg                 inst->opcode == BRW_OPCODE_WHILE) {
2915b8e80941Smrg         depth--;
2916b8e80941Smrg      }
2917b8e80941Smrg
2918b8e80941Smrg      /* Rewrite instruction sources. */
2919b8e80941Smrg      for (int i = 0; i < inst->sources; i++) {
2920b8e80941Smrg         if (inst->src[i].file == VGRF &&
2921b8e80941Smrg             remap[inst->src[i].nr] != ~0u &&
2922b8e80941Smrg             remap[inst->src[i].nr] != inst->src[i].nr) {
2923b8e80941Smrg            inst->src[i].nr = remap[inst->src[i].nr];
2924b8e80941Smrg            progress = true;
2925b8e80941Smrg         }
2926b8e80941Smrg      }
2927b8e80941Smrg
2928b8e80941Smrg      const unsigned dst = inst->dst.nr;
2929b8e80941Smrg
2930b8e80941Smrg      if (depth == 0 &&
2931b8e80941Smrg          inst->dst.file == VGRF &&
2932b8e80941Smrg          alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
2933b8e80941Smrg          !inst->is_partial_write()) {
2934b8e80941Smrg         if (remap[dst] == ~0u) {
2935b8e80941Smrg            remap[dst] = dst;
2936b8e80941Smrg         } else {
2937b8e80941Smrg            remap[dst] = alloc.allocate(regs_written(inst));
2938b8e80941Smrg            inst->dst.nr = remap[dst];
2939b8e80941Smrg            progress = true;
2940b8e80941Smrg         }
2941b8e80941Smrg      } else if (inst->dst.file == VGRF &&
2942b8e80941Smrg                 remap[dst] != ~0u &&
2943b8e80941Smrg                 remap[dst] != dst) {
2944b8e80941Smrg         inst->dst.nr = remap[dst];
2945b8e80941Smrg         progress = true;
2946b8e80941Smrg      }
2947b8e80941Smrg   }
2948b8e80941Smrg
2949b8e80941Smrg   if (progress) {
2950b8e80941Smrg      invalidate_live_intervals();
2951b8e80941Smrg
2952b8e80941Smrg      for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2953b8e80941Smrg         if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != ~0u) {
2954b8e80941Smrg            delta_xy[i].nr = remap[delta_xy[i].nr];
2955b8e80941Smrg         }
2956b8e80941Smrg      }
2957b8e80941Smrg   }
2958b8e80941Smrg
2959b8e80941Smrg   return progress;
2960b8e80941Smrg}
2961b8e80941Smrg
2962b8e80941Smrg/**
2963b8e80941Smrg * Remove redundant or useless discard jumps.
2964b8e80941Smrg *
2965b8e80941Smrg * For example, we can eliminate jumps in the following sequence:
2966b8e80941Smrg *
2967b8e80941Smrg * discard-jump       (redundant with the next jump)
2968b8e80941Smrg * discard-jump       (useless; jumps to the next instruction)
2969b8e80941Smrg * placeholder-halt
2970b8e80941Smrg */
2971b8e80941Smrgbool
2972b8e80941Smrgfs_visitor::opt_redundant_discard_jumps()
2973b8e80941Smrg{
2974b8e80941Smrg   bool progress = false;
2975b8e80941Smrg
2976b8e80941Smrg   bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
2977b8e80941Smrg
2978b8e80941Smrg   fs_inst *placeholder_halt = NULL;
2979b8e80941Smrg   foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
2980b8e80941Smrg      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
2981b8e80941Smrg         placeholder_halt = inst;
2982b8e80941Smrg         break;
2983b8e80941Smrg      }
2984b8e80941Smrg   }
2985b8e80941Smrg
2986b8e80941Smrg   if (!placeholder_halt)
2987b8e80941Smrg      return false;
2988b8e80941Smrg
2989b8e80941Smrg   /* Delete any HALTs immediately before the placeholder halt. */
2990b8e80941Smrg   for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
2991b8e80941Smrg        !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
2992b8e80941Smrg        prev = (fs_inst *) placeholder_halt->prev) {
2993b8e80941Smrg      prev->remove(last_bblock);
2994b8e80941Smrg      progress = true;
2995b8e80941Smrg   }
2996b8e80941Smrg
2997b8e80941Smrg   if (progress)
2998b8e80941Smrg      invalidate_live_intervals();
2999b8e80941Smrg
3000b8e80941Smrg   return progress;
3001b8e80941Smrg}
3002b8e80941Smrg
3003b8e80941Smrg/**
3004b8e80941Smrg * Compute a bitmask with GRF granularity with a bit set for each GRF starting
3005b8e80941Smrg * from \p r.offset which overlaps the region starting at \p s.offset and
3006b8e80941Smrg * spanning \p ds bytes.
3007b8e80941Smrg */
3008b8e80941Smrgstatic inline unsigned
3009b8e80941Smrgmask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
3010b8e80941Smrg{
3011b8e80941Smrg   const int rel_offset = reg_offset(s) - reg_offset(r);
3012b8e80941Smrg   const int shift = rel_offset / REG_SIZE;
3013b8e80941Smrg   const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
3014b8e80941Smrg   assert(reg_space(r) == reg_space(s) &&
3015b8e80941Smrg          shift >= 0 && shift < int(8 * sizeof(unsigned)));
3016b8e80941Smrg   return ((1 << n) - 1) << shift;
3017b8e80941Smrg}
3018b8e80941Smrg
/**
 * Merge a flag-writing CMP/MOV followed by a predicated SEL into a single
 * CSEL instruction where possible.
 */
bool
fs_visitor::opt_peephole_csel()
{
   /* Requires Gen8+ hardware. */
   if (devinfo->gen < 8)
      return false;

   bool progress = false;

   foreach_block_reverse(block, cfg) {
      int ip = block->end_ip + 1;

      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
         ip--;

         /* Look for a normally-predicated SEL with an F, D, or UD
          * destination.
          */
         if (inst->opcode != BRW_OPCODE_SEL ||
             inst->predicate != BRW_PREDICATE_NORMAL ||
             (inst->dst.type != BRW_REGISTER_TYPE_F &&
              inst->dst.type != BRW_REGISTER_TYPE_D &&
              inst->dst.type != BRW_REGISTER_TYPE_UD))
            continue;

         /* Because it is a 3-src instruction, CSEL cannot have an immediate
          * value as a source, but we can sometimes handle zero.
          */
         if ((inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
              inst->src[0].file != UNIFORM) ||
             (inst->src[1].file != VGRF && inst->src[1].file != ATTR &&
              inst->src[1].file != UNIFORM && !inst->src[1].is_zero()))
            continue;

         /* Scan backwards for the instruction that wrote the flag. */
         foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
            if (!scan_inst->flags_written())
               continue;

            /* The flag writer must be an unpredicated float CMP or MOV whose
             * first source comes from a register file; anything else cannot
             * be folded into a CSEL condition.
             */
            if ((scan_inst->opcode != BRW_OPCODE_CMP &&
                 scan_inst->opcode != BRW_OPCODE_MOV) ||
                scan_inst->predicate != BRW_PREDICATE_NONE ||
                (scan_inst->src[0].file != VGRF &&
                 scan_inst->src[0].file != ATTR &&
                 scan_inst->src[0].file != UNIFORM) ||
                scan_inst->src[0].type != BRW_REGISTER_TYPE_F)
               break;

            /* A CMP can only be folded if it compares against zero. */
            if (scan_inst->opcode == BRW_OPCODE_CMP && !scan_inst->src[1].is_zero())
               break;

            const brw::fs_builder ibld(this, block, inst);

            /* An inverted SEL predicate is handled by negating the
             * conditional modifier instead.
             */
            const enum brw_conditional_mod cond =
               inst->predicate_inverse
               ? brw_negate_cmod(scan_inst->conditional_mod)
               : scan_inst->conditional_mod;

            fs_inst *csel_inst = NULL;

            if (inst->src[1].file != IMM) {
               csel_inst = ibld.CSEL(inst->dst,
                                     inst->src[0],
                                     inst->src[1],
                                     scan_inst->src[0],
                                     cond);
            } else if (cond == BRW_CONDITIONAL_NZ) {
               /* Consider the sequence
                *
                * cmp.nz.f0  null<1>F   g3<8,8,1>F   0F
                * (+f0) sel  g124<1>UD  g2<8,8,1>UD  0x00000000UD
                *
                * The sel will pick the immediate value 0 if r0 is ±0.0.
                * Therefore, this sequence is equivalent:
                *
                * cmp.nz.f0  null<1>F   g3<8,8,1>F   0F
                * (+f0) sel  g124<1>F   g2<8,8,1>F   (abs)g3<8,8,1>F
                *
                * The abs ensures that the result is 0UD when g3 is -0.0F.
                * By normal cmp-sel merging, this is also equivalent:
                *
                * csel.nz    g124<1>F   g2<4,4,1>F   (abs)g3<4,4,1>F  g3<4,4,1>F
                */
               csel_inst = ibld.CSEL(inst->dst,
                                     inst->src[0],
                                     scan_inst->src[0],
                                     scan_inst->src[0],
                                     cond);

               csel_inst->src[1].abs = true;
            }

            if (csel_inst != NULL) {
               progress = true;
               csel_inst->saturate = inst->saturate;
               inst->remove(block);
            }

            /* Only the nearest flag writer is considered; stop scanning. */
            break;
         }
      }
   }

   return progress;
}
3119b8e80941Smrg
/**
 * Eliminate GRF-to-MRF MOVs by rewriting the instructions that computed the
 * GRF so that they write the MRF directly.  Only meaningful before Gen7,
 * where MRFs exist.
 */
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   /* No MRFs on Gen >= 7. */
   if (devinfo->gen >= 7)
      return false;

   calculate_live_intervals();

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      /* Only consider a plain, full MOV of an unmodified, contiguous,
       * register-aligned GRF region into an MRF.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != VGRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate ||
          !inst->src[0].is_contiguous() ||
          inst->src[0].offset % REG_SIZE != 0)
         continue;

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].nr] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go rewrite the
       * things that computed the value of all GRFs of the source region.  The
       * regs_left bitset keeps track of the registers we haven't yet found a
       * generating instruction for.
       */
      unsigned regs_left = (1 << regs_read(inst, 0)) - 1;

      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Handling things not fully contained in the source of the copy
             * would need us to understand coalescing out more than one MOV at
             * a time.
             */
            if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
                                     inst->src[0], inst->size_read(0)))
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (devinfo->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            /* Clear the bits for any registers this instruction overwrites. */
            regs_left &= ~mask_relative_to(
               inst->src[0], scan_inst->dst, scan_inst->size_written);
            if (!regs_left)
               break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (block->start() == scan_inst)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < scan_inst->sources; i++) {
            if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
                                inst->src[0], inst->size_read(0))) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->dst, inst->size_written)) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            break;
         }

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
             regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
                             inst->dst, inst->size_written)) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            break;
         }
      }

      if (regs_left)
         continue;

      /* Found all generating instructions of our MRF's source value, so it
       * should be safe to rewrite them to point to the MRF directly.
       */
      regs_left = (1 << regs_read(inst, 0)) - 1;

      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
            /* Clear the bits for any registers this instruction overwrites. */
            regs_left &= ~mask_relative_to(
               inst->src[0], scan_inst->dst, scan_inst->size_written);

            /* Byte offset of this write within the copied GRF region. */
            const unsigned rel_offset = reg_offset(scan_inst->dst) -
                                        reg_offset(inst->src[0]);

            if (inst->dst.nr & BRW_MRF_COMPR4) {
               /* Apply the same address transformation done by the hardware
                * for COMPR4 MRF writes.
                */
               assert(rel_offset < 2 * REG_SIZE);
               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;

               /* Clear the COMPR4 bit if the generating instruction is not
                * compressed.
                */
               if (scan_inst->size_written < 2 * REG_SIZE)
                  scan_inst->dst.nr &= ~BRW_MRF_COMPR4;

            } else {
               /* Calculate the MRF number the result of this instruction is
                * ultimately written to.
                */
               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
            }

            scan_inst->dst.file = MRF;
            scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
            /* The MOV's saturate modifier must now be applied by the
             * generating instruction itself.
             */
            scan_inst->saturate |= inst->saturate;
            if (!regs_left)
               break;
         }
      }

      assert(!regs_left);
      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
3297b8e80941Smrg
3298b8e80941Smrg/**
3299b8e80941Smrg * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
3300b8e80941Smrg * flow.  We could probably do better here with some form of divergence
3301b8e80941Smrg * analysis.
3302b8e80941Smrg */
3303b8e80941Smrgbool
3304b8e80941Smrgfs_visitor::eliminate_find_live_channel()
3305b8e80941Smrg{
3306b8e80941Smrg   bool progress = false;
3307b8e80941Smrg   unsigned depth = 0;
3308b8e80941Smrg
3309b8e80941Smrg   if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
3310b8e80941Smrg      /* The optimization below assumes that channel zero is live on thread
3311b8e80941Smrg       * dispatch, which may not be the case if the fixed function dispatches
3312b8e80941Smrg       * threads sparsely.
3313b8e80941Smrg       */
3314b8e80941Smrg      return false;
3315b8e80941Smrg   }
3316b8e80941Smrg
3317b8e80941Smrg   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3318b8e80941Smrg      switch (inst->opcode) {
3319b8e80941Smrg      case BRW_OPCODE_IF:
3320b8e80941Smrg      case BRW_OPCODE_DO:
3321b8e80941Smrg         depth++;
3322b8e80941Smrg         break;
3323b8e80941Smrg
3324b8e80941Smrg      case BRW_OPCODE_ENDIF:
3325b8e80941Smrg      case BRW_OPCODE_WHILE:
3326b8e80941Smrg         depth--;
3327b8e80941Smrg         break;
3328b8e80941Smrg
3329b8e80941Smrg      case FS_OPCODE_DISCARD_JUMP:
3330b8e80941Smrg         /* This can potentially make control flow non-uniform until the end
3331b8e80941Smrg          * of the program.
3332b8e80941Smrg          */
3333b8e80941Smrg         return progress;
3334b8e80941Smrg
3335b8e80941Smrg      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3336b8e80941Smrg         if (depth == 0) {
3337b8e80941Smrg            inst->opcode = BRW_OPCODE_MOV;
3338b8e80941Smrg            inst->src[0] = brw_imm_ud(0u);
3339b8e80941Smrg            inst->sources = 1;
3340b8e80941Smrg            inst->force_writemask_all = true;
3341b8e80941Smrg            progress = true;
3342b8e80941Smrg         }
3343b8e80941Smrg         break;
3344b8e80941Smrg
3345b8e80941Smrg      default:
3346b8e80941Smrg         break;
3347b8e80941Smrg      }
3348b8e80941Smrg   }
3349b8e80941Smrg
3350b8e80941Smrg   return progress;
3351b8e80941Smrg}
3352b8e80941Smrg
3353b8e80941Smrg/**
3354b8e80941Smrg * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3355b8e80941Smrg * instructions to FS_OPCODE_REP_FB_WRITE.
3356b8e80941Smrg */
void
fs_visitor::emit_repclear_shader()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   /* MRFs base_mrf..base_mrf+1 hold the optional message header; the clear
    * color payload goes in the MRF after them.
    */
   int base_mrf = 0;
   int color_mrf = base_mrf + 2;
   fs_inst *mov;

   if (uniforms > 0) {
      /* The clear color comes from uniform 0.  The source is rewritten to a
       * fixed vec4 GRF region at the end of this function, once CURB setup
       * has assigned the uniform a hardware register.
       */
      mov = bld.exec_all().group(4, 0)
               .MOV(brw_message_reg(color_mrf),
                    fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
   } else {
      /* No uniforms: read the clear color from g2.3 with a strided float
       * region -- presumably placed there by the fixed-function payload;
       * TODO confirm against the dispatch setup.
       */
      struct brw_reg reg =
         brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
                 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);

      mov = bld.exec_all().group(4, 0)
               .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
   }

   fs_inst *write = NULL;
   if (key->nr_color_regions == 1) {
      /* Single render target: emit one headerless replicated-data write. */
      write = bld.emit(FS_OPCODE_REP_FB_WRITE);
      write->saturate = key->clamp_fragment_color;
      write->base_mrf = color_mrf;
      write->target = 0;
      write->header_size = 0;
      write->mlen = 1;
   } else {
      assume(key->nr_color_regions > 0);

      /* Multiple render targets need a message header: start from a copy of
       * g0 and patch component 2 (presumably the render target index, it
       * mirrors write->target below) for each subsequent write.
       */
      struct brw_reg header =
         retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD);
      bld.exec_all().group(16, 0)
         .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      for (int i = 0; i < key->nr_color_regions; ++i) {
         /* Only later iterations need to overwrite the index in the header. */
         if (i > 0) {
            bld.exec_all().group(1, 0)
               .MOV(component(header, 2), brw_imm_ud(i));
         }

         write = bld.emit(FS_OPCODE_REP_FB_WRITE);
         write->saturate = key->clamp_fragment_color;
         write->base_mrf = base_mrf;
         write->target = i;
         write->header_size = 2;
         write->mlen = 3;
      }
   }
   /* Only the final framebuffer write ends the thread. */
   write->eot = true;
   write->last_rt = true;

   calculate_cfg();

   assign_constant_locations();
   assign_curb_setup();

   /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
   if (uniforms > 0) {
      assert(mov->src[0].file == FIXED_GRF);
      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
   }
}
3423b8e80941Smrg
3424b8e80941Smrg/**
3425b8e80941Smrg * Walks through basic blocks, looking for repeated MRF writes and
3426b8e80941Smrg * removing the later ones.
3427b8e80941Smrg */
3428b8e80941Smrgbool
3429b8e80941Smrgfs_visitor::remove_duplicate_mrf_writes()
3430b8e80941Smrg{
3431b8e80941Smrg   fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
3432b8e80941Smrg   bool progress = false;
3433b8e80941Smrg
3434b8e80941Smrg   /* Need to update the MRF tracking for compressed instructions. */
3435b8e80941Smrg   if (dispatch_width >= 16)
3436b8e80941Smrg      return false;
3437b8e80941Smrg
3438b8e80941Smrg   memset(last_mrf_move, 0, sizeof(last_mrf_move));
3439b8e80941Smrg
3440b8e80941Smrg   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3441b8e80941Smrg      if (inst->is_control_flow()) {
3442b8e80941Smrg	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3443b8e80941Smrg      }
3444b8e80941Smrg
3445b8e80941Smrg      if (inst->opcode == BRW_OPCODE_MOV &&
3446b8e80941Smrg	  inst->dst.file == MRF) {
3447b8e80941Smrg         fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
3448b8e80941Smrg	 if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV &&
3449b8e80941Smrg             inst->dst.equals(prev_inst->dst) &&
3450b8e80941Smrg             inst->src[0].equals(prev_inst->src[0]) &&
3451b8e80941Smrg             inst->saturate == prev_inst->saturate &&
3452b8e80941Smrg             inst->predicate == prev_inst->predicate &&
3453b8e80941Smrg             inst->conditional_mod == prev_inst->conditional_mod &&
3454b8e80941Smrg             inst->exec_size == prev_inst->exec_size) {
3455b8e80941Smrg	    inst->remove(block);
3456b8e80941Smrg	    progress = true;
3457b8e80941Smrg	    continue;
3458b8e80941Smrg	 }
3459b8e80941Smrg      }
3460b8e80941Smrg
3461b8e80941Smrg      /* Clear out the last-write records for MRFs that were overwritten. */
3462b8e80941Smrg      if (inst->dst.file == MRF) {
3463b8e80941Smrg         last_mrf_move[inst->dst.nr] = NULL;
3464b8e80941Smrg      }
3465b8e80941Smrg
3466b8e80941Smrg      if (inst->mlen > 0 && inst->base_mrf != -1) {
3467b8e80941Smrg	 /* Found a SEND instruction, which will include two or fewer
3468b8e80941Smrg	  * implied MRF writes.  We could do better here.
3469b8e80941Smrg	  */
3470b8e80941Smrg	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3471b8e80941Smrg	    last_mrf_move[inst->base_mrf + i] = NULL;
3472b8e80941Smrg	 }
3473b8e80941Smrg      }
3474b8e80941Smrg
3475b8e80941Smrg      /* Clear out any MRF move records whose sources got overwritten. */
3476b8e80941Smrg      for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3477b8e80941Smrg         if (last_mrf_move[i] &&
3478b8e80941Smrg             regions_overlap(inst->dst, inst->size_written,
3479b8e80941Smrg                             last_mrf_move[i]->src[0],
3480b8e80941Smrg                             last_mrf_move[i]->size_read(0))) {
3481b8e80941Smrg            last_mrf_move[i] = NULL;
3482b8e80941Smrg         }
3483b8e80941Smrg      }
3484b8e80941Smrg
3485b8e80941Smrg      if (inst->opcode == BRW_OPCODE_MOV &&
3486b8e80941Smrg	  inst->dst.file == MRF &&
3487b8e80941Smrg	  inst->src[0].file != ARF &&
3488b8e80941Smrg	  !inst->is_partial_write()) {
3489b8e80941Smrg         last_mrf_move[inst->dst.nr] = inst;
3490b8e80941Smrg      }
3491b8e80941Smrg   }
3492b8e80941Smrg
3493b8e80941Smrg   if (progress)
3494b8e80941Smrg      invalidate_live_intervals();
3495b8e80941Smrg
3496b8e80941Smrg   return progress;
3497b8e80941Smrg}
3498b8e80941Smrg
/**
 * The IR attaches a rounding mode to each conversion instruction, but on the
 * hardware the rounding mode is persistent state.  Once a mode has been set,
 * instructions that would set the same mode again are redundant and can be
 * removed.
 *
 * This is useful for vector/matrix conversions, where setting the mode once
 * is enough for the whole vector or matrix.
 */
3507b8e80941Smrgbool
3508b8e80941Smrgfs_visitor::remove_extra_rounding_modes()
3509b8e80941Smrg{
3510b8e80941Smrg   bool progress = false;
3511b8e80941Smrg
3512b8e80941Smrg   foreach_block (block, cfg) {
3513b8e80941Smrg      brw_rnd_mode prev_mode = BRW_RND_MODE_UNSPECIFIED;
3514b8e80941Smrg
3515b8e80941Smrg      foreach_inst_in_block_safe (fs_inst, inst, block) {
3516b8e80941Smrg         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
3517b8e80941Smrg            assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
3518b8e80941Smrg            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
3519b8e80941Smrg            if (mode == prev_mode) {
3520b8e80941Smrg               inst->remove(block);
3521b8e80941Smrg               progress = true;
3522b8e80941Smrg            } else {
3523b8e80941Smrg               prev_mode = mode;
3524b8e80941Smrg            }
3525b8e80941Smrg         }
3526b8e80941Smrg      }
3527b8e80941Smrg   }
3528b8e80941Smrg
3529b8e80941Smrg   if (progress)
3530b8e80941Smrg      invalidate_live_intervals();
3531b8e80941Smrg
3532b8e80941Smrg   return progress;
3533b8e80941Smrg}
3534b8e80941Smrg
3535b8e80941Smrgstatic void
3536b8e80941Smrgclear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3537b8e80941Smrg{
3538b8e80941Smrg   /* Clear the flag for registers that actually got read (as expected). */
3539b8e80941Smrg   for (int i = 0; i < inst->sources; i++) {
3540b8e80941Smrg      int grf;
3541b8e80941Smrg      if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
3542b8e80941Smrg         grf = inst->src[i].nr;
3543b8e80941Smrg      } else {
3544b8e80941Smrg         continue;
3545b8e80941Smrg      }
3546b8e80941Smrg
3547b8e80941Smrg      if (grf >= first_grf &&
3548b8e80941Smrg          grf < first_grf + grf_len) {
3549b8e80941Smrg         deps[grf - first_grf] = false;
3550b8e80941Smrg         if (inst->exec_size == 16)
3551b8e80941Smrg            deps[grf - first_grf + 1] = false;
3552b8e80941Smrg      }
3553b8e80941Smrg   }
3554b8e80941Smrg}
3555b8e80941Smrg
3556b8e80941Smrg/**
3557b8e80941Smrg * Implements this workaround for the original 965:
3558b8e80941Smrg *
3559b8e80941Smrg *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3560b8e80941Smrg *      check for post destination dependencies on this instruction, software
3561b8e80941Smrg *      must ensure that there is no destination hazard for the case of ‘write
3562b8e80941Smrg *      followed by a posted write’ shown in the following example.
3563b8e80941Smrg *
3564b8e80941Smrg *      1. mov r3 0
3565b8e80941Smrg *      2. send r3.xy <rest of send instruction>
3566b8e80941Smrg *      3. mov r2 r3
3567b8e80941Smrg *
3568b8e80941Smrg *      Due to no post-destination dependency check on the ‘send’, the above
3569b8e80941Smrg *      code sequence could have two instructions (1 and 2) in flight at the
3570b8e80941Smrg *      same time that both consider ‘r3’ as the target of their final writes.
3571b8e80941Smrg */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                        fs_inst *inst)
{
   /* Track, per register the send writes, whether an older write could still
    * be outstanding (true = unresolved dependency).
    */
   int write_len = regs_written(inst);
   int first_write_grf = inst->dst.nr;
   bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   /* Registers the send itself reads don't need an extra resolving MOV. */
   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (block->start() == scan_inst && block->num != 0) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
                               first_write_grf + i);
         }
         return;
      }

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == VGRF) {
         for (unsigned i = 0; i < regs_written(scan_inst); i++) {
            int reg = scan_inst->dst.nr + i;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
               needs_dep[reg - first_write_grf] = false;
               /* A SIMD16 write covers the following register as well. */
               if (scan_inst->exec_size == 16)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}
3636b8e80941Smrg
3637b8e80941Smrg/**
3638b8e80941Smrg * Implements this workaround for the original 965:
3639b8e80941Smrg *
3640b8e80941Smrg *     "[DevBW, DevCL] Errata: A destination register from a send can not be
3641b8e80941Smrg *      used as a destination register until after it has been sourced by an
3642b8e80941Smrg *      instruction with a different destination register.
3643b8e80941Smrg */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
{
   /* Track, per register the send writes, whether the destination still
    * needs to be sourced by some instruction before it can be reused as a
    * destination (true = unresolved).
    */
   int write_len = regs_written(inst);
   unsigned first_write_grf = inst->dst.nr;
   bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
                               first_write_grf + i);
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == VGRF &&
          scan_inst->dst.nr >= first_write_grf &&
          scan_inst->dst.nr < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.nr - first_write_grf]) {
         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
                         scan_inst->dst.nr);
         needs_dep[scan_inst->dst.nr - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}
3693b8e80941Smrg
3694b8e80941Smrgvoid
3695b8e80941Smrgfs_visitor::insert_gen4_send_dependency_workarounds()
3696b8e80941Smrg{
3697b8e80941Smrg   if (devinfo->gen != 4 || devinfo->is_g4x)
3698b8e80941Smrg      return;
3699b8e80941Smrg
3700b8e80941Smrg   bool progress = false;
3701b8e80941Smrg
3702b8e80941Smrg   foreach_block_and_inst(block, fs_inst, inst, cfg) {
3703b8e80941Smrg      if (inst->mlen != 0 && inst->dst.file == VGRF) {
3704b8e80941Smrg         insert_gen4_pre_send_dependency_workarounds(block, inst);
3705b8e80941Smrg         insert_gen4_post_send_dependency_workarounds(block, inst);
3706b8e80941Smrg         progress = true;
3707b8e80941Smrg      }
3708b8e80941Smrg   }
3709b8e80941Smrg
3710b8e80941Smrg   if (progress)
3711b8e80941Smrg      invalidate_live_intervals();
3712b8e80941Smrg}
3713b8e80941Smrg
3714b8e80941Smrg/**
3715b8e80941Smrg * Turns the generic expression-style uniform pull constant load instruction
3716b8e80941Smrg * into a hardware-specific series of instructions for loading a pull
3717b8e80941Smrg * constant.
3718b8e80941Smrg *
3719b8e80941Smrg * The expression style allows the CSE pass before this to optimize out
3720b8e80941Smrg * repeated loads from the same offset, and gives the pre-register-allocation
3721b8e80941Smrg * scheduling full flexibility, while the conversion to native instructions
3722b8e80941Smrg * allows the post-register-allocation scheduler the best information
3723b8e80941Smrg * possible.
3724b8e80941Smrg *
3725b8e80941Smrg * Note that execution masking for setting up pull constant loads is special:
3726b8e80941Smrg * the channels that need to be written are unrelated to the current execution
3727b8e80941Smrg * mask, since a later instruction will use one of the result channels as a
3728b8e80941Smrg * source operand for all 8 or 16 of its channels.
3729b8e80941Smrg */
3730b8e80941Smrgvoid
3731b8e80941Smrgfs_visitor::lower_uniform_pull_constant_loads()
3732b8e80941Smrg{
3733b8e80941Smrg   foreach_block_and_inst (block, fs_inst, inst, cfg) {
3734b8e80941Smrg      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3735b8e80941Smrg         continue;
3736b8e80941Smrg
3737b8e80941Smrg      if (devinfo->gen >= 7) {
3738b8e80941Smrg         const fs_builder ubld = fs_builder(this, block, inst).exec_all();
3739b8e80941Smrg         const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
3740b8e80941Smrg
3741b8e80941Smrg         ubld.group(8, 0).MOV(payload,
3742b8e80941Smrg                              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
3743b8e80941Smrg         ubld.group(1, 0).MOV(component(payload, 2),
3744b8e80941Smrg                              brw_imm_ud(inst->src[1].ud / 16));
3745b8e80941Smrg
3746b8e80941Smrg         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3747b8e80941Smrg         inst->src[1] = payload;
3748b8e80941Smrg         inst->header_size = 1;
3749b8e80941Smrg         inst->mlen = 1;
3750b8e80941Smrg
3751b8e80941Smrg         invalidate_live_intervals();
3752b8e80941Smrg      } else {
3753b8e80941Smrg         /* Before register allocation, we didn't tell the scheduler about the
3754b8e80941Smrg          * MRF we use.  We know it's safe to use this MRF because nothing
3755b8e80941Smrg          * else does except for register spill/unspill, which generates and
3756b8e80941Smrg          * uses its MRF within a single IR instruction.
3757b8e80941Smrg          */
3758b8e80941Smrg         inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
3759b8e80941Smrg         inst->mlen = 1;
3760b8e80941Smrg      }
3761b8e80941Smrg   }
3762b8e80941Smrg}
3763b8e80941Smrg
/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a series of MOVs that build the
 * payload in consecutive registers: header sources are copied with
 * force_writemask_all SIMD8 MOVs, the remaining sources with full-width
 * MOVs, with special interleaving for COMPR4 MRF destinations.
 */
bool
fs_visitor::lower_load_payload()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == MRF || inst->dst.file == VGRF);
      assert(inst->saturate == false);
      fs_reg dst = inst->dst;

      /* Get rid of COMPR4.  We'll add it back in if we need it */
      if (dst.file == MRF)
         dst.nr = dst.nr & ~BRW_MRF_COMPR4;

      const fs_builder ibld(this, block, inst);
      /* Header copies are always 8 UD channels with all channels enabled. */
      const fs_builder hbld = ibld.exec_all().group(8, 0);

      for (uint8_t i = 0; i < inst->header_size; i++) {
         if (inst->src[i].file != BAD_FILE) {
            fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
            fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
            hbld.MOV(mov_dst, mov_src);
         }
         /* Advance one register even when the source is absent. */
         dst = offset(dst, hbld, 1);
      }

      if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
          inst->exec_size > 8) {
         /* In this case, the payload portion of the LOAD_PAYLOAD isn't
          * a straightforward copy.  Instead, the result of the
          * LOAD_PAYLOAD is treated as interleaved and the first four
          * non-header sources are unpacked as:
          *
          * m + 0: r0
          * m + 1: g0
          * m + 2: b0
          * m + 3: a0
          * m + 4: r1
          * m + 5: g1
          * m + 6: b1
          * m + 7: a1
          *
          * This is used for gen <= 5 fb writes.
          */
         assert(inst->exec_size == 16);
         assert(inst->header_size + 4 <= inst->sources);
         for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
            if (inst->src[i].file != BAD_FILE) {
               if (devinfo->has_compr4) {
                  /* Let the hardware interleave the halves for us. */
                  fs_reg compr4_dst = retype(dst, inst->src[i].type);
                  compr4_dst.nr |= BRW_MRF_COMPR4;
                  ibld.MOV(compr4_dst, inst->src[i]);
               } else {
                  /* Platform doesn't have COMPR4.  We have to fake it */
                  fs_reg mov_dst = retype(dst, inst->src[i].type);
                  ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
                  mov_dst.nr += 4;
                  ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
               }
            }

            dst.nr++;
         }

         /* The loop above only ever incremented us through the first set
          * of 4 registers.  However, thanks to the magic of COMPR4, we
          * actually wrote to the first 8 registers, so we need to take
          * that into account now.
          */
         dst.nr += 4;

         /* The COMPR4 code took care of the first 4 sources.  We'll let
          * the regular path handle any remaining sources.  Yes, we are
          * modifying the instruction but we're about to delete it so
          * this really doesn't hurt anything.
          */
         inst->header_size += 4;
      }

      /* Copy the remaining sources with full-width MOVs, one register
       * (group) per source.
       */
      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         if (inst->src[i].file != BAD_FILE) {
            dst.type = inst->src[i].type;
            ibld.MOV(dst, inst->src[i]);
         } else {
            dst.type = BRW_REGISTER_TYPE_UD;
         }
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
3865b8e80941Smrg
3866b8e80941Smrgbool
3867b8e80941Smrgfs_visitor::lower_linterp()
3868b8e80941Smrg{
3869b8e80941Smrg   bool progress = false;
3870b8e80941Smrg
3871b8e80941Smrg   if (devinfo->gen < 11)
3872b8e80941Smrg      return false;
3873b8e80941Smrg
3874b8e80941Smrg   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3875b8e80941Smrg      const fs_builder ibld(this, block, inst);
3876b8e80941Smrg
3877b8e80941Smrg      if (inst->opcode != FS_OPCODE_LINTERP)
3878b8e80941Smrg         continue;
3879b8e80941Smrg
3880b8e80941Smrg      fs_reg dwP = component(inst->src[1], 0);
3881b8e80941Smrg      fs_reg dwQ = component(inst->src[1], 1);
3882b8e80941Smrg      fs_reg dwR = component(inst->src[1], 3);
3883b8e80941Smrg      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 8); i++) {
3884b8e80941Smrg         const fs_builder hbld(ibld.half(i));
3885b8e80941Smrg         fs_reg dst = half(inst->dst, i);
3886b8e80941Smrg         fs_reg delta_xy = offset(inst->src[0], ibld, i);
3887b8e80941Smrg         hbld.MAD(dst, dwR, half(delta_xy, 0), dwP);
3888b8e80941Smrg         fs_inst *mad = hbld.MAD(dst, dst, half(delta_xy, 1), dwQ);
3889b8e80941Smrg
3890b8e80941Smrg         /* Propagate conditional mod and saturate from the original
3891b8e80941Smrg          * instruction to the second MAD instruction.
3892b8e80941Smrg          */
3893b8e80941Smrg         set_saturate(inst->saturate, mad);
3894b8e80941Smrg         set_condmod(inst->conditional_mod, mad);
3895b8e80941Smrg      }
3896b8e80941Smrg
3897b8e80941Smrg      inst->remove(block);
3898b8e80941Smrg      progress = true;
3899b8e80941Smrg   }
3900b8e80941Smrg
3901b8e80941Smrg   if (progress)
3902b8e80941Smrg      invalidate_live_intervals();
3903b8e80941Smrg
3904b8e80941Smrg   return progress;
3905b8e80941Smrg}
3906b8e80941Smrg
3907b8e80941Smrgbool
3908b8e80941Smrgfs_visitor::lower_integer_multiplication()
3909b8e80941Smrg{
3910b8e80941Smrg   bool progress = false;
3911b8e80941Smrg
3912b8e80941Smrg   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3913b8e80941Smrg      const fs_builder ibld(this, block, inst);
3914b8e80941Smrg
3915b8e80941Smrg      if (inst->opcode == BRW_OPCODE_MUL) {
3916b8e80941Smrg         if (inst->dst.is_accumulator() ||
3917b8e80941Smrg             (inst->dst.type != BRW_REGISTER_TYPE_D &&
3918b8e80941Smrg              inst->dst.type != BRW_REGISTER_TYPE_UD))
3919b8e80941Smrg            continue;
3920b8e80941Smrg
3921b8e80941Smrg         if (devinfo->has_integer_dword_mul)
3922b8e80941Smrg            continue;
3923b8e80941Smrg
3924b8e80941Smrg         if (inst->src[1].file == IMM &&
3925b8e80941Smrg             inst->src[1].ud < (1 << 16)) {
3926b8e80941Smrg            /* The MUL instruction isn't commutative. On Gen <= 6, only the low
3927b8e80941Smrg             * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
3928b8e80941Smrg             * src1 are used.
3929b8e80941Smrg             *
3930b8e80941Smrg             * If multiplying by an immediate value that fits in 16-bits, do a
3931b8e80941Smrg             * single MUL instruction with that value in the proper location.
3932b8e80941Smrg             */
3933b8e80941Smrg            if (devinfo->gen < 7) {
3934b8e80941Smrg               fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8),
3935b8e80941Smrg                          inst->dst.type);
3936b8e80941Smrg               ibld.MOV(imm, inst->src[1]);
3937b8e80941Smrg               ibld.MUL(inst->dst, imm, inst->src[0]);
3938b8e80941Smrg            } else {
3939b8e80941Smrg               const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD);
3940b8e80941Smrg               ibld.MUL(inst->dst, inst->src[0],
3941b8e80941Smrg                        ud ? brw_imm_uw(inst->src[1].ud)
3942b8e80941Smrg                           : brw_imm_w(inst->src[1].d));
3943b8e80941Smrg            }
3944b8e80941Smrg         } else {
3945b8e80941Smrg            /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
3946b8e80941Smrg             * do 32-bit integer multiplication in one instruction, but instead
3947b8e80941Smrg             * must do a sequence (which actually calculates a 64-bit result):
3948b8e80941Smrg             *
3949b8e80941Smrg             *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
3950b8e80941Smrg             *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
3951b8e80941Smrg             *    mov(8)  g2<1>D     acc0<8,8,1>D
3952b8e80941Smrg             *
3953b8e80941Smrg             * But on Gen > 6, the ability to use second accumulator register
3954b8e80941Smrg             * (acc1) for non-float data types was removed, preventing a simple
3955b8e80941Smrg             * implementation in SIMD16. A 16-channel result can be calculated by
3956b8e80941Smrg             * executing the three instructions twice in SIMD8, once with quarter
3957b8e80941Smrg             * control of 1Q for the first eight channels and again with 2Q for
3958b8e80941Smrg             * the second eight channels.
3959b8e80941Smrg             *
3960b8e80941Smrg             * Which accumulator register is implicitly accessed (by AccWrEnable
3961b8e80941Smrg             * for instance) is determined by the quarter control. Unfortunately
3962b8e80941Smrg             * Ivybridge (and presumably Baytrail) has a hardware bug in which an
3963b8e80941Smrg             * implicit accumulator access by an instruction with 2Q will access
3964b8e80941Smrg             * acc1 regardless of whether the data type is usable in acc1.
3965b8e80941Smrg             *
3966b8e80941Smrg             * Specifically, the 2Q mach(8) writes acc1 which does not exist for
3967b8e80941Smrg             * integer data types.
3968b8e80941Smrg             *
3969b8e80941Smrg             * Since we only want the low 32-bits of the result, we can do two
3970b8e80941Smrg             * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
3971b8e80941Smrg             * adjust the high result and add them (like the mach is doing):
3972b8e80941Smrg             *
3973b8e80941Smrg             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
3974b8e80941Smrg             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
3975b8e80941Smrg             *    shl(8)  g9<1>D     g8<8,8,1>D      16D
3976b8e80941Smrg             *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
3977b8e80941Smrg             *
3978b8e80941Smrg             * We avoid the shl instruction by realizing that we only want to add
3979b8e80941Smrg             * the low 16-bits of the "high" result to the high 16-bits of the
3980b8e80941Smrg             * "low" result and using proper regioning on the add:
3981b8e80941Smrg             *
3982b8e80941Smrg             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
3983b8e80941Smrg             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
3984b8e80941Smrg             *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
3985b8e80941Smrg             *
3986b8e80941Smrg             * Since it does not use the (single) accumulator register, we can
3987b8e80941Smrg             * schedule multi-component multiplications much better.
3988b8e80941Smrg             */
3989b8e80941Smrg
3990b8e80941Smrg            bool needs_mov = false;
3991b8e80941Smrg            fs_reg orig_dst = inst->dst;
3992b8e80941Smrg
3993b8e80941Smrg            /* Get a new VGRF for the "low" 32x16-bit multiplication result if
3994b8e80941Smrg             * reusing the original destination is impossible due to hardware
3995b8e80941Smrg             * restrictions, source/destination overlap, or it being the null
3996b8e80941Smrg             * register.
3997b8e80941Smrg             */
3998b8e80941Smrg            fs_reg low = inst->dst;
3999b8e80941Smrg            if (orig_dst.is_null() || orig_dst.file == MRF ||
4000b8e80941Smrg                regions_overlap(inst->dst, inst->size_written,
4001b8e80941Smrg                                inst->src[0], inst->size_read(0)) ||
4002b8e80941Smrg                regions_overlap(inst->dst, inst->size_written,
4003b8e80941Smrg                                inst->src[1], inst->size_read(1)) ||
4004b8e80941Smrg                inst->dst.stride >= 4) {
4005b8e80941Smrg               needs_mov = true;
4006b8e80941Smrg               low = fs_reg(VGRF, alloc.allocate(regs_written(inst)),
4007b8e80941Smrg                            inst->dst.type);
4008b8e80941Smrg            }
4009b8e80941Smrg
4010b8e80941Smrg            /* Get a new VGRF but keep the same stride as inst->dst */
4011b8e80941Smrg            fs_reg high(VGRF, alloc.allocate(regs_written(inst)),
4012b8e80941Smrg                        inst->dst.type);
4013b8e80941Smrg            high.stride = inst->dst.stride;
4014b8e80941Smrg            high.offset = inst->dst.offset % REG_SIZE;
4015b8e80941Smrg
4016b8e80941Smrg            if (devinfo->gen >= 7) {
4017b8e80941Smrg               if (inst->src[1].abs)
4018b8e80941Smrg                  lower_src_modifiers(this, block, inst, 1);
4019b8e80941Smrg
4020b8e80941Smrg               if (inst->src[1].file == IMM) {
4021b8e80941Smrg                  ibld.MUL(low, inst->src[0],
4022b8e80941Smrg                           brw_imm_uw(inst->src[1].ud & 0xffff));
4023b8e80941Smrg                  ibld.MUL(high, inst->src[0],
4024b8e80941Smrg                           brw_imm_uw(inst->src[1].ud >> 16));
4025b8e80941Smrg               } else {
4026b8e80941Smrg                  ibld.MUL(low, inst->src[0],
4027b8e80941Smrg                           subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
4028b8e80941Smrg                  ibld.MUL(high, inst->src[0],
4029b8e80941Smrg                           subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
4030b8e80941Smrg               }
4031b8e80941Smrg            } else {
4032b8e80941Smrg               if (inst->src[0].abs)
4033b8e80941Smrg                  lower_src_modifiers(this, block, inst, 0);
4034b8e80941Smrg
4035b8e80941Smrg               ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
4036b8e80941Smrg                        inst->src[1]);
4037b8e80941Smrg               ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
4038b8e80941Smrg                        inst->src[1]);
4039b8e80941Smrg            }
4040b8e80941Smrg
4041b8e80941Smrg            ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1),
4042b8e80941Smrg                     subscript(low, BRW_REGISTER_TYPE_UW, 1),
4043b8e80941Smrg                     subscript(high, BRW_REGISTER_TYPE_UW, 0));
4044b8e80941Smrg
4045b8e80941Smrg            if (needs_mov || inst->conditional_mod) {
4046b8e80941Smrg               set_condmod(inst->conditional_mod,
4047b8e80941Smrg                           ibld.MOV(orig_dst, low));
4048b8e80941Smrg            }
4049b8e80941Smrg         }
4050b8e80941Smrg
4051b8e80941Smrg      } else if (inst->opcode == SHADER_OPCODE_MULH) {
4052b8e80941Smrg         /* According to the BDW+ BSpec page for the "Multiply Accumulate
4053b8e80941Smrg          * High" instruction:
4054b8e80941Smrg          *
4055b8e80941Smrg          *  "An added preliminary mov is required for source modification on
4056b8e80941Smrg          *   src1:
4057b8e80941Smrg          *      mov (8) r3.0<1>:d -r3<8;8,1>:d
4058b8e80941Smrg          *      mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
4059b8e80941Smrg          *      mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
4060b8e80941Smrg          */
4061b8e80941Smrg         if (devinfo->gen >= 8 && (inst->src[1].negate || inst->src[1].abs))
4062b8e80941Smrg            lower_src_modifiers(this, block, inst, 1);
4063b8e80941Smrg
4064b8e80941Smrg         /* Should have been lowered to 8-wide. */
4065b8e80941Smrg         assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst));
4066b8e80941Smrg         const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
4067b8e80941Smrg                                   inst->dst.type);
4068b8e80941Smrg         fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
4069b8e80941Smrg         fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
4070b8e80941Smrg
4071b8e80941Smrg         if (devinfo->gen >= 8) {
4072b8e80941Smrg            /* Until Gen8, integer multiplies read 32-bits from one source,
4073b8e80941Smrg             * and 16-bits from the other, and relying on the MACH instruction
4074b8e80941Smrg             * to generate the high bits of the result.
4075b8e80941Smrg             *
4076b8e80941Smrg             * On Gen8, the multiply instruction does a full 32x32-bit
4077b8e80941Smrg             * multiply, but in order to do a 64-bit multiply we can simulate
4078b8e80941Smrg             * the previous behavior and then use a MACH instruction.
4079b8e80941Smrg             */
4080b8e80941Smrg            assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
4081b8e80941Smrg                   mul->src[1].type == BRW_REGISTER_TYPE_UD);
4082b8e80941Smrg            mul->src[1].type = BRW_REGISTER_TYPE_UW;
4083b8e80941Smrg            mul->src[1].stride *= 2;
4084b8e80941Smrg
4085b8e80941Smrg         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
4086b8e80941Smrg                    inst->group > 0) {
4087b8e80941Smrg            /* Among other things the quarter control bits influence which
4088b8e80941Smrg             * accumulator register is used by the hardware for instructions
4089b8e80941Smrg             * that access the accumulator implicitly (e.g. MACH).  A
4090b8e80941Smrg             * second-half instruction would normally map to acc1, which
4091b8e80941Smrg             * doesn't exist on Gen7 and up (the hardware does emulate it for
4092b8e80941Smrg             * floating-point instructions *only* by taking advantage of the
4093b8e80941Smrg             * extra precision of acc0 not normally used for floating point
4094b8e80941Smrg             * arithmetic).
4095b8e80941Smrg             *
4096b8e80941Smrg             * HSW and up are careful enough not to try to access an
4097b8e80941Smrg             * accumulator register that doesn't exist, but on earlier Gen7
4098b8e80941Smrg             * hardware we need to make sure that the quarter control bits are
4099b8e80941Smrg             * zero to avoid non-deterministic behaviour and emit an extra MOV
4100b8e80941Smrg             * to get the result masked correctly according to the current
4101b8e80941Smrg             * channel enables.
4102b8e80941Smrg             */
4103b8e80941Smrg            mach->group = 0;
4104b8e80941Smrg            mach->force_writemask_all = true;
4105b8e80941Smrg            mach->dst = ibld.vgrf(inst->dst.type);
4106b8e80941Smrg            ibld.MOV(inst->dst, mach->dst);
4107b8e80941Smrg         }
4108b8e80941Smrg      } else {
4109b8e80941Smrg         continue;
4110b8e80941Smrg      }
4111b8e80941Smrg
4112b8e80941Smrg      inst->remove(block);
4113b8e80941Smrg      progress = true;
4114b8e80941Smrg   }
4115b8e80941Smrg
4116b8e80941Smrg   if (progress)
4117b8e80941Smrg      invalidate_live_intervals();
4118b8e80941Smrg
4119b8e80941Smrg   return progress;
4120b8e80941Smrg}
4121b8e80941Smrg
4122b8e80941Smrgbool
4123b8e80941Smrgfs_visitor::lower_minmax()
4124b8e80941Smrg{
4125b8e80941Smrg   assert(devinfo->gen < 6);
4126b8e80941Smrg
4127b8e80941Smrg   bool progress = false;
4128b8e80941Smrg
4129b8e80941Smrg   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4130b8e80941Smrg      const fs_builder ibld(this, block, inst);
4131b8e80941Smrg
4132b8e80941Smrg      if (inst->opcode == BRW_OPCODE_SEL &&
4133b8e80941Smrg          inst->predicate == BRW_PREDICATE_NONE) {
4134b8e80941Smrg         /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
4135b8e80941Smrg          *        the original SEL.L/GE instruction
4136b8e80941Smrg          */
4137b8e80941Smrg         ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4138b8e80941Smrg                  inst->conditional_mod);
4139b8e80941Smrg         inst->predicate = BRW_PREDICATE_NORMAL;
4140b8e80941Smrg         inst->conditional_mod = BRW_CONDITIONAL_NONE;
4141b8e80941Smrg
4142b8e80941Smrg         progress = true;
4143b8e80941Smrg      }
4144b8e80941Smrg   }
4145b8e80941Smrg
4146b8e80941Smrg   if (progress)
4147b8e80941Smrg      invalidate_live_intervals();
4148b8e80941Smrg
4149b8e80941Smrg   return progress;
4150b8e80941Smrg}
4151b8e80941Smrg
4152b8e80941Smrgstatic void
4153b8e80941Smrgsetup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
4154b8e80941Smrg                    fs_reg *dst, fs_reg color, unsigned components)
4155b8e80941Smrg{
4156b8e80941Smrg   if (key->clamp_fragment_color) {
4157b8e80941Smrg      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
4158b8e80941Smrg      assert(color.type == BRW_REGISTER_TYPE_F);
4159b8e80941Smrg
4160b8e80941Smrg      for (unsigned i = 0; i < components; i++)
4161b8e80941Smrg         set_saturate(true,
4162b8e80941Smrg                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
4163b8e80941Smrg
4164b8e80941Smrg      color = tmp;
4165b8e80941Smrg   }
4166b8e80941Smrg
4167b8e80941Smrg   for (unsigned i = 0; i < components; i++)
4168b8e80941Smrg      dst[i] = offset(color, bld, i);
4169b8e80941Smrg}
4170b8e80941Smrg
4171b8e80941Smrgstatic void
4172b8e80941Smrglower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
4173b8e80941Smrg                            const struct brw_wm_prog_data *prog_data,
4174b8e80941Smrg                            const brw_wm_prog_key *key,
4175b8e80941Smrg                            const fs_visitor::thread_payload &payload)
4176b8e80941Smrg{
4177b8e80941Smrg   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
4178b8e80941Smrg   const gen_device_info *devinfo = bld.shader->devinfo;
4179b8e80941Smrg   const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
4180b8e80941Smrg   const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
4181b8e80941Smrg   const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
4182b8e80941Smrg   const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
4183b8e80941Smrg   const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
4184b8e80941Smrg   const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
4185b8e80941Smrg   fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
4186b8e80941Smrg   const unsigned components =
4187b8e80941Smrg      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
4188b8e80941Smrg
4189b8e80941Smrg   /* We can potentially have a message length of up to 15, so we have to set
4190b8e80941Smrg    * base_mrf to either 0 or 1 in order to fit in m0..m15.
4191b8e80941Smrg    */
4192b8e80941Smrg   fs_reg sources[15];
4193b8e80941Smrg   int header_size = 2, payload_header_size;
4194b8e80941Smrg   unsigned length = 0;
4195b8e80941Smrg
4196b8e80941Smrg   if (devinfo->gen < 6) {
4197b8e80941Smrg      /* TODO: Support SIMD32 on gen4-5 */
4198b8e80941Smrg      assert(bld.group() < 16);
4199b8e80941Smrg
4200b8e80941Smrg      /* For gen4-5, we always have a header consisting of g0 and g1.  We have
4201b8e80941Smrg       * an implied MOV from g0,g1 to the start of the message.  The MOV from
4202b8e80941Smrg       * g0 is handled by the hardware and the MOV from g1 is provided by the
4203b8e80941Smrg       * generator.  This is required because, on gen4-5, the generator may
4204b8e80941Smrg       * generate two write messages with different message lengths in order
4205b8e80941Smrg       * to handle AA data properly.
4206b8e80941Smrg       *
4207b8e80941Smrg       * Also, since the pixel mask goes in the g0 portion of the message and
4208b8e80941Smrg       * since render target writes are the last thing in the shader, we write
4209b8e80941Smrg       * the pixel mask directly into g0 and it will get copied as part of the
4210b8e80941Smrg       * implied write.
4211b8e80941Smrg       */
4212b8e80941Smrg      if (prog_data->uses_kill) {
4213b8e80941Smrg         bld.exec_all().group(1, 0)
4214b8e80941Smrg            .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
4215b8e80941Smrg                 brw_flag_reg(0, 1));
4216b8e80941Smrg      }
4217b8e80941Smrg
4218b8e80941Smrg      assert(length == 0);
4219b8e80941Smrg      length = 2;
4220b8e80941Smrg   } else if ((devinfo->gen <= 7 && !devinfo->is_haswell &&
4221b8e80941Smrg               prog_data->uses_kill) ||
4222b8e80941Smrg              color1.file != BAD_FILE ||
4223b8e80941Smrg              key->nr_color_regions > 1) {
4224b8e80941Smrg      /* From the Sandy Bridge PRM, volume 4, page 198:
4225b8e80941Smrg       *
4226b8e80941Smrg       *     "Dispatched Pixel Enables. One bit per pixel indicating
4227b8e80941Smrg       *      which pixels were originally enabled when the thread was
4228b8e80941Smrg       *      dispatched. This field is only required for the end-of-
4229b8e80941Smrg       *      thread message and on all dual-source messages."
4230b8e80941Smrg       */
4231b8e80941Smrg      const fs_builder ubld = bld.exec_all().group(8, 0);
4232b8e80941Smrg
4233b8e80941Smrg      fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4234b8e80941Smrg      if (bld.group() < 16) {
4235b8e80941Smrg         /* The header starts off as g0 and g1 for the first half */
4236b8e80941Smrg         ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
4237b8e80941Smrg                                              BRW_REGISTER_TYPE_UD));
4238b8e80941Smrg      } else {
4239b8e80941Smrg         /* The header starts off as g0 and g2 for the second half */
4240b8e80941Smrg         assert(bld.group() < 32);
4241b8e80941Smrg         const fs_reg header_sources[2] = {
4242b8e80941Smrg            retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
4243b8e80941Smrg            retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
4244b8e80941Smrg         };
4245b8e80941Smrg         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
4246b8e80941Smrg      }
4247b8e80941Smrg
4248b8e80941Smrg      uint32_t g00_bits = 0;
4249b8e80941Smrg
4250b8e80941Smrg      /* Set "Source0 Alpha Present to RenderTarget" bit in message
4251b8e80941Smrg       * header.
4252b8e80941Smrg       */
4253b8e80941Smrg      if (inst->target > 0 && prog_data->replicate_alpha)
4254b8e80941Smrg         g00_bits |= 1 << 11;
4255b8e80941Smrg
4256b8e80941Smrg      /* Set computes stencil to render target */
4257b8e80941Smrg      if (prog_data->computed_stencil)
4258b8e80941Smrg         g00_bits |= 1 << 14;
4259b8e80941Smrg
4260b8e80941Smrg      if (g00_bits) {
4261b8e80941Smrg         /* OR extra bits into g0.0 */
4262b8e80941Smrg         ubld.group(1, 0).OR(component(header, 0),
4263b8e80941Smrg                             retype(brw_vec1_grf(0, 0),
4264b8e80941Smrg                                    BRW_REGISTER_TYPE_UD),
4265b8e80941Smrg                             brw_imm_ud(g00_bits));
4266b8e80941Smrg      }
4267b8e80941Smrg
4268b8e80941Smrg      /* Set the render target index for choosing BLEND_STATE. */
4269b8e80941Smrg      if (inst->target > 0) {
4270b8e80941Smrg         ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
4271b8e80941Smrg      }
4272b8e80941Smrg
4273b8e80941Smrg      if (prog_data->uses_kill) {
4274b8e80941Smrg         assert(bld.group() < 16);
4275b8e80941Smrg         ubld.group(1, 0).MOV(retype(component(header, 15),
4276b8e80941Smrg                                     BRW_REGISTER_TYPE_UW),
4277b8e80941Smrg                              brw_flag_reg(0, 1));
4278b8e80941Smrg      }
4279b8e80941Smrg
4280b8e80941Smrg      assert(length == 0);
4281b8e80941Smrg      sources[0] = header;
4282b8e80941Smrg      sources[1] = horiz_offset(header, 8);
4283b8e80941Smrg      length = 2;
4284b8e80941Smrg   }
4285b8e80941Smrg   assert(length == 0 || length == 2);
4286b8e80941Smrg   header_size = length;
4287b8e80941Smrg
4288b8e80941Smrg   if (payload.aa_dest_stencil_reg[0]) {
4289b8e80941Smrg      assert(inst->group < 16);
4290b8e80941Smrg      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
4291b8e80941Smrg      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
4292b8e80941Smrg         .MOV(sources[length],
4293b8e80941Smrg              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
4294b8e80941Smrg      length++;
4295b8e80941Smrg   }
4296b8e80941Smrg
4297b8e80941Smrg   if (src0_alpha.file != BAD_FILE) {
4298b8e80941Smrg      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
4299b8e80941Smrg         const fs_builder &ubld = bld.exec_all().group(8, i)
4300b8e80941Smrg                                    .annotate("FB write src0 alpha");
4301b8e80941Smrg         const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
4302b8e80941Smrg         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
4303b8e80941Smrg         setup_color_payload(ubld, key, &sources[length], tmp, 1);
4304b8e80941Smrg         length++;
4305b8e80941Smrg      }
4306b8e80941Smrg   } else if (prog_data->replicate_alpha && inst->target != 0) {
4307b8e80941Smrg      /* Handle the case when fragment shader doesn't write to draw buffer
4308b8e80941Smrg       * zero. No need to call setup_color_payload() for src0_alpha because
4309b8e80941Smrg       * alpha value will be undefined.
4310b8e80941Smrg       */
4311b8e80941Smrg      length += bld.dispatch_width() / 8;
4312b8e80941Smrg   }
4313b8e80941Smrg
4314b8e80941Smrg   if (sample_mask.file != BAD_FILE) {
4315b8e80941Smrg      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
4316b8e80941Smrg                               BRW_REGISTER_TYPE_UD);
4317b8e80941Smrg
4318b8e80941Smrg      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
4319b8e80941Smrg       * relevant.  Since it's unsigned single words one vgrf is always
4320b8e80941Smrg       * 16-wide, but only the lower or higher 8 channels will be used by the
4321b8e80941Smrg       * hardware when doing a SIMD8 write depending on whether we have
4322b8e80941Smrg       * selected the subspans for the first or second half respectively.
4323b8e80941Smrg       */
4324b8e80941Smrg      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
4325b8e80941Smrg      sample_mask.type = BRW_REGISTER_TYPE_UW;
4326b8e80941Smrg      sample_mask.stride *= 2;
4327b8e80941Smrg
4328b8e80941Smrg      bld.exec_all().annotate("FB write oMask")
4329b8e80941Smrg         .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
4330b8e80941Smrg                           inst->group % 16),
4331b8e80941Smrg              sample_mask);
4332b8e80941Smrg      length++;
4333b8e80941Smrg   }
4334b8e80941Smrg
4335b8e80941Smrg   payload_header_size = length;
4336b8e80941Smrg
4337b8e80941Smrg   setup_color_payload(bld, key, &sources[length], color0, components);
4338b8e80941Smrg   length += 4;
4339b8e80941Smrg
4340b8e80941Smrg   if (color1.file != BAD_FILE) {
4341b8e80941Smrg      setup_color_payload(bld, key, &sources[length], color1, components);
4342b8e80941Smrg      length += 4;
4343b8e80941Smrg   }
4344b8e80941Smrg
4345b8e80941Smrg   if (src_depth.file != BAD_FILE) {
4346b8e80941Smrg      sources[length] = src_depth;
4347b8e80941Smrg      length++;
4348b8e80941Smrg   }
4349b8e80941Smrg
4350b8e80941Smrg   if (dst_depth.file != BAD_FILE) {
4351b8e80941Smrg      sources[length] = dst_depth;
4352b8e80941Smrg      length++;
4353b8e80941Smrg   }
4354b8e80941Smrg
4355b8e80941Smrg   if (src_stencil.file != BAD_FILE) {
4356b8e80941Smrg      assert(devinfo->gen >= 9);
4357b8e80941Smrg      assert(bld.dispatch_width() == 8);
4358b8e80941Smrg
4359b8e80941Smrg      /* XXX: src_stencil is only available on gen9+. dst_depth is never
4360b8e80941Smrg       * available on gen9+. As such it's impossible to have both enabled at the
4361b8e80941Smrg       * same time and therefore length cannot overrun the array.
4362b8e80941Smrg       */
4363b8e80941Smrg      assert(length < 15);
4364b8e80941Smrg
4365b8e80941Smrg      sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
4366b8e80941Smrg      bld.exec_all().annotate("FB write OS")
4367b8e80941Smrg         .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
4368b8e80941Smrg              subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
4369b8e80941Smrg      length++;
4370b8e80941Smrg   }
4371b8e80941Smrg
4372b8e80941Smrg   fs_inst *load;
4373b8e80941Smrg   if (devinfo->gen >= 7) {
4374b8e80941Smrg      /* Send from the GRF */
4375b8e80941Smrg      fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
4376b8e80941Smrg      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
4377b8e80941Smrg      payload.nr = bld.shader->alloc.allocate(regs_written(load));
4378b8e80941Smrg      load->dst = payload;
4379b8e80941Smrg
4380b8e80941Smrg      inst->src[0] = payload;
4381b8e80941Smrg      inst->resize_sources(1);
4382b8e80941Smrg   } else {
4383b8e80941Smrg      /* Send from the MRF */
4384b8e80941Smrg      load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
4385b8e80941Smrg                              sources, length, payload_header_size);
4386b8e80941Smrg
4387b8e80941Smrg      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
4388b8e80941Smrg       * will do this for us if we just give it a COMPR4 destination.
4389b8e80941Smrg       */
4390b8e80941Smrg      if (devinfo->gen < 6 && bld.dispatch_width() == 16)
4391b8e80941Smrg         load->dst.nr |= BRW_MRF_COMPR4;
4392b8e80941Smrg
4393b8e80941Smrg      if (devinfo->gen < 6) {
4394b8e80941Smrg         /* Set up src[0] for the implied MOV from grf0-1 */
4395b8e80941Smrg         inst->resize_sources(1);
4396b8e80941Smrg         inst->src[0] = brw_vec8_grf(0, 0);
4397b8e80941Smrg      } else {
4398b8e80941Smrg         inst->resize_sources(0);
4399b8e80941Smrg      }
4400b8e80941Smrg      inst->base_mrf = 1;
4401b8e80941Smrg   }
4402b8e80941Smrg
4403b8e80941Smrg   inst->opcode = FS_OPCODE_FB_WRITE;
4404b8e80941Smrg   inst->mlen = regs_written(load);
4405b8e80941Smrg   inst->header_size = header_size;
4406b8e80941Smrg}
4407b8e80941Smrg
4408b8e80941Smrgstatic void
4409b8e80941Smrglower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
4410b8e80941Smrg{
4411b8e80941Smrg   const fs_builder &ubld = bld.exec_all().group(8, 0);
4412b8e80941Smrg   const unsigned length = 2;
4413b8e80941Smrg   const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);
4414b8e80941Smrg
4415b8e80941Smrg   if (bld.group() < 16) {
4416b8e80941Smrg      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
4417b8e80941Smrg                                           BRW_REGISTER_TYPE_UD));
4418b8e80941Smrg   } else {
4419b8e80941Smrg      assert(bld.group() < 32);
4420b8e80941Smrg      const fs_reg header_sources[] = {
4421b8e80941Smrg         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
4422b8e80941Smrg         retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
4423b8e80941Smrg      };
4424b8e80941Smrg      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
4425b8e80941Smrg   }
4426b8e80941Smrg
4427b8e80941Smrg   inst->resize_sources(1);
4428b8e80941Smrg   inst->src[0] = header;
4429b8e80941Smrg   inst->opcode = FS_OPCODE_FB_READ;
4430b8e80941Smrg   inst->mlen = length;
4431b8e80941Smrg   inst->header_size = length;
4432b8e80941Smrg}
4433b8e80941Smrg
4434b8e80941Smrgstatic void
4435b8e80941Smrglower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
4436b8e80941Smrg                                const fs_reg &coordinate,
4437b8e80941Smrg                                const fs_reg &shadow_c,
4438b8e80941Smrg                                const fs_reg &lod, const fs_reg &lod2,
4439b8e80941Smrg                                const fs_reg &surface,
4440b8e80941Smrg                                const fs_reg &sampler,
4441b8e80941Smrg                                unsigned coord_components,
4442b8e80941Smrg                                unsigned grad_components)
4443b8e80941Smrg{
4444b8e80941Smrg   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
4445b8e80941Smrg                         op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
4446b8e80941Smrg   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
4447b8e80941Smrg   fs_reg msg_end = msg_begin;
4448b8e80941Smrg
4449b8e80941Smrg   /* g0 header. */
4450b8e80941Smrg   msg_end = offset(msg_end, bld.group(8, 0), 1);
4451b8e80941Smrg
4452b8e80941Smrg   for (unsigned i = 0; i < coord_components; i++)
4453b8e80941Smrg      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
4454b8e80941Smrg              offset(coordinate, bld, i));
4455b8e80941Smrg
4456b8e80941Smrg   msg_end = offset(msg_end, bld, coord_components);
4457b8e80941Smrg
4458b8e80941Smrg   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
4459b8e80941Smrg    * require all three components to be present and zero if they are unused.
4460b8e80941Smrg    */
4461b8e80941Smrg   if (coord_components > 0 &&
4462b8e80941Smrg       (has_lod || shadow_c.file != BAD_FILE ||
4463b8e80941Smrg        (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
4464b8e80941Smrg      for (unsigned i = coord_components; i < 3; i++)
4465b8e80941Smrg         bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
4466b8e80941Smrg
4467b8e80941Smrg      msg_end = offset(msg_end, bld, 3 - coord_components);
4468b8e80941Smrg   }
4469b8e80941Smrg
4470b8e80941Smrg   if (op == SHADER_OPCODE_TXD) {
4471b8e80941Smrg      /* TXD unsupported in SIMD16 mode. */
4472b8e80941Smrg      assert(bld.dispatch_width() == 8);
4473b8e80941Smrg
4474b8e80941Smrg      /* the slots for u and v are always present, but r is optional */
4475b8e80941Smrg      if (coord_components < 2)
4476b8e80941Smrg         msg_end = offset(msg_end, bld, 2 - coord_components);
4477b8e80941Smrg
4478b8e80941Smrg      /*  P   = u, v, r
4479b8e80941Smrg       * dPdx = dudx, dvdx, drdx
4480b8e80941Smrg       * dPdy = dudy, dvdy, drdy
4481b8e80941Smrg       *
4482b8e80941Smrg       * 1-arg: Does not exist.
4483b8e80941Smrg       *
4484b8e80941Smrg       * 2-arg: dudx   dvdx   dudy   dvdy
4485b8e80941Smrg       *        dPdx.x dPdx.y dPdy.x dPdy.y
4486b8e80941Smrg       *        m4     m5     m6     m7
4487b8e80941Smrg       *
4488b8e80941Smrg       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
4489b8e80941Smrg       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
4490b8e80941Smrg       *        m5     m6     m7     m8     m9     m10
4491b8e80941Smrg       */
4492b8e80941Smrg      for (unsigned i = 0; i < grad_components; i++)
4493b8e80941Smrg         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
4494b8e80941Smrg
4495b8e80941Smrg      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
4496b8e80941Smrg
4497b8e80941Smrg      for (unsigned i = 0; i < grad_components; i++)
4498b8e80941Smrg         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
4499b8e80941Smrg
4500b8e80941Smrg      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
4501b8e80941Smrg   }
4502b8e80941Smrg
4503b8e80941Smrg   if (has_lod) {
4504b8e80941Smrg      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
4505b8e80941Smrg       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
4506b8e80941Smrg       */
4507b8e80941Smrg      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
4508b8e80941Smrg             bld.dispatch_width() == 16);
4509b8e80941Smrg
4510b8e80941Smrg      const brw_reg_type type =
4511b8e80941Smrg         (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
4512b8e80941Smrg          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
4513b8e80941Smrg      bld.MOV(retype(msg_end, type), lod);
4514b8e80941Smrg      msg_end = offset(msg_end, bld, 1);
4515b8e80941Smrg   }
4516b8e80941Smrg
4517b8e80941Smrg   if (shadow_c.file != BAD_FILE) {
4518b8e80941Smrg      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
4519b8e80941Smrg         /* There's no plain shadow compare message, so we use shadow
4520b8e80941Smrg          * compare with a bias of 0.0.
4521b8e80941Smrg          */
4522b8e80941Smrg         bld.MOV(msg_end, brw_imm_f(0.0f));
4523b8e80941Smrg         msg_end = offset(msg_end, bld, 1);
4524b8e80941Smrg      }
4525b8e80941Smrg
4526b8e80941Smrg      bld.MOV(msg_end, shadow_c);
4527b8e80941Smrg      msg_end = offset(msg_end, bld, 1);
4528b8e80941Smrg   }
4529b8e80941Smrg
4530b8e80941Smrg   inst->opcode = op;
4531b8e80941Smrg   inst->src[0] = reg_undef;
4532b8e80941Smrg   inst->src[1] = surface;
4533b8e80941Smrg   inst->src[2] = sampler;
4534b8e80941Smrg   inst->resize_sources(3);
4535b8e80941Smrg   inst->base_mrf = msg_begin.nr;
4536b8e80941Smrg   inst->mlen = msg_end.nr - msg_begin.nr;
4537b8e80941Smrg   inst->header_size = 1;
4538b8e80941Smrg}
4539b8e80941Smrg
/**
 * Lower a logical sampler instruction to a Gen5/Gen6-style sampler message
 * built directly in MRF registers starting at m2 (or m1 when a header is
 * required for texel offsets).
 *
 * The payload is assembled per-opcode: coordinates first, then the
 * shadow comparator (if any), then the opcode-specific LOD-like arguments.
 * On completion \p inst is rewritten in place into the physical opcode \p op
 * with base_mrf/mlen/header_size describing the message.
 */
static void
lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                const fs_reg &lod, const fs_reg &lod2,
                                const fs_reg &sample_index,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   /* Message payload starts at m2; msg_coords tracks where the coordinates
    * go, message.nr is adjusted below if a header is needed.
    */
   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
   fs_reg msg_coords = message;
   unsigned header_size = 0;

   if (inst->offset != 0) {
      /* The offsets set up by the visitor are in the m1 header, so we can't
       * go headerless.
       */
      header_size = 1;
      message.nr--;
   }

   /* Copy the coordinate components into the start of the payload,
    * preserving the coordinate's register type (float or integer).
    */
   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   /* By default the LOD-style arguments live after four coordinate slots
    * regardless of how many coordinates were actually written; msg_end
    * tracks the end of the payload written so far.
    */
   fs_reg msg_end = offset(msg_coords, bld, coord_components);
   fs_reg msg_lod = offset(msg_coords, bld, 4);

   if (shadow_c.file != BAD_FILE) {
      /* The shadow comparator occupies the slot where the LOD would
       * otherwise go; push the LOD one slot further.
       */
      fs_reg msg_shadow = msg_lod;
      bld.MOV(msg_shadow, shadow_c);
      msg_lod = offset(msg_shadow, bld, 1);
      msg_end = msg_lod;
   }

   switch (op) {
   case SHADER_OPCODE_TXL:
   case FS_OPCODE_TXB:
      /* Explicit LOD or LOD bias goes in a single float slot. */
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXD:
      /**
       *  P   =  u,    v,    r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx   dudy   dvdx   dvdy   drdx   drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      /* Gradients (lod = dPdx, lod2 = dPdy) are interleaved per component
       * immediately after msg_lod's position.
       */
      msg_end = msg_lod;
      for (unsigned i = 0; i < grad_components; i++) {
         bld.MOV(msg_end, offset(lod, bld, i));
         msg_end = offset(msg_end, bld, 1);

         bld.MOV(msg_end, offset(lod2, bld, i));
         msg_end = offset(msg_end, bld, 1);
      }
      break;
   case SHADER_OPCODE_TXS:
      /* resinfo takes an unsigned LOD immediately after the coordinates. */
      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF:
      /* ld places the unsigned LOD in the fourth slot: u, v, r, lod. */
      msg_lod = offset(msg_coords, bld, 3);
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF_CMS:
      /* ld2dms payload: u, v, r, lod (always 0 here), sample index. */
      msg_lod = offset(msg_coords, bld, 3);
      /* lod */
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
      /* sample index */
      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
      msg_end = offset(msg_lod, bld, 2);
      break;
   default:
      break;
   }

   /* Rewrite the logical instruction into the physical sampler message. */
   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = surface;
   inst->src[2] = sampler;
   inst->resize_sources(3);
   inst->base_mrf = message.nr;
   inst->mlen = msg_end.nr - message.nr;
   inst->header_size = header_size;

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}
4636b8e80941Smrg
4637b8e80941Smrgstatic bool
4638b8e80941Smrgis_high_sampler(const struct gen_device_info *devinfo, const fs_reg &sampler)
4639b8e80941Smrg{
4640b8e80941Smrg   if (devinfo->gen < 8 && !devinfo->is_haswell)
4641b8e80941Smrg      return false;
4642b8e80941Smrg
4643b8e80941Smrg   return sampler.file != IMM || sampler.ud >= 16;
4644b8e80941Smrg}
4645b8e80941Smrg
4646b8e80941Smrgstatic unsigned
4647b8e80941Smrgsampler_msg_type(const gen_device_info *devinfo,
4648b8e80941Smrg                 opcode opcode, bool shadow_compare)
4649b8e80941Smrg{
4650b8e80941Smrg   assert(devinfo->gen >= 5);
4651b8e80941Smrg   switch (opcode) {
4652b8e80941Smrg   case SHADER_OPCODE_TEX:
4653b8e80941Smrg      return shadow_compare ? GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
4654b8e80941Smrg                              GEN5_SAMPLER_MESSAGE_SAMPLE;
4655b8e80941Smrg   case FS_OPCODE_TXB:
4656b8e80941Smrg      return shadow_compare ? GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
4657b8e80941Smrg                              GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
4658b8e80941Smrg   case SHADER_OPCODE_TXL:
4659b8e80941Smrg      return shadow_compare ? GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
4660b8e80941Smrg                              GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
4661b8e80941Smrg   case SHADER_OPCODE_TXL_LZ:
4662b8e80941Smrg      return shadow_compare ? GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
4663b8e80941Smrg                              GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
4664b8e80941Smrg   case SHADER_OPCODE_TXS:
4665b8e80941Smrg   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
4666b8e80941Smrg      return GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
4667b8e80941Smrg   case SHADER_OPCODE_TXD:
4668b8e80941Smrg      assert(!shadow_compare || devinfo->gen >= 8 || devinfo->is_haswell);
4669b8e80941Smrg      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
4670b8e80941Smrg                              GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
4671b8e80941Smrg   case SHADER_OPCODE_TXF:
4672b8e80941Smrg      return GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
4673b8e80941Smrg   case SHADER_OPCODE_TXF_LZ:
4674b8e80941Smrg      assert(devinfo->gen >= 9);
4675b8e80941Smrg      return GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
4676b8e80941Smrg   case SHADER_OPCODE_TXF_CMS_W:
4677b8e80941Smrg      assert(devinfo->gen >= 9);
4678b8e80941Smrg      return GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
4679b8e80941Smrg   case SHADER_OPCODE_TXF_CMS:
4680b8e80941Smrg      return devinfo->gen >= 7 ? GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
4681b8e80941Smrg                                 GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
4682b8e80941Smrg   case SHADER_OPCODE_TXF_UMS:
4683b8e80941Smrg      assert(devinfo->gen >= 7);
4684b8e80941Smrg      return GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
4685b8e80941Smrg   case SHADER_OPCODE_TXF_MCS:
4686b8e80941Smrg      assert(devinfo->gen >= 7);
4687b8e80941Smrg      return GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
4688b8e80941Smrg   case SHADER_OPCODE_LOD:
4689b8e80941Smrg      return GEN5_SAMPLER_MESSAGE_LOD;
4690b8e80941Smrg   case SHADER_OPCODE_TG4:
4691b8e80941Smrg      assert(devinfo->gen >= 7);
4692b8e80941Smrg      return shadow_compare ? GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
4693b8e80941Smrg                              GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
4694b8e80941Smrg      break;
4695b8e80941Smrg   case SHADER_OPCODE_TG4_OFFSET:
4696b8e80941Smrg      assert(devinfo->gen >= 7);
4697b8e80941Smrg      return shadow_compare ? GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
4698b8e80941Smrg                              GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
4699b8e80941Smrg   case SHADER_OPCODE_SAMPLEINFO:
4700b8e80941Smrg      return GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
4701b8e80941Smrg   default:
4702b8e80941Smrg      unreachable("not reached");
4703b8e80941Smrg   }
4704b8e80941Smrg}
4705b8e80941Smrg
/**
 * Lower a logical sampler instruction into a Gen7+ SHADER_OPCODE_SEND.
 *
 * The per-channel arguments are gathered into a payload assembled with
 * LOAD_PAYLOAD (optionally preceded by a message header), and the message
 * descriptor is either computed as an immediate (when surface and sampler
 * indices are known at compile time) or built in a register at run time.
 *
 * Note the \p lod parameter is taken by value: it is re-purposed per opcode
 * (bias, explicit LOD, dPdx gradients, or resinfo level).
 */
static void
lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                fs_reg lod, const fs_reg &lod2,
                                const fs_reg &min_lod,
                                const fs_reg &sample_index,
                                const fs_reg &mcs,
                                const fs_reg &surface,
                                const fs_reg &sampler,
                                const fs_reg &surface_handle,
                                const fs_reg &sampler_handle,
                                const fs_reg &tg4_offset,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const gen_device_info *devinfo = bld.shader->devinfo;
   const brw_stage_prog_data *prog_data = bld.shader->stage_prog_data;
   unsigned reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   /* One VGRF per payload slot; `length` counts how many get used. */
   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);

   /* We must have exactly one of surface/sampler and surface/sampler_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));

   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
       inst->offset != 0 || inst->eot ||
       op == SHADER_OPCODE_SAMPLEINFO ||
       sampler_handle.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler)) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.
       *
       * TG4 needs to place its channel select in the header, for interaction
       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
       * larger sampler numbers we need to offset the Sampler State Pointer in
       * the header.
       */
      fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
      header_size = 1;
      length++;

      /* If we're requesting fewer than four channels worth of response,
       * and we have an explicit header, we need to set up the sampler
       * writemask.  It's reversed from normal: 1 means "don't write".
       */
      if (!inst->eot && regs_written(inst) != 4 * reg_width) {
         assert(regs_written(inst) % reg_width == 0);
         unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
         inst->offset |= mask << 12;
      }

      /* Build the actual header */
      const fs_builder ubld = bld.exec_all().group(8, 0);
      const fs_builder ubld1 = ubld.group(1, 0);
      /* Start from g0 so the header inherits the standard R0 contents. */
      ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      if (inst->offset) {
         /* DWord 2 carries the texel offsets / writemask bits. */
         ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
         /* The vertex and fragment stages have g0.2 set to 0, so
          * header0.2 is 0 when g0 is copied. Other stages may not, so we
          * must set it to 0 to avoid setting undesirable bits in the
          * message.
          */
         ubld1.MOV(component(header, 2), brw_imm_ud(0));
      }

      if (sampler_handle.file != BAD_FILE) {
         /* Bindless sampler handles aren't relative to the sampler state
          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
          * Instead, it's an absolute pointer relative to dynamic state base
          * address.
          *
          * Sampler states are 16 bytes each and the pointer we give here has
          * to be 32-byte aligned.  In order to avoid more indirect messages
          * than required, we assume that all bindless sampler states are
          * 32-byte aligned.  This sacrifices a bit of general state base
          * address space but means we can do something more efficient in the
          * shader.
          */
         ubld1.MOV(component(header, 3), sampler_handle);
      } else if (is_high_sampler(devinfo, sampler)) {
         if (sampler.file == BRW_IMMEDIATE_VALUE) {
            assert(sampler.ud >= 16);
            const int sampler_state_size = 16; /* 16 bytes */

            /* Bump the Sampler State Pointer (header DWord 3, seeded from
             * g0.3) by 16 states per group of 16 samplers.
             */
            ubld1.ADD(component(header, 3),
                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                      brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
         } else {
            /* Dynamic index: compute (sampler & ~0xf) * 16 at run time.
             * AND with 0x0f0 keeps the high nibble, SHL 4 scales by the
             * 16-byte sampler state size times 16 samplers.
             */
            fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
            ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
            ubld1.SHL(tmp, tmp, brw_imm_ud(4));
            ubld1.ADD(component(header, 3),
                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
                      tmp);
         }
      }
   }

   /* The shadow comparator always comes first, right after the header. */
   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   /* Set when an opcode-specific case below has already interleaved the
    * coordinates into the payload.
    */
   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (op) {
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXL:
      /* On Gen9+ a compile-time-zero LOD can use the shorter sample_lz
       * message and skip the LOD slot entirely.
       */
      if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
         op = SHADER_OPCODE_TXL_LZ;
         break;
      }
      bld.MOV(sources[length], lod);
      length++;
      break;
   case SHADER_OPCODE_TXD:
      /* TXD should have been lowered in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length++], offset(coordinate, bld, i));

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length++], offset(lod, bld, i));
            bld.MOV(sources[length++], offset(lod2, bld, i));
         }
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TXS:
      /* resinfo takes an unsigned LOD. */
      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
      length++;
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      /* We need an LOD; just use 0 */
      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
      length++;
      break;
   case SHADER_OPCODE_TXF:
      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
       * On Gen9 they are u, v, lod, r
       */
      bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);

      if (devinfo->gen >= 9) {
         if (coord_components >= 2) {
            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
                    offset(coordinate, bld, 1));
         } else {
            /* 1D fetch: pad the v slot with zero. */
            sources[length] = brw_imm_d(0);
         }
         length++;
      }

      /* Gen9+ can drop a compile-time-zero LOD and use the ld_lz message. */
      if (devinfo->gen >= 9 && lod.is_zero()) {
         op = SHADER_OPCODE_TXF_LZ;
      } else {
         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
         length++;
      }

      /* Remaining coordinate components follow the LOD slot. */
      for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++)
         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;

   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
      /* Sample index comes first for the multisample fetch messages. */
      if (op == SHADER_OPCODE_TXF_UMS ||
          op == SHADER_OPCODE_TXF_CMS ||
          op == SHADER_OPCODE_TXF_CMS_W) {
         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
         length++;
      }

      if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
         /* Data from the multisample control surface. */
         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
         length++;

         /* On Gen9+ we'll use ld2dms_w instead which has two registers for
          * the MCS data.
          */
         if (op == SHADER_OPCODE_TXF_CMS_W) {
            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
                    mcs.file == IMM ?
                    mcs :
                    offset(mcs, bld, 1));
            length++;
         }
      }

      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) /* offu, offv */
         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
                 offset(tg4_offset, bld, i));

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(sources[length++], offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE) {
      /* Account for all of the missing coordinate sources */
      length += 4 - coord_components;
      if (op == SHADER_OPCODE_TXD)
         length += (3 - grad_components) * 2;

      bld.MOV(sources[length++], min_lod);
   }

   /* In SIMD16 the header occupies only one register, not reg_width, so the
    * message length is one register shorter than length * reg_width.
    */
   unsigned mlen;
   if (reg_width == 2)
      mlen = length * reg_width - header_size;
   else
      mlen = length * reg_width;

   const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
                                     BRW_REGISTER_TYPE_F);
   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);

   /* Generate the SEND. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;

   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare);
   const unsigned simd_mode =
      inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                             BRW_SAMPLER_SIMD_MODE_SIMD16;

   /* Pick the binding-table section the surface index is relative to. */
   uint32_t base_binding_table_index;
   switch (op) {
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
      base_binding_table_index = prog_data->binding_table.gather_texture_start;
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      base_binding_table_index = prog_data->binding_table.image_start;
      break;
   default:
      base_binding_table_index = prog_data->binding_table.texture_start;
      break;
   }

   inst->sfid = BRW_SFID_SAMPLER;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      /* Everything is known at compile time: fully immediate descriptor. */
      inst->desc = brw_sampler_desc(devinfo,
                                    surface.ud + base_binding_table_index,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gen7+ */);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      assert(devinfo->gen >= 9);
      inst->desc = brw_sampler_desc(devinfo,
                                    GEN9_BTI_BINDLESS,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gen7+ */);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
       */
      if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
         inst->src[0] = brw_imm_ud(0);
      } else {
         /* Dynamic sampler index: shift it into the descriptor's sampler
          * field (bits 11:8) with a scalar SHL.
          */
         const fs_builder ubld = bld.group(1, 0).exec_all();
         fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(desc, sampler, brw_imm_ud(8));
         inst->src[0] = desc;
      }

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
   } else {
      /* Immediate portion of the descriptor */
      inst->desc = brw_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gen7+ */);
      /* Build the dynamic surface/sampler part of the descriptor with
       * scalar ALU ops; the result lands in bits 11:0.
       */
      const fs_builder ubld = bld.group(1, 0).exec_all();
      fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
         ubld.MUL(desc, surface, brw_imm_ud(0x101));
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, brw_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      if (base_binding_table_index)
         ubld.ADD(desc, desc, brw_imm_ud(base_binding_table_index));
      /* Mask to the 12 bits the surface/sampler fields occupy. */
      ubld.AND(desc, desc, brw_imm_ud(0xfff));

      inst->src[0] = component(desc, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }

   inst->src[2] = src_payload;
   inst->resize_sources(3);

   if (inst->eot) {
      /* EOT sampler messages don't make sense to split because it would
       * involve ending half of the thread early.
       */
      assert(inst->group == 0);
      /* We need to use SENDC for EOT sampler messages */
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   }

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}
5077b8e80941Smrg
5078b8e80941Smrgstatic void
5079b8e80941Smrglower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
5080b8e80941Smrg{
5081b8e80941Smrg   const gen_device_info *devinfo = bld.shader->devinfo;
5082b8e80941Smrg   const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
5083b8e80941Smrg   const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
5084b8e80941Smrg   const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
5085b8e80941Smrg   const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
5086b8e80941Smrg   const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
5087b8e80941Smrg   const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
5088b8e80941Smrg   const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
5089b8e80941Smrg   const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
5090b8e80941Smrg   const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
5091b8e80941Smrg   const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
5092b8e80941Smrg   const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
5093b8e80941Smrg   const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
5094b8e80941Smrg   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
5095b8e80941Smrg   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
5096b8e80941Smrg   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
5097b8e80941Smrg   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
5098b8e80941Smrg
5099b8e80941Smrg   if (devinfo->gen >= 7) {
5100b8e80941Smrg      lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
5101b8e80941Smrg                                      shadow_c, lod, lod2, min_lod,
5102b8e80941Smrg                                      sample_index,
5103b8e80941Smrg                                      mcs, surface, sampler,
5104b8e80941Smrg                                      surface_handle, sampler_handle,
5105b8e80941Smrg                                      tg4_offset,
5106b8e80941Smrg                                      coord_components, grad_components);
5107b8e80941Smrg   } else if (devinfo->gen >= 5) {
5108b8e80941Smrg      lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
5109b8e80941Smrg                                      shadow_c, lod, lod2, sample_index,
5110b8e80941Smrg                                      surface, sampler,
5111b8e80941Smrg                                      coord_components, grad_components);
5112b8e80941Smrg   } else {
5113b8e80941Smrg      lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
5114b8e80941Smrg                                      shadow_c, lod, lod2,
5115b8e80941Smrg                                      surface, sampler,
5116b8e80941Smrg                                      coord_components, grad_components);
5117b8e80941Smrg   }
5118b8e80941Smrg}
5119b8e80941Smrg
5120b8e80941Smrg/**
5121b8e80941Smrg * Initialize the header present in some typed and untyped surface
5122b8e80941Smrg * messages.
5123b8e80941Smrg */
5124b8e80941Smrgstatic fs_reg
5125b8e80941Smrgemit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
5126b8e80941Smrg{
5127b8e80941Smrg   fs_builder ubld = bld.exec_all().group(8, 0);
5128b8e80941Smrg   const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5129b8e80941Smrg   ubld.MOV(dst, brw_imm_d(0));
5130b8e80941Smrg   ubld.group(1, 0).MOV(component(dst, 7), sample_mask);
5131b8e80941Smrg   return dst;
5132b8e80941Smrg}
5133b8e80941Smrg
5134b8e80941Smrgstatic void
5135b8e80941Smrglower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
5136b8e80941Smrg{
5137b8e80941Smrg   const gen_device_info *devinfo = bld.shader->devinfo;
5138b8e80941Smrg
5139b8e80941Smrg   /* Get the logical send arguments. */
5140b8e80941Smrg   const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
5141b8e80941Smrg   const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
5142b8e80941Smrg   const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
5143b8e80941Smrg   const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
5144b8e80941Smrg   const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
5145b8e80941Smrg   const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
5146b8e80941Smrg   assert(arg.file == IMM);
5147b8e80941Smrg
5148b8e80941Smrg   /* We must have exactly one of surface and surface_handle */
5149b8e80941Smrg   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
5150b8e80941Smrg
5151b8e80941Smrg   /* Calculate the total number of components of the payload. */
5152b8e80941Smrg   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
5153b8e80941Smrg   const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
5154b8e80941Smrg
5155b8e80941Smrg   const bool is_typed_access =
5156b8e80941Smrg      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
5157b8e80941Smrg      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
5158b8e80941Smrg      inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;
5159b8e80941Smrg
5160b8e80941Smrg   /* From the BDW PRM Volume 7, page 147:
5161b8e80941Smrg    *
5162b8e80941Smrg    *  "For the Data Cache Data Port*, the header must be present for the
5163b8e80941Smrg    *   following message types: [...] Typed read/write/atomics"
5164b8e80941Smrg    *
5165b8e80941Smrg    * Earlier generations have a similar wording.  Because of this restriction
5166b8e80941Smrg    * we don't attempt to implement sample masks via predication for such
5167b8e80941Smrg    * messages prior to Gen9, since we have to provide a header anyway.  On
5168b8e80941Smrg    * Gen11+ the header has been removed so we can only use predication.
5169b8e80941Smrg    */
5170b8e80941Smrg   const unsigned header_sz = devinfo->gen < 9 && is_typed_access ? 1 : 0;
5171b8e80941Smrg
5172b8e80941Smrg   const bool has_side_effects = inst->has_side_effects();
5173b8e80941Smrg   fs_reg sample_mask = has_side_effects ? bld.sample_mask_reg() :
5174b8e80941Smrg                                           fs_reg(brw_imm_d(0xffff));
5175b8e80941Smrg
5176b8e80941Smrg   fs_reg payload, payload2;
5177b8e80941Smrg   unsigned mlen, ex_mlen = 0;
5178b8e80941Smrg   if (devinfo->gen >= 9) {
5179b8e80941Smrg      /* We have split sends on gen9 and above */
5180b8e80941Smrg      assert(header_sz == 0);
5181b8e80941Smrg      payload = bld.move_to_vgrf(addr, addr_sz);
5182b8e80941Smrg      payload2 = bld.move_to_vgrf(src, src_sz);
5183b8e80941Smrg      mlen = addr_sz * (inst->exec_size / 8);
5184b8e80941Smrg      ex_mlen = src_sz * (inst->exec_size / 8);
5185b8e80941Smrg   } else {
5186b8e80941Smrg      /* Allocate space for the payload. */
5187b8e80941Smrg      const unsigned sz = header_sz + addr_sz + src_sz;
5188b8e80941Smrg      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
5189b8e80941Smrg      fs_reg *const components = new fs_reg[sz];
5190b8e80941Smrg      unsigned n = 0;
5191b8e80941Smrg
5192b8e80941Smrg      /* Construct the payload. */
5193b8e80941Smrg      if (header_sz)
5194b8e80941Smrg         components[n++] = emit_surface_header(bld, sample_mask);
5195b8e80941Smrg
5196b8e80941Smrg      for (unsigned i = 0; i < addr_sz; i++)
5197b8e80941Smrg         components[n++] = offset(addr, bld, i);
5198b8e80941Smrg
5199b8e80941Smrg      for (unsigned i = 0; i < src_sz; i++)
5200b8e80941Smrg         components[n++] = offset(src, bld, i);
5201b8e80941Smrg
5202b8e80941Smrg      bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
5203b8e80941Smrg      mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
5204b8e80941Smrg
5205b8e80941Smrg      delete[] components;
5206b8e80941Smrg   }
5207b8e80941Smrg
5208b8e80941Smrg   /* Predicate the instruction on the sample mask if no header is
5209b8e80941Smrg    * provided.
5210b8e80941Smrg    */
5211b8e80941Smrg   if (!header_sz && sample_mask.file != BAD_FILE &&
5212b8e80941Smrg       sample_mask.file != IMM) {
5213b8e80941Smrg      const fs_builder ubld = bld.group(1, 0).exec_all();
5214b8e80941Smrg      if (inst->predicate) {
5215b8e80941Smrg         assert(inst->predicate == BRW_PREDICATE_NORMAL);
5216b8e80941Smrg         assert(!inst->predicate_inverse);
5217b8e80941Smrg         assert(inst->flag_subreg < 2);
5218b8e80941Smrg         /* Combine the sample mask with the existing predicate by using a
5219b8e80941Smrg          * vertical predication mode.
5220b8e80941Smrg          */
5221b8e80941Smrg         inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
5222b8e80941Smrg         ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg + 2),
5223b8e80941Smrg                         sample_mask.type),
5224b8e80941Smrg                  sample_mask);
5225b8e80941Smrg      } else {
5226b8e80941Smrg         inst->flag_subreg = 2;
5227b8e80941Smrg         inst->predicate = BRW_PREDICATE_NORMAL;
5228b8e80941Smrg         inst->predicate_inverse = false;
5229b8e80941Smrg         ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type),
5230b8e80941Smrg                  sample_mask);
5231b8e80941Smrg      }
5232b8e80941Smrg   }
5233b8e80941Smrg
5234b8e80941Smrg   uint32_t sfid;
5235b8e80941Smrg   switch (inst->opcode) {
5236b8e80941Smrg   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
5237b8e80941Smrg   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
5238b8e80941Smrg      /* Byte scattered opcodes go through the normal data cache */
5239b8e80941Smrg      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
5240b8e80941Smrg      break;
5241b8e80941Smrg
5242b8e80941Smrg   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
5243b8e80941Smrg   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
5244b8e80941Smrg   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
5245b8e80941Smrg   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
5246b8e80941Smrg      /* Untyped Surface messages go through the data cache but the SFID value
5247b8e80941Smrg       * changed on Haswell.
5248b8e80941Smrg       */
5249b8e80941Smrg      sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
5250b8e80941Smrg              HSW_SFID_DATAPORT_DATA_CACHE_1 :
5251b8e80941Smrg              GEN7_SFID_DATAPORT_DATA_CACHE);
5252b8e80941Smrg      break;
5253b8e80941Smrg
5254b8e80941Smrg   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
5255b8e80941Smrg   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
5256b8e80941Smrg   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
5257b8e80941Smrg      /* Typed surface messages go through the render cache on IVB and the
5258b8e80941Smrg       * data cache on HSW+.
5259b8e80941Smrg       */
5260b8e80941Smrg      sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
5261b8e80941Smrg              HSW_SFID_DATAPORT_DATA_CACHE_1 :
5262b8e80941Smrg              GEN6_SFID_DATAPORT_RENDER_CACHE);
5263b8e80941Smrg      break;
5264b8e80941Smrg
5265b8e80941Smrg   default:
5266b8e80941Smrg      unreachable("Unsupported surface opcode");
5267b8e80941Smrg   }
5268b8e80941Smrg
5269b8e80941Smrg   uint32_t desc;
5270b8e80941Smrg   switch (inst->opcode) {
5271b8e80941Smrg   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
5272b8e80941Smrg      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
5273b8e80941Smrg                                            arg.ud, /* num_channels */
5274b8e80941Smrg                                            false   /* write */);
5275b8e80941Smrg      break;
5276b8e80941Smrg
5277b8e80941Smrg   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
5278b8e80941Smrg      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
5279b8e80941Smrg                                            arg.ud, /* num_channels */
5280b8e80941Smrg                                            true    /* write */);
5281b8e80941Smrg      break;
5282b8e80941Smrg
5283b8e80941Smrg   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
5284b8e80941Smrg      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
5285b8e80941Smrg                                           arg.ud, /* bit_size */
5286b8e80941Smrg                                           false   /* write */);
5287b8e80941Smrg      break;
5288b8e80941Smrg
5289b8e80941Smrg   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
5290b8e80941Smrg      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
5291b8e80941Smrg                                           arg.ud, /* bit_size */
5292b8e80941Smrg                                           true    /* write */);
5293b8e80941Smrg      break;
5294b8e80941Smrg
5295b8e80941Smrg   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
5296b8e80941Smrg      desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
5297b8e80941Smrg                                        arg.ud, /* atomic_op */
5298b8e80941Smrg                                        !inst->dst.is_null());
5299b8e80941Smrg      break;
5300b8e80941Smrg
5301b8e80941Smrg   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
5302b8e80941Smrg      desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
5303b8e80941Smrg                                              arg.ud, /* atomic_op */
5304b8e80941Smrg                                              !inst->dst.is_null());
5305b8e80941Smrg      break;
5306b8e80941Smrg
5307b8e80941Smrg   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
5308b8e80941Smrg      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
5309b8e80941Smrg                                          arg.ud, /* num_channels */
5310b8e80941Smrg                                          false   /* write */);
5311b8e80941Smrg      break;
5312b8e80941Smrg
5313b8e80941Smrg   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
5314b8e80941Smrg      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
5315b8e80941Smrg                                          arg.ud, /* num_channels */
5316b8e80941Smrg                                          true    /* write */);
5317b8e80941Smrg      break;
5318b8e80941Smrg
5319b8e80941Smrg   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
5320b8e80941Smrg      desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
5321b8e80941Smrg                                      arg.ud, /* atomic_op */
5322b8e80941Smrg                                      !inst->dst.is_null());
5323b8e80941Smrg      break;
5324b8e80941Smrg
5325b8e80941Smrg   default:
5326b8e80941Smrg      unreachable("Unknown surface logical instruction");
5327b8e80941Smrg   }
5328b8e80941Smrg
5329b8e80941Smrg   /* Update the original instruction. */
5330b8e80941Smrg   inst->opcode = SHADER_OPCODE_SEND;
5331b8e80941Smrg   inst->mlen = mlen;
5332b8e80941Smrg   inst->ex_mlen = ex_mlen;
5333b8e80941Smrg   inst->header_size = header_sz;
5334b8e80941Smrg   inst->send_has_side_effects = has_side_effects;
5335b8e80941Smrg   inst->send_is_volatile = !has_side_effects;
5336b8e80941Smrg
5337b8e80941Smrg   /* Set up SFID and descriptors */
5338b8e80941Smrg   inst->sfid = sfid;
5339b8e80941Smrg   inst->desc = desc;
5340b8e80941Smrg   if (surface.file == IMM) {
5341b8e80941Smrg      inst->desc |= surface.ud & 0xff;
5342b8e80941Smrg      inst->src[0] = brw_imm_ud(0);
5343b8e80941Smrg      inst->src[1] = brw_imm_ud(0); /* ex_desc */
5344b8e80941Smrg   } else if (surface_handle.file != BAD_FILE) {
5345b8e80941Smrg      /* Bindless surface */
5346b8e80941Smrg      assert(devinfo->gen >= 9);
5347b8e80941Smrg      inst->desc |= GEN9_BTI_BINDLESS;
5348b8e80941Smrg      inst->src[0] = brw_imm_ud(0);
5349b8e80941Smrg
5350b8e80941Smrg      /* We assume that the driver provided the handle in the top 20 bits so
5351b8e80941Smrg       * we can use the surface handle directly as the extended descriptor.
5352b8e80941Smrg       */
5353b8e80941Smrg      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
5354b8e80941Smrg   } else {
5355b8e80941Smrg      const fs_builder ubld = bld.exec_all().group(1, 0);
5356b8e80941Smrg      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5357b8e80941Smrg      ubld.AND(tmp, surface, brw_imm_ud(0xff));
5358b8e80941Smrg      inst->src[0] = component(tmp, 0);
5359b8e80941Smrg      inst->src[1] = brw_imm_ud(0); /* ex_desc */
5360b8e80941Smrg   }
5361b8e80941Smrg
5362b8e80941Smrg   /* Finally, the payload */
5363b8e80941Smrg   inst->src[2] = payload;
5364b8e80941Smrg   inst->src[3] = payload2;
5365b8e80941Smrg
5366b8e80941Smrg   inst->resize_sources(4);
5367b8e80941Smrg}
5368b8e80941Smrg
5369b8e80941Smrgstatic void
5370b8e80941Smrglower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
5371b8e80941Smrg{
5372b8e80941Smrg   const gen_device_info *devinfo = bld.shader->devinfo;
5373b8e80941Smrg
5374b8e80941Smrg   const fs_reg &addr = inst->src[0];
5375b8e80941Smrg   const fs_reg &src = inst->src[1];
5376b8e80941Smrg   const unsigned src_comps = inst->components_read(1);
5377b8e80941Smrg   assert(inst->src[2].file == IMM);
5378b8e80941Smrg   const unsigned arg = inst->src[2].ud;
5379b8e80941Smrg   const bool has_side_effects = inst->has_side_effects();
5380b8e80941Smrg
5381b8e80941Smrg   /* If the surface message has side effects and we're a fragment shader, we
5382b8e80941Smrg    * have to predicate with the sample mask to avoid helper invocations.
5383b8e80941Smrg    */
5384b8e80941Smrg   if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) {
5385b8e80941Smrg      inst->flag_subreg = 2;
5386b8e80941Smrg      inst->predicate = BRW_PREDICATE_NORMAL;
5387b8e80941Smrg      inst->predicate_inverse = false;
5388b8e80941Smrg
5389b8e80941Smrg      fs_reg sample_mask = bld.sample_mask_reg();
5390b8e80941Smrg      const fs_builder ubld = bld.group(1, 0).exec_all();
5391b8e80941Smrg      ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type),
5392b8e80941Smrg               sample_mask);
5393b8e80941Smrg   }
5394b8e80941Smrg
5395b8e80941Smrg   fs_reg payload, payload2;
5396b8e80941Smrg   unsigned mlen, ex_mlen = 0;
5397b8e80941Smrg   if (devinfo->gen >= 9) {
5398b8e80941Smrg      /* On Skylake and above, we have SENDS */
5399b8e80941Smrg      mlen = 2 * (inst->exec_size / 8);
5400b8e80941Smrg      ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
5401b8e80941Smrg      payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
5402b8e80941Smrg      payload2 = retype(bld.move_to_vgrf(src, src_comps),
5403b8e80941Smrg                        BRW_REGISTER_TYPE_UD);
5404b8e80941Smrg   } else {
5405b8e80941Smrg      /* Add two because the address is 64-bit */
5406b8e80941Smrg      const unsigned dwords = 2 + src_comps;
5407b8e80941Smrg      mlen = dwords * (inst->exec_size / 8);
5408b8e80941Smrg
5409b8e80941Smrg      fs_reg sources[5];
5410b8e80941Smrg
5411b8e80941Smrg      sources[0] = addr;
5412b8e80941Smrg
5413b8e80941Smrg      for (unsigned i = 0; i < src_comps; i++)
5414b8e80941Smrg         sources[1 + i] = offset(src, bld, i);
5415b8e80941Smrg
5416b8e80941Smrg      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);
5417b8e80941Smrg      bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
5418b8e80941Smrg   }
5419b8e80941Smrg
5420b8e80941Smrg   uint32_t desc;
5421b8e80941Smrg   switch (inst->opcode) {
5422b8e80941Smrg   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
5423b8e80941Smrg      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
5424b8e80941Smrg                                                arg,   /* num_channels */
5425b8e80941Smrg                                                false  /* write */);
5426b8e80941Smrg      break;
5427b8e80941Smrg
5428b8e80941Smrg   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
5429b8e80941Smrg      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
5430b8e80941Smrg                                                arg,   /* num_channels */
5431b8e80941Smrg                                                true   /* write */);
5432b8e80941Smrg      break;
5433b8e80941Smrg
5434b8e80941Smrg   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
5435b8e80941Smrg      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
5436b8e80941Smrg                                               arg,   /* bit_size */
5437b8e80941Smrg                                               false  /* write */);
5438b8e80941Smrg      break;
5439b8e80941Smrg
5440b8e80941Smrg   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
5441b8e80941Smrg      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
5442b8e80941Smrg                                               arg,   /* bit_size */
5443b8e80941Smrg                                               true   /* write */);
5444b8e80941Smrg      break;
5445b8e80941Smrg
5446b8e80941Smrg   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
5447b8e80941Smrg      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32,
5448b8e80941Smrg                                            arg,   /* atomic_op */
5449b8e80941Smrg                                            !inst->dst.is_null());
5450b8e80941Smrg      break;
5451b8e80941Smrg
5452b8e80941Smrg   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
5453b8e80941Smrg      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64,
5454b8e80941Smrg                                            arg,   /* atomic_op */
5455b8e80941Smrg                                            !inst->dst.is_null());
5456b8e80941Smrg      break;
5457b8e80941Smrg
5458b8e80941Smrg
5459b8e80941Smrg   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL:
5460b8e80941Smrg      desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
5461b8e80941Smrg                                                  arg,   /* atomic_op */
5462b8e80941Smrg                                                  !inst->dst.is_null());
5463b8e80941Smrg      break;
5464b8e80941Smrg
5465b8e80941Smrg   default:
5466b8e80941Smrg      unreachable("Unknown A64 logical instruction");
5467b8e80941Smrg   }
5468b8e80941Smrg
5469b8e80941Smrg   /* Update the original instruction. */
5470b8e80941Smrg   inst->opcode = SHADER_OPCODE_SEND;
5471b8e80941Smrg   inst->mlen = mlen;
5472b8e80941Smrg   inst->ex_mlen = ex_mlen;
5473b8e80941Smrg   inst->header_size = 0;
5474b8e80941Smrg   inst->send_has_side_effects = has_side_effects;
5475b8e80941Smrg   inst->send_is_volatile = !has_side_effects;
5476b8e80941Smrg
5477b8e80941Smrg   /* Set up SFID and descriptors */
5478b8e80941Smrg   inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
5479b8e80941Smrg   inst->desc = desc;
5480b8e80941Smrg   inst->resize_sources(4);
5481b8e80941Smrg   inst->src[0] = brw_imm_ud(0); /* desc */
5482b8e80941Smrg   inst->src[1] = brw_imm_ud(0); /* ex_desc */
5483b8e80941Smrg   inst->src[2] = payload;
5484b8e80941Smrg   inst->src[3] = payload2;
5485b8e80941Smrg}
5486b8e80941Smrg
5487b8e80941Smrgstatic void
5488b8e80941Smrglower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
5489b8e80941Smrg{
5490b8e80941Smrg   const gen_device_info *devinfo = bld.shader->devinfo;
5491b8e80941Smrg
5492b8e80941Smrg   if (devinfo->gen >= 7) {
5493b8e80941Smrg      fs_reg index = inst->src[0];
5494b8e80941Smrg      /* We are switching the instruction from an ALU-like instruction to a
5495b8e80941Smrg       * send-from-grf instruction.  Since sends can't handle strides or
5496b8e80941Smrg       * source modifiers, we have to make a copy of the offset source.
5497b8e80941Smrg       */
5498b8e80941Smrg      fs_reg offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
5499b8e80941Smrg      bld.MOV(offset, inst->src[1]);
5500b8e80941Smrg
5501b8e80941Smrg      const unsigned simd_mode =
5502b8e80941Smrg         inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
5503b8e80941Smrg                                BRW_SAMPLER_SIMD_MODE_SIMD16;
5504b8e80941Smrg
5505b8e80941Smrg      inst->opcode = SHADER_OPCODE_SEND;
5506b8e80941Smrg      inst->mlen = inst->exec_size / 8;
5507b8e80941Smrg      inst->resize_sources(3);
5508b8e80941Smrg
5509b8e80941Smrg      inst->sfid = BRW_SFID_SAMPLER;
5510b8e80941Smrg      inst->desc = brw_sampler_desc(devinfo, 0, 0,
5511b8e80941Smrg                                    GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
5512b8e80941Smrg                                    simd_mode, 0);
5513b8e80941Smrg      if (index.file == IMM) {
5514b8e80941Smrg         inst->desc |= index.ud & 0xff;
5515b8e80941Smrg         inst->src[0] = brw_imm_ud(0);
5516b8e80941Smrg      } else {
5517b8e80941Smrg         const fs_builder ubld = bld.exec_all().group(1, 0);
5518b8e80941Smrg         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5519b8e80941Smrg         ubld.AND(tmp, index, brw_imm_ud(0xff));
5520b8e80941Smrg         inst->src[0] = component(tmp, 0);
5521b8e80941Smrg      }
5522b8e80941Smrg      inst->src[1] = brw_imm_ud(0); /* ex_desc */
5523b8e80941Smrg      inst->src[2] = offset; /* payload */
5524b8e80941Smrg   } else {
5525b8e80941Smrg      const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen),
5526b8e80941Smrg                           BRW_REGISTER_TYPE_UD);
5527b8e80941Smrg
5528b8e80941Smrg      bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);
5529b8e80941Smrg
5530b8e80941Smrg      inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4;
5531b8e80941Smrg      inst->resize_sources(1);
5532b8e80941Smrg      inst->base_mrf = payload.nr;
5533b8e80941Smrg      inst->header_size = 1;
5534b8e80941Smrg      inst->mlen = 1 + inst->exec_size / 8;
5535b8e80941Smrg   }
5536b8e80941Smrg}
5537b8e80941Smrg
5538b8e80941Smrgstatic void
5539b8e80941Smrglower_math_logical_send(const fs_builder &bld, fs_inst *inst)
5540b8e80941Smrg{
5541b8e80941Smrg   assert(bld.shader->devinfo->gen < 6);
5542b8e80941Smrg
5543b8e80941Smrg   inst->base_mrf = 2;
5544b8e80941Smrg   inst->mlen = inst->sources * inst->exec_size / 8;
5545b8e80941Smrg
5546b8e80941Smrg   if (inst->sources > 1) {
5547b8e80941Smrg      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
5548b8e80941Smrg       * "Message Payload":
5549b8e80941Smrg       *
5550b8e80941Smrg       * "Operand0[7].  For the INT DIV functions, this operand is the
5551b8e80941Smrg       *  denominator."
5552b8e80941Smrg       *  ...
5553b8e80941Smrg       * "Operand1[7].  For the INT DIV functions, this operand is the
5554b8e80941Smrg       *  numerator."
5555b8e80941Smrg       */
5556b8e80941Smrg      const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
5557b8e80941Smrg      const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
5558b8e80941Smrg      const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
5559b8e80941Smrg
5560b8e80941Smrg      inst->resize_sources(1);
5561b8e80941Smrg      inst->src[0] = src0;
5562b8e80941Smrg
5563b8e80941Smrg      assert(inst->exec_size == 8);
5564b8e80941Smrg      bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
5565b8e80941Smrg   }
5566b8e80941Smrg}
5567b8e80941Smrg
5568b8e80941Smrgbool
5569b8e80941Smrgfs_visitor::lower_logical_sends()
5570b8e80941Smrg{
5571b8e80941Smrg   bool progress = false;
5572b8e80941Smrg
5573b8e80941Smrg   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
5574b8e80941Smrg      const fs_builder ibld(this, block, inst);
5575b8e80941Smrg
5576b8e80941Smrg      switch (inst->opcode) {
5577b8e80941Smrg      case FS_OPCODE_FB_WRITE_LOGICAL:
5578b8e80941Smrg         assert(stage == MESA_SHADER_FRAGMENT);
5579b8e80941Smrg         lower_fb_write_logical_send(ibld, inst,
5580b8e80941Smrg                                     brw_wm_prog_data(prog_data),
5581b8e80941Smrg                                     (const brw_wm_prog_key *)key,
5582b8e80941Smrg                                     payload);
5583b8e80941Smrg         break;
5584b8e80941Smrg
5585b8e80941Smrg      case FS_OPCODE_FB_READ_LOGICAL:
5586b8e80941Smrg         lower_fb_read_logical_send(ibld, inst);
5587b8e80941Smrg         break;
5588b8e80941Smrg
5589b8e80941Smrg      case SHADER_OPCODE_TEX_LOGICAL:
5590b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
5591b8e80941Smrg         break;
5592b8e80941Smrg
5593b8e80941Smrg      case SHADER_OPCODE_TXD_LOGICAL:
5594b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
5595b8e80941Smrg         break;
5596b8e80941Smrg
5597b8e80941Smrg      case SHADER_OPCODE_TXF_LOGICAL:
5598b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
5599b8e80941Smrg         break;
5600b8e80941Smrg
5601b8e80941Smrg      case SHADER_OPCODE_TXL_LOGICAL:
5602b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
5603b8e80941Smrg         break;
5604b8e80941Smrg
5605b8e80941Smrg      case SHADER_OPCODE_TXS_LOGICAL:
5606b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
5607b8e80941Smrg         break;
5608b8e80941Smrg
5609b8e80941Smrg      case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
5610b8e80941Smrg         lower_sampler_logical_send(ibld, inst,
5611b8e80941Smrg                                    SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
5612b8e80941Smrg         break;
5613b8e80941Smrg
5614b8e80941Smrg      case FS_OPCODE_TXB_LOGICAL:
5615b8e80941Smrg         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
5616b8e80941Smrg         break;
5617b8e80941Smrg
5618b8e80941Smrg      case SHADER_OPCODE_TXF_CMS_LOGICAL:
5619b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
5620b8e80941Smrg         break;
5621b8e80941Smrg
5622b8e80941Smrg      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
5623b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
5624b8e80941Smrg         break;
5625b8e80941Smrg
5626b8e80941Smrg      case SHADER_OPCODE_TXF_UMS_LOGICAL:
5627b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
5628b8e80941Smrg         break;
5629b8e80941Smrg
5630b8e80941Smrg      case SHADER_OPCODE_TXF_MCS_LOGICAL:
5631b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
5632b8e80941Smrg         break;
5633b8e80941Smrg
5634b8e80941Smrg      case SHADER_OPCODE_LOD_LOGICAL:
5635b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
5636b8e80941Smrg         break;
5637b8e80941Smrg
5638b8e80941Smrg      case SHADER_OPCODE_TG4_LOGICAL:
5639b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
5640b8e80941Smrg         break;
5641b8e80941Smrg
5642b8e80941Smrg      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
5643b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
5644b8e80941Smrg         break;
5645b8e80941Smrg
5646b8e80941Smrg      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
5647b8e80941Smrg         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
5648b8e80941Smrg         break;
5649b8e80941Smrg
5650b8e80941Smrg      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
5651b8e80941Smrg      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
5652b8e80941Smrg      case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
5653b8e80941Smrg      case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
5654b8e80941Smrg      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
5655b8e80941Smrg      case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
5656b8e80941Smrg      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
5657b8e80941Smrg      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
5658b8e80941Smrg      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
5659b8e80941Smrg         lower_surface_logical_send(ibld, inst);
5660b8e80941Smrg         break;
5661b8e80941Smrg
5662b8e80941Smrg      case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
5663b8e80941Smrg      case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
5664b8e80941Smrg      case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
5665b8e80941Smrg      case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
5666b8e80941Smrg      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
5667b8e80941Smrg      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
5668b8e80941Smrg      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL:
5669b8e80941Smrg         lower_a64_logical_send(ibld, inst);
5670b8e80941Smrg         break;
5671b8e80941Smrg
5672b8e80941Smrg      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
5673b8e80941Smrg         lower_varying_pull_constant_logical_send(ibld, inst);
5674b8e80941Smrg         break;
5675b8e80941Smrg
5676b8e80941Smrg      case SHADER_OPCODE_RCP:
5677b8e80941Smrg      case SHADER_OPCODE_RSQ:
5678b8e80941Smrg      case SHADER_OPCODE_SQRT:
5679b8e80941Smrg      case SHADER_OPCODE_EXP2:
5680b8e80941Smrg      case SHADER_OPCODE_LOG2:
5681b8e80941Smrg      case SHADER_OPCODE_SIN:
5682b8e80941Smrg      case SHADER_OPCODE_COS:
5683b8e80941Smrg      case SHADER_OPCODE_POW:
5684b8e80941Smrg      case SHADER_OPCODE_INT_QUOTIENT:
5685b8e80941Smrg      case SHADER_OPCODE_INT_REMAINDER:
5686b8e80941Smrg         /* The math opcodes are overloaded for the send-like and
5687b8e80941Smrg          * expression-like instructions which seems kind of icky.  Gen6+ has
5688b8e80941Smrg          * a native (but rather quirky) MATH instruction so we don't need to
5689b8e80941Smrg          * do anything here.  On Gen4-5 we'll have to lower the Gen6-like
5690b8e80941Smrg          * logical instructions (which we can easily recognize because they
5691b8e80941Smrg          * have mlen = 0) into send-like virtual instructions.
5692b8e80941Smrg          */
5693b8e80941Smrg         if (devinfo->gen < 6 && inst->mlen == 0) {
5694b8e80941Smrg            lower_math_logical_send(ibld, inst);
5695b8e80941Smrg            break;
5696b8e80941Smrg
5697b8e80941Smrg         } else {
5698b8e80941Smrg            continue;
5699b8e80941Smrg         }
5700b8e80941Smrg
5701b8e80941Smrg      default:
5702b8e80941Smrg         continue;
5703b8e80941Smrg      }
5704b8e80941Smrg
5705b8e80941Smrg      progress = true;
5706b8e80941Smrg   }
5707b8e80941Smrg
5708b8e80941Smrg   if (progress)
5709b8e80941Smrg      invalidate_live_intervals();
5710b8e80941Smrg
5711b8e80941Smrg   return progress;
5712b8e80941Smrg}
5713b8e80941Smrg
5714b8e80941Smrgstatic bool
5715b8e80941Smrgis_mixed_float_with_fp32_dst(const fs_inst *inst)
5716b8e80941Smrg{
5717b8e80941Smrg   /* This opcode sometimes uses :W type on the source even if the operand is
5718b8e80941Smrg    * a :HF, because in gen7 there is no support for :HF, and thus it uses :W.
5719b8e80941Smrg    */
5720b8e80941Smrg   if (inst->opcode == BRW_OPCODE_F16TO32)
5721b8e80941Smrg      return true;
5722b8e80941Smrg
5723b8e80941Smrg   if (inst->dst.type != BRW_REGISTER_TYPE_F)
5724b8e80941Smrg      return false;
5725b8e80941Smrg
5726b8e80941Smrg   for (int i = 0; i < inst->sources; i++) {
5727b8e80941Smrg      if (inst->src[i].type == BRW_REGISTER_TYPE_HF)
5728b8e80941Smrg         return true;
5729b8e80941Smrg   }
5730b8e80941Smrg
5731b8e80941Smrg   return false;
5732b8e80941Smrg}
5733b8e80941Smrg
5734b8e80941Smrgstatic bool
5735b8e80941Smrgis_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
5736b8e80941Smrg{
5737b8e80941Smrg   /* This opcode sometimes uses :W type on the destination even if the
5738b8e80941Smrg    * destination is a :HF, because in gen7 there is no support for :HF, and
5739b8e80941Smrg    * thus it uses :W.
5740b8e80941Smrg    */
5741b8e80941Smrg   if (inst->opcode == BRW_OPCODE_F32TO16 &&
5742b8e80941Smrg       inst->dst.stride == 1)
5743b8e80941Smrg      return true;
5744b8e80941Smrg
5745b8e80941Smrg   if (inst->dst.type != BRW_REGISTER_TYPE_HF ||
5746b8e80941Smrg       inst->dst.stride != 1)
5747b8e80941Smrg      return false;
5748b8e80941Smrg
5749b8e80941Smrg   for (int i = 0; i < inst->sources; i++) {
5750b8e80941Smrg      if (inst->src[i].type == BRW_REGISTER_TYPE_F)
5751b8e80941Smrg         return true;
5752b8e80941Smrg   }
5753b8e80941Smrg
5754b8e80941Smrg   return false;
5755b8e80941Smrg}
5756b8e80941Smrg
/**
 * Get the closest allowed SIMD width for instruction \p inst accounting for
 * some common regioning and execution control restrictions that apply to FPU
 * instructions.  These restrictions don't necessarily have any relevance to
 * instructions not executed by the FPU pipeline like extended math, control
 * flow or send message instructions.
 *
 * For virtual opcodes it's really up to the instruction -- In some cases
 * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
 * instructions) it may simplify virtual instruction lowering if we can
 * enforce FPU-like regioning restrictions already on the virtual instruction,
 * in other cases (e.g. virtual send-like instructions) this may be
 * excessively restrictive.
 *
 * The result is always a power of two no larger than inst->exec_size.
 */
static unsigned
get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
                           const fs_inst *inst)
{
   /* Maximum execution size representable in the instruction controls. */
   unsigned max_width = MIN2(32, inst->exec_size);

   /* According to the PRMs:
    *  "A. In Direct Addressing mode, a source cannot span more than 2
    *      adjacent GRF registers.
    *   B. A destination cannot span more than 2 adjacent GRF registers."
    *
    * Look for the source or destination with the largest register region
    * which is the one that is going to limit the overall execution size of
    * the instruction due to this rule.
    */
   unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);

   for (unsigned i = 0; i < inst->sources; i++)
      reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));

   /* Calculate the maximum execution size of the instruction based on the
    * factor by which it goes over the hardware limit of 2 GRFs.
    * Dividing exec_size by ceil(reg_count / 2) yields pieces that each
    * span at most two GRFs.
    */
   if (reg_count > 2)
      max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));

   /* According to the IVB PRMs:
    *  "When destination spans two registers, the source MUST span two
    *   registers. The exception to the above rule:
    *
    *    - When source is scalar, the source registers are not incremented.
    *    - When source is packed integer Word and destination is packed
    *      integer DWord, the source register is not incremented but the
    *      source sub register is incremented."
    *
    * The hardware specs from Gen4 to Gen7.5 mention similar regioning
    * restrictions.  The code below intentionally doesn't check whether the
    * destination type is integer because empirically the hardware doesn't
    * seem to care what the actual type is as long as it's dword-aligned.
    */
   if (devinfo->gen < 8) {
      for (unsigned i = 0; i < inst->sources; i++) {
         /* IVB implements DF scalars as <0;2,1> regions. */
         const bool is_scalar_exception = is_uniform(inst->src[i]) &&
            (devinfo->is_haswell || type_sz(inst->src[i].type) != 8);
         const bool is_packed_word_exception =
            type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
            type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;

         /* We check size_read(i) against size_written instead of REG_SIZE
          * because we want to properly handle SIMD32.  In SIMD32, you can end
          * up with writes to 4 registers and a source that reads 2 registers
          * and we may still need to lower all the way to SIMD8 in that case.
          */
         if (inst->size_written > REG_SIZE &&
             inst->size_read(i) != 0 &&
             inst->size_read(i) < inst->size_written &&
             !is_scalar_exception && !is_packed_word_exception) {
            /* Shrink each split piece so it writes only one GRF. */
            const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
            max_width = MIN2(max_width, inst->exec_size / reg_count);
         }
      }
   }

   if (devinfo->gen < 6) {
      /* From the G45 PRM, Volume 4 Page 361:
       *
       *    "Operand Alignment Rule: With the exceptions listed below, a
       *     source/destination operand in general should be aligned to even
       *     256-bit physical register with a region size equal to two 256-bit
       *     physical registers."
       *
       * Normally we enforce this by allocating virtual registers to the
       * even-aligned class.  But we need to handle payload registers.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         /* Odd-numbered fixed GRF spanning more than one register. */
         if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
             inst->size_read(i) > REG_SIZE) {
            max_width = MIN2(max_width, 8);
         }
      }
   }

   /* From the IVB PRMs:
    *  "When an instruction is SIMD32, the low 16 bits of the execution mask
    *   are applied for both halves of the SIMD32 instruction. If different
    *   execution mask channels are required, split the instruction into two
    *   SIMD16 instructions."
    *
    * There is similar text in the HSW PRMs.  Gen4-6 don't even implement
    * 32-wide control flow support in hardware and will behave similarly.
    */
   if (devinfo->gen < 8 && !inst->force_writemask_all)
      max_width = MIN2(max_width, 16);

   /* From the IVB PRMs (applies to HSW too):
    *  "Instructions with condition modifiers must not use SIMD32."
    *
    * From the BDW PRMs (applies to later hardware too):
    *  "Ternary instruction with condition modifiers must not use SIMD32."
    */
   if (inst->conditional_mod && (devinfo->gen < 8 || inst->is_3src(devinfo)))
      max_width = MIN2(max_width, 16);

   /* From the IVB PRMs (applies to other devices that don't have the
    * gen_device_info::supports_simd16_3src flag set):
    *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
    *   SIMD8 is not allowed for DF operations."
    *
    * NOTE(review): reg_count here is the largest region size computed at
    * the top of the function, so this clamps each split piece of the
    * three-source instruction to operands spanning a single GRF.
    */
   if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
      max_width = MIN2(max_width, inst->exec_size / reg_count);

   /* Pre-Gen8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
    * the 8-bit quarter of the execution mask signals specified in the
    * instruction control fields) for the second compressed half of any
    * single-precision instruction (for double-precision instructions
    * it's hardwired to use NibCtrl+1, at least on HSW), which means that
    * the EU will apply the wrong execution controls for the second
    * sequential GRF write if the number of channels per GRF is not exactly
    * eight in single-precision mode (or four in double-float mode).
    *
    * In this situation we calculate the maximum size of the split
    * instructions so they only ever write to a single register.
    */
   if (devinfo->gen < 8 && inst->size_written > REG_SIZE &&
       !inst->force_writemask_all) {
      const unsigned channels_per_grf = inst->exec_size /
         DIV_ROUND_UP(inst->size_written, REG_SIZE);
      const unsigned exec_type_size = get_exec_type_size(inst);
      assert(exec_type_size);

      /* The hardware shifts exactly 8 channels per compressed half of the
       * instruction in single-precision mode and exactly 4 in double-precision.
       */
      if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
         max_width = MIN2(max_width, channels_per_grf);

      /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
       * because HW applies the same channel enable signals to both halves of
       * the compressed instruction which will be just wrong under
       * non-uniform control flow.
       */
      if (devinfo->gen == 7 && !devinfo->is_haswell &&
          (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
         max_width = MIN2(max_width, 4);
   }

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is f32. Instruction
    *     execution size must be no more than 8."
    *
    * FIXME: the simulator doesn't seem to complain if we don't do this and
    * empirical testing with existing CTS tests show that they pass just fine
    * without implementing this, however, since our interpretation of the PRM
    * is that conversion MOVs between HF and F are still mixed-float
    * instructions (and therefore subject to this restriction) we decided to
    * split them to be safe. Might be useful to do additional investigation to
    * lift the restriction if we can ensure that it is safe though, since these
    * conversions are common when half-float types are involved since many
    * instructions do not support HF types and conversions from/to F are
    * required.
    */
   if (is_mixed_float_with_fp32_dst(inst))
      max_width = MIN2(max_width, 8);

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is packed f16 for both
    *     Align1 and Align16."
    */
   if (is_mixed_float_with_packed_fp16_dst(inst))
      max_width = MIN2(max_width, 8);

   /* Only power-of-two execution sizes are representable in the instruction
    * control fields.
    */
   return 1 << _mesa_logbase2(max_width);
}
5953b8e80941Smrg
/**
 * Get the maximum allowed SIMD width for instruction \p inst accounting for
 * various payload size restrictions that apply to sampler message
 * instructions.
 *
 * This is only intended to provide a maximum theoretical bound for the
 * execution size of the message based on the number of argument components
 * alone, which in most cases will determine whether the SIMD8 or SIMD16
 * variant of the message can be used, though some messages may have
 * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
 * the message length to determine the exact SIMD width and argument count,
 * which makes a number of sampler message combinations impossible to
 * represent).
 */
static unsigned
get_sampler_lowered_simd_width(const struct gen_device_info *devinfo,
                               const fs_inst *inst)
{
   /* If we have a min_lod parameter on anything other than a simple sample
    * message, it will push it over 5 arguments and we have to fall back to
    * SIMD8.
    */
   if (inst->opcode != SHADER_OPCODE_TEX &&
       inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
      return 8;

   /* Calculate the number of coordinate components that have to be present
    * assuming that additional arguments follow the texel coordinates in the
    * message payload.  On IVB+ there is no need for padding, on ILK-SNB we
    * need to pad to four or three components depending on the message,
    * pre-ILK we need to pad to at most three components.
    */
   const unsigned req_coord_components =
      (devinfo->gen >= 7 ||
       !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
      (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
                            inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
      3;

   /* On Gen9+ the LOD argument is for free if we're able to use the LZ
    * variant of the TXL or TXF message.
    */
   const bool implicit_lod = devinfo->gen >= 9 &&
                             (inst->opcode == SHADER_OPCODE_TXL ||
                              inst->opcode == SHADER_OPCODE_TXF) &&
                             inst->src[TEX_LOGICAL_SRC_LOD].is_zero();

   /* Calculate the total number of argument components that need to be passed
    * to the sampler unit.  The coordinate count is padded up to
    * req_coord_components; the LOD is dropped when the LZ message variant
    * makes it implicit; the TG4 offset only applies to the gather4_po
    * message.
    */
   const unsigned num_payload_components =
      MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
           req_coord_components) +
      inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
      (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
      inst->components_read(TEX_LOGICAL_SRC_LOD2) +
      inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
      (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
       inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
      inst->components_read(TEX_LOGICAL_SRC_MCS);

   /* SIMD16 messages with more than five arguments exceed the maximum message
    * size supported by the sampler, regardless of whether a header is
    * provided or not.  (Each SIMD16 argument takes two registers, hence the
    * MAX_SAMPLER_MESSAGE_SIZE / 2 component budget.)
    */
   return MIN2(inst->exec_size,
               num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
}
6022b8e80941Smrg
/**
 * Get the closest native SIMD width supported by the hardware for instruction
 * \p inst.  The instruction will be left untouched by
 * fs_visitor::lower_simd_width() if the returned value is equal to the
 * original execution size.
 */
static unsigned
get_lowered_simd_width(const struct gen_device_info *devinfo,
                       const fs_inst *inst)
{
   switch (inst->opcode) {
   /* Regular ALU instructions are subject only to the generic FPU
    * regioning and execution-control restrictions.
    */
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_SEL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_CSEL:
   case BRW_OPCODE_F32TO16:
   case BRW_OPCODE_F16TO32:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_MUL:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_SAD2:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_LRP:
   case FS_OPCODE_PACK:
   case SHADER_OPCODE_SEL_EXEC:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
      return get_fpu_lowered_simd_width(devinfo, inst);

   case BRW_OPCODE_CMP: {
      /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
       * when the destination is a GRF the dependency-clear bit on the flag
       * register is cleared early.
       *
       * Suggested workarounds are to disable coissuing CMP instructions
       * or to split CMP(16) instructions into two CMP(8) instructions.
       *
       * We choose to split into CMP(8) instructions since disabling
       * coissuing would affect CMP instructions not otherwise affected by
       * the errata.
       */
      const unsigned max_width = (devinfo->gen == 7 && !devinfo->is_haswell &&
                                  !inst->dst.is_null() ? 8 : ~0);
      return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));
   }
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_BFI2:
      /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
       * should
       *  "Force BFI instructions to be executed always in SIMD8."
       */
      return MIN2(devinfo->is_haswell ? 8 : ~0u,
                  get_fpu_lowered_simd_width(devinfo, inst));

   case BRW_OPCODE_IF:
      /* Control flow is never split; the assertion documents the widths we
       * expect to encounter here.
       */
      assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
      return inst->exec_size;

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS: {
      /* Unary extended math instructions are limited to SIMD8 on Gen4 and
       * Gen6. Extended Math Function is limited to SIMD8 with half-float.
       */
      if (devinfo->gen == 6 || (devinfo->gen == 4 && !devinfo->is_g4x))
         return MIN2(8, inst->exec_size);
      if (inst->dst.type == BRW_REGISTER_TYPE_HF)
         return MIN2(8, inst->exec_size);
      return MIN2(16, inst->exec_size);
   }

   case SHADER_OPCODE_POW: {
      /* SIMD16 is only allowed on Gen7+. Extended Math Function is limited
       * to SIMD8 with half-float
       */
      if (devinfo->gen < 7)
         return MIN2(8, inst->exec_size);
      if (inst->dst.type == BRW_REGISTER_TYPE_HF)
         return MIN2(8, inst->exec_size);
      return MIN2(16, inst->exec_size);
   }

   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Integer division is limited to SIMD8 on all generations. */
      return MIN2(8, inst->exec_size);

   case FS_OPCODE_LINTERP:
   case SHADER_OPCODE_GET_BUFFER_SIZE:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      /* These virtual opcodes are simply capped at SIMD16. */
      return MIN2(16, inst->exec_size);

   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
      /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
       * message used to implement varying pull constant loads, so expand it
       * to SIMD16.  An alternative with longer message payload length but
       * shorter return payload would be to use the SIMD8 sampler message that
       * takes (header, u, v, r) as parameters instead of (header, u).
       */
      return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size));

   case FS_OPCODE_DDX_COARSE:
   case FS_OPCODE_DDX_FINE:
   case FS_OPCODE_DDY_COARSE:
   case FS_OPCODE_DDY_FINE:
      /* The implementation of this virtual opcode may require emitting
       * compressed Align16 instructions, which are severely limited on some
       * generations.
       *
       * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
       * Region Restrictions):
       *
       *  "In Align16 access mode, SIMD16 is not allowed for DW operations
       *   and SIMD8 is not allowed for DF operations."
       *
       * In this context, "DW operations" means "operations acting on 32-bit
       * values", so it includes operations on floats.
       *
       * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
       * (Instruction Compression -> Rules and Restrictions):
       *
       *  "A compressed instruction must be in Align1 access mode. Align16
       *   mode instructions cannot be compressed."
       *
       * Similar text exists in the g45 PRM.
       *
       * Empirically, compressed align16 instructions using odd register
       * numbers don't appear to work on Sandybridge either.
       */
      return (devinfo->gen == 4 || devinfo->gen == 6 ||
              (devinfo->gen == 7 && !devinfo->is_haswell) ?
              MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));

   case SHADER_OPCODE_MULH:
      /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
       * is 8-wide on Gen7+.
       */
      return (devinfo->gen >= 7 ? 8 :
              get_fpu_lowered_simd_width(devinfo, inst));

   case FS_OPCODE_FB_WRITE_LOGICAL:
      /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
       * here.
       */
      assert(devinfo->gen != 6 ||
             inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
             inst->exec_size == 8);
      /* Dual-source FB writes are unsupported in SIMD16 mode. */
      return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
              8 : MIN2(16, inst->exec_size));

   case FS_OPCODE_FB_READ_LOGICAL:
      return MIN2(16, inst->exec_size);

   /* Sampler messages whose width is bounded purely by the payload size. */
   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_LOGICAL:
   case SHADER_OPCODE_TXF_UMS_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
      return get_sampler_lowered_simd_width(devinfo, inst);

   case SHADER_OPCODE_TXD_LOGICAL:
      /* TXD is unsupported in SIMD16 mode. */
      return 8;

   case SHADER_OPCODE_TXL_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
      /* Only one execution size is representable pre-ILK depending on whether
       * the shadow reference argument is present.
       */
      if (devinfo->gen == 4)
         return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
      else
         return get_sampler_lowered_simd_width(devinfo, inst);

   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
      /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
       * messages.  Use SIMD16 instead.
       */
      if (devinfo->gen == 4)
         return 16;
      else
         return get_sampler_lowered_simd_width(devinfo, inst);

   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      /* Hardwired to SIMD8 -- presumably a typed surface message
       * limitation; confirm against the PRM data-port chapter.
       */
      return 8;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      return MIN2(16, inst->exec_size);

   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      /* SIMD8 only before Gen9 -- presumably an A64 message width
       * limitation; confirm against the PRM.
       */
      return devinfo->gen <= 8 ? 8 : MIN2(16, inst->exec_size);

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      return 8;

   /* URB messages are emitted as SIMD8 only, as the opcode names suggest. */
   case SHADER_OPCODE_URB_READ_SIMD8:
   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
      return MIN2(8, inst->exec_size);

   case SHADER_OPCODE_QUAD_SWIZZLE: {
      const unsigned swiz = inst->src[1].ud;
      /* Uniform sources need no special handling; otherwise 32-bit types
       * before Gen11 are limited to SIMD8, and the XYXY/ZWZW swizzles to
       * SIMD4, with the generic FPU limit applying everywhere else.
       */
      return (is_uniform(inst->src[0]) ?
                 get_fpu_lowered_simd_width(devinfo, inst) :
              devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
              swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
              get_fpu_lowered_simd_width(devinfo, inst));
   }
   case SHADER_OPCODE_MOV_INDIRECT: {
      /* From IVB and HSW PRMs:
       *
       * "2.When the destination requires two registers and the sources are
       *  indirect, the sources must use 1x1 regioning mode.
       *
       * In case of DF instructions in HSW/IVB, the exec_size is limited by
       * the EU decompression logic not handling VxH indirect addressing
       * correctly.
       */
      const unsigned max_size = (devinfo->gen >= 8 ? 2 : 1) * REG_SIZE;
      /* Prior to Broadwell, we only have 8 address subregisters. */
      return MIN3(devinfo->gen >= 8 ? 16 : 8,
                  max_size / (inst->dst.stride * type_sz(inst->dst.type)),
                  inst->exec_size);
   }

   case SHADER_OPCODE_LOAD_PAYLOAD: {
      const unsigned reg_count =
         DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);

      if (reg_count > 2) {
         /* Only LOAD_PAYLOAD instructions with per-channel destination region
          * can be easily lowered (which excludes headers and heterogeneous
          * types).
          */
         assert(!inst->header_size);
         for (unsigned i = 0; i < inst->sources; i++)
            assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
                   inst->src[i].file == BAD_FILE);

         return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
      } else {
         return inst->exec_size;
      }
   }
   default:
      /* Anything else is left at its native execution size. */
      return inst->exec_size;
   }
}
6317b8e80941Smrg
6318b8e80941Smrg/**
6319b8e80941Smrg * Return true if splitting out the group of channels of instruction \p inst
6320b8e80941Smrg * given by lbld.group() requires allocating a temporary for the i-th source
6321b8e80941Smrg * of the lowered instruction.
6322b8e80941Smrg */
6323b8e80941Smrgstatic inline bool
6324b8e80941Smrgneeds_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
6325b8e80941Smrg{
6326b8e80941Smrg   return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
6327b8e80941Smrg            (inst->components_read(i) == 1 &&
6328b8e80941Smrg             lbld.dispatch_width() <= inst->exec_size)) ||
6329b8e80941Smrg          (inst->flags_written() &
6330b8e80941Smrg           flag_mask(inst->src[i], type_sz(inst->src[i].type)));
6331b8e80941Smrg}
6332b8e80941Smrg
6333b8e80941Smrg/**
6334b8e80941Smrg * Extract the data that would be consumed by the channel group given by
6335b8e80941Smrg * lbld.group() from the i-th source region of instruction \p inst and return
6336b8e80941Smrg * it as result in packed form.
6337b8e80941Smrg */
6338b8e80941Smrgstatic fs_reg
6339b8e80941Smrgemit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
6340b8e80941Smrg{
6341b8e80941Smrg   assert(lbld.group() >= inst->group);
6342b8e80941Smrg
6343b8e80941Smrg   /* Specified channel group from the source region. */
6344b8e80941Smrg   const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
6345b8e80941Smrg
6346b8e80941Smrg   if (needs_src_copy(lbld, inst, i)) {
6347b8e80941Smrg      /* Builder of the right width to perform the copy avoiding uninitialized
6348b8e80941Smrg       * data if the lowered execution size is greater than the original
6349b8e80941Smrg       * execution size of the instruction.
6350b8e80941Smrg       */
6351b8e80941Smrg      const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
6352b8e80941Smrg                                              inst->exec_size), 0);
6353b8e80941Smrg      const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
6354b8e80941Smrg
6355b8e80941Smrg      for (unsigned k = 0; k < inst->components_read(i); ++k)
6356b8e80941Smrg         cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
6357b8e80941Smrg
6358b8e80941Smrg      return tmp;
6359b8e80941Smrg
6360b8e80941Smrg   } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
6361b8e80941Smrg      /* The source is invariant for all dispatch_width-wide groups of the
6362b8e80941Smrg       * original region.
6363b8e80941Smrg       */
6364b8e80941Smrg      return inst->src[i];
6365b8e80941Smrg
6366b8e80941Smrg   } else {
6367b8e80941Smrg      /* We can just point the lowered instruction at the right channel group
6368b8e80941Smrg       * from the original region.
6369b8e80941Smrg       */
6370b8e80941Smrg      return src;
6371b8e80941Smrg   }
6372b8e80941Smrg}
6373b8e80941Smrg
6374b8e80941Smrg/**
6375b8e80941Smrg * Return true if splitting out the group of channels of instruction \p inst
6376b8e80941Smrg * given by lbld.group() requires allocating a temporary for the destination
6377b8e80941Smrg * of the lowered instruction and copying the data back to the original
6378b8e80941Smrg * destination region.
6379b8e80941Smrg */
6380b8e80941Smrgstatic inline bool
6381b8e80941Smrgneeds_dst_copy(const fs_builder &lbld, const fs_inst *inst)
6382b8e80941Smrg{
6383b8e80941Smrg   /* If the instruction writes more than one component we'll have to shuffle
6384b8e80941Smrg    * the results of multiple lowered instructions in order to make sure that
6385b8e80941Smrg    * they end up arranged correctly in the original destination region.
6386b8e80941Smrg    */
6387b8e80941Smrg   if (inst->size_written > inst->dst.component_size(inst->exec_size))
6388b8e80941Smrg      return true;
6389b8e80941Smrg
6390b8e80941Smrg   /* If the lowered execution size is larger than the original the result of
6391b8e80941Smrg    * the instruction won't fit in the original destination, so we'll have to
6392b8e80941Smrg    * allocate a temporary in any case.
6393b8e80941Smrg    */
6394b8e80941Smrg   if (lbld.dispatch_width() > inst->exec_size)
6395b8e80941Smrg      return true;
6396b8e80941Smrg
6397b8e80941Smrg   for (unsigned i = 0; i < inst->sources; i++) {
6398b8e80941Smrg      /* If we already made a copy of the source for other reasons there won't
6399b8e80941Smrg       * be any overlap with the destination.
6400b8e80941Smrg       */
6401b8e80941Smrg      if (needs_src_copy(lbld, inst, i))
6402b8e80941Smrg         continue;
6403b8e80941Smrg
6404b8e80941Smrg      /* In order to keep the logic simple we emit a copy whenever the
6405b8e80941Smrg       * destination region doesn't exactly match an overlapping source, which
6406b8e80941Smrg       * may point at the source and destination not being aligned group by
6407b8e80941Smrg       * group which could cause one of the lowered instructions to overwrite
6408b8e80941Smrg       * the data read from the same source by other lowered instructions.
6409b8e80941Smrg       */
6410b8e80941Smrg      if (regions_overlap(inst->dst, inst->size_written,
6411b8e80941Smrg                          inst->src[i], inst->size_read(i)) &&
6412b8e80941Smrg          !inst->dst.equals(inst->src[i]))
6413b8e80941Smrg        return true;
6414b8e80941Smrg   }
6415b8e80941Smrg
6416b8e80941Smrg   return false;
6417b8e80941Smrg}
6418b8e80941Smrg
6419b8e80941Smrg/**
6420b8e80941Smrg * Insert data from a packed temporary into the channel group given by
6421b8e80941Smrg * lbld.group() of the destination region of instruction \p inst and return
6422b8e80941Smrg * the temporary as result.  Any copy instructions that are required for
6423b8e80941Smrg * unzipping the previous value (in the case of partial writes) will be
6424b8e80941Smrg * inserted using \p lbld_before and any copy instructions required for
6425b8e80941Smrg * zipping up the destination of \p inst will be inserted using \p lbld_after.
6426b8e80941Smrg */
6427b8e80941Smrgstatic fs_reg
6428b8e80941Smrgemit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
6429b8e80941Smrg         fs_inst *inst)
6430b8e80941Smrg{
6431b8e80941Smrg   assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
6432b8e80941Smrg   assert(lbld_before.group() == lbld_after.group());
6433b8e80941Smrg   assert(lbld_after.group() >= inst->group);
6434b8e80941Smrg
6435b8e80941Smrg   /* Specified channel group from the destination region. */
6436b8e80941Smrg   const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
6437b8e80941Smrg   const unsigned dst_size = inst->size_written /
6438b8e80941Smrg      inst->dst.component_size(inst->exec_size);
6439b8e80941Smrg
6440b8e80941Smrg   if (needs_dst_copy(lbld_after, inst)) {
6441b8e80941Smrg      const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size);
6442b8e80941Smrg
6443b8e80941Smrg      if (inst->predicate) {
6444b8e80941Smrg         /* Handle predication by copying the original contents of
6445b8e80941Smrg          * the destination into the temporary before emitting the
6446b8e80941Smrg          * lowered instruction.
6447b8e80941Smrg          */
6448b8e80941Smrg         const fs_builder gbld_before =
6449b8e80941Smrg            lbld_before.group(MIN2(lbld_before.dispatch_width(),
6450b8e80941Smrg                                   inst->exec_size), 0);
6451b8e80941Smrg         for (unsigned k = 0; k < dst_size; ++k) {
6452b8e80941Smrg            gbld_before.MOV(offset(tmp, lbld_before, k),
6453b8e80941Smrg                            offset(dst, inst->exec_size, k));
6454b8e80941Smrg         }
6455b8e80941Smrg      }
6456b8e80941Smrg
6457b8e80941Smrg      const fs_builder gbld_after =
6458b8e80941Smrg         lbld_after.group(MIN2(lbld_after.dispatch_width(),
6459b8e80941Smrg                               inst->exec_size), 0);
6460b8e80941Smrg      for (unsigned k = 0; k < dst_size; ++k) {
6461b8e80941Smrg         /* Use a builder of the right width to perform the copy avoiding
6462b8e80941Smrg          * uninitialized data if the lowered execution size is greater than
6463b8e80941Smrg          * the original execution size of the instruction.
6464b8e80941Smrg          */
6465b8e80941Smrg         gbld_after.MOV(offset(dst, inst->exec_size, k),
6466b8e80941Smrg                        offset(tmp, lbld_after, k));
6467b8e80941Smrg      }
6468b8e80941Smrg
6469b8e80941Smrg      return tmp;
6470b8e80941Smrg
6471b8e80941Smrg   } else {
6472b8e80941Smrg      /* No need to allocate a temporary for the lowered instruction, just
6473b8e80941Smrg       * take the right group of channels from the original region.
6474b8e80941Smrg       */
6475b8e80941Smrg      return dst;
6476b8e80941Smrg   }
6477b8e80941Smrg}
6478b8e80941Smrg
/**
 * Split every instruction whose hardware-supported execution size is
 * smaller than its current exec_size into several copies of the lowered
 * width, unzipping sources and zipping the destination as needed.
 *
 * Returns true if any instruction was lowered (live intervals are
 * invalidated in that case).
 */
bool
fs_visitor::lower_simd_width()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const unsigned lower_width = get_lowered_simd_width(devinfo, inst);

      if (lower_width != inst->exec_size) {
         /* Builder matching the original instruction.  We may also need to
          * emit an instruction of width larger than the original, set the
          * execution size of the builder to the highest of both for now so
          * we're sure that both cases can be handled.
          */
         const unsigned max_width = MAX2(inst->exec_size, lower_width);
         const fs_builder ibld = bld.at(block, inst)
                                    .exec_all(inst->force_writemask_all)
                                    .group(max_width, inst->group / max_width);

         /* Split the copies in chunks of the execution width of either the
          * original or the lowered instruction, whichever is lower.
          */
         const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
         const unsigned dst_size = inst->size_written /
            inst->dst.component_size(inst->exec_size);

         assert(!inst->writes_accumulator && !inst->mlen);

         /* Inserting the zip, unzip, and duplicated instructions in all of
          * the right spots is somewhat tricky.  All of the unzip and any
          * instructions from the zip which unzip the destination prior to
          * writing need to happen before all of the per-group instructions
          * and the zip instructions need to happen after.  In order to sort
          * this all out, we insert the unzip instructions before \p inst,
          * insert the per-group instructions after \p inst (i.e. before
          * inst->next), and insert the zip instructions before the
          * instruction after \p inst.  Since we are inserting instructions
          * after \p inst, inst->next is a moving target and we need to save
          * it off here so that we insert the zip instructions in the right
          * place.
          *
          * Since we're inserting split instructions after after_inst, the
          * instructions will end up in the reverse order that we insert them.
          * However, certain render target writes require that the low group
          * instructions come before the high group.  From the Ivy Bridge PRM
          * Vol. 4, Pt. 1, Section 3.9.11:
          *
          *    "If multiple SIMD8 Dual Source messages are delivered by the
          *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
          *    issued before the SIMD8_DUALSRC_HI message with the same Slot
          *    Group Select setting."
          *
          * And, from Section 3.9.11.1 of the same PRM:
          *
          *    "When SIMD32 or SIMD16 PS threads send render target writes
          *    with multiple SIMD8 and SIMD16 messages, the following must
          *    hold:
          *
          *    All the slots (as described above) must have a corresponding
          *    render target write irrespective of the slot's validity. A slot
          *    is considered valid when at least one sample is enabled. For
          *    example, a SIMD16 PS thread must send two SIMD8 render target
          *    writes to cover all the slots.
          *
          *    PS thread must send SIMD render target write messages with
          *    increasing slot numbers. For example, SIMD16 thread has
          *    Slot[15:0] and if two SIMD8 render target writes are used, the
          *    first SIMD8 render target write must send Slot[7:0] and the
          *    next one must send Slot[15:8]."
          *
          * In order to make low group instructions come before high group
          * instructions (this is required for some render target writes), we
          * split from the highest group to lowest.
          */
         exec_node *const after_inst = inst->next;
         for (int i = n - 1; i >= 0; i--) {
            /* Emit a copy of the original instruction with the lowered width.
             * If the EOT flag was set throw it away except for the last
             * instruction to avoid killing the thread prematurely.
             */
            fs_inst split_inst = *inst;
            split_inst.exec_size = lower_width;
            split_inst.eot = inst->eot && i == int(n - 1);

            /* Select the correct channel enables for the i-th group, then
             * transform the sources and destination and emit the lowered
             * instruction.
             */
            const fs_builder lbld = ibld.group(lower_width, i);

            for (unsigned j = 0; j < inst->sources; j++)
               split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);

            split_inst.dst = emit_zip(lbld.at(block, inst),
                                      lbld.at(block, after_inst), inst);
            split_inst.size_written =
               split_inst.dst.component_size(lower_width) * dst_size;

            /* Inserting at inst->next on every iteration places each newly
             * emitted (lower-group) copy ahead of the previously emitted
             * ones, which combined with the descending loop above yields
             * ascending group order in the program.
             */
            lbld.at(block, inst->next).emit(split_inst);
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
6590b8e80941Smrg
6591b8e80941Smrgvoid
6592b8e80941Smrgfs_visitor::dump_instructions()
6593b8e80941Smrg{
6594b8e80941Smrg   dump_instructions(NULL);
6595b8e80941Smrg}
6596b8e80941Smrg
6597b8e80941Smrgvoid
6598b8e80941Smrgfs_visitor::dump_instructions(const char *name)
6599b8e80941Smrg{
6600b8e80941Smrg   FILE *file = stderr;
6601b8e80941Smrg   if (name && geteuid() != 0) {
6602b8e80941Smrg      file = fopen(name, "w");
6603b8e80941Smrg      if (!file)
6604b8e80941Smrg         file = stderr;
6605b8e80941Smrg   }
6606b8e80941Smrg
6607b8e80941Smrg   if (cfg) {
6608b8e80941Smrg      calculate_register_pressure();
6609b8e80941Smrg      int ip = 0, max_pressure = 0;
6610b8e80941Smrg      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
6611b8e80941Smrg         max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
6612b8e80941Smrg         fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
6613b8e80941Smrg         dump_instruction(inst, file);
6614b8e80941Smrg         ip++;
6615b8e80941Smrg      }
6616b8e80941Smrg      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
6617b8e80941Smrg   } else {
6618b8e80941Smrg      int ip = 0;
6619b8e80941Smrg      foreach_in_list(backend_instruction, inst, &instructions) {
6620b8e80941Smrg         fprintf(file, "%4d: ", ip++);
6621b8e80941Smrg         dump_instruction(inst, file);
6622b8e80941Smrg      }
6623b8e80941Smrg   }
6624b8e80941Smrg
6625b8e80941Smrg   if (file != stderr) {
6626b8e80941Smrg      fclose(file);
6627b8e80941Smrg   }
6628b8e80941Smrg}
6629b8e80941Smrg
void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   /* Convenience overload:  print a single instruction to stderr. */
   dump_instruction(be_inst, stderr);
}
6635b8e80941Smrg
/**
 * Print a human-readable representation of a single instruction to \p file:
 * predicate, opcode and modifiers, execution size, message lengths, then the
 * destination and source regions with their offsets, strides and types.
 */
void
fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   fs_inst *inst = (fs_inst *)be_inst;

   /* Predicate, e.g. "(+f0.1) " or "(-f0.1) " when inverted.  flag_subreg
    * is printed as flag register number (/ 2) and subregister (% 2).
    */
   if (inst->predicate) {
      fprintf(file, "(%cf%d.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg / 2,
              inst->flag_subreg % 2);
   }

   /* Opcode plus saturate / conditional-mod suffixes. */
   fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      /* Print the flag register only where the conditional mod actually
       * writes it (not for SEL/CSEL/IF/WHILE on gen5+, which consume it).
       */
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_CSEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
                 inst->flag_subreg % 2);
      }
   }
   fprintf(file, "(%d) ", inst->exec_size);

   /* Message lengths and end-of-thread marker, when set. */
   if (inst->mlen) {
      fprintf(file, "(mlen: %d) ", inst->mlen);
   }

   if (inst->ex_mlen) {
      fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
   }

   if (inst->eot) {
      fprintf(file, "(EOT) ");
   }

   /* Destination region.  Files that are invalid as a destination are
    * wrapped in "***...***" markers.
    */
   switch (inst->dst.file) {
   case VGRF:
      fprintf(file, "vgrf%d", inst->dst.nr);
      break;
   case FIXED_GRF:
      fprintf(file, "g%d", inst->dst.nr);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.nr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case UNIFORM:
      fprintf(file, "***u%d***", inst->dst.nr);
      break;
   case ATTR:
      fprintf(file, "***attr%d***", inst->dst.nr);
      break;
   case ARF:
      /* Architecture registers: null, address, accumulator, flag, other. */
      switch (inst->dst.nr) {
      case BRW_ARF_NULL:
         fprintf(file, "null");
         break;
      case BRW_ARF_ADDRESS:
         fprintf(file, "a0.%d", inst->dst.subnr);
         break;
      case BRW_ARF_ACCUMULATOR:
         fprintf(file, "acc%d", inst->dst.subnr);
         break;
      case BRW_ARF_FLAG:
         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      default:
         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      }
      break;
   case IMM:
      unreachable("not reached");
   }

   /* Sub-register offset, printed when nonzero or when the write doesn't
    * cover the whole allocated VGRF.
    */
   if (inst->dst.offset ||
       (inst->dst.file == VGRF &&
        alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
      const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
      fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
              inst->dst.offset % reg_size);
   }

   if (inst->dst.stride != 1)
      fprintf(file, "<%u>", inst->dst.stride);
   fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));

   /* Source regions, with negate ('-') and absolute-value ('|...|')
    * modifiers around each one.
    */
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case VGRF:
         fprintf(file, "vgrf%d", inst->src[i].nr);
         break;
      case FIXED_GRF:
         fprintf(file, "g%d", inst->src[i].nr);
         break;
      case MRF:
         fprintf(file, "***m%d***", inst->src[i].nr);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].nr);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].nr);
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case IMM:
         /* Immediates are printed with a type-suffix letter matching the
          * register type (f, df, d, u, q, uq, ...).
          */
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%-gf", inst->src[i].f);
            break;
         case BRW_REGISTER_TYPE_DF:
            fprintf(file, "%fdf", inst->src[i].df);
            break;
         case BRW_REGISTER_TYPE_W:
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dd", inst->src[i].d);
            break;
         case BRW_REGISTER_TYPE_UW:
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uu", inst->src[i].ud);
            break;
         case BRW_REGISTER_TYPE_Q:
            fprintf(file, "%" PRId64 "q", inst->src[i].d64);
            break;
         case BRW_REGISTER_TYPE_UQ:
            fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
            break;
         case BRW_REGISTER_TYPE_VF:
            /* Packed vector-float:  four 8-bit VF values in one dword. */
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
            break;
         case BRW_REGISTER_TYPE_V:
         case BRW_REGISTER_TYPE_UV:
            fprintf(file, "%08x%s", inst->src[i].ud,
                    inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case ARF:
         switch (inst->src[i].nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->src[i].subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->src[i].subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         }
         break;
      }

      /* Sub-register offset, mirroring the destination handling above. */
      if (inst->src[i].offset ||
          (inst->src[i].file == VGRF &&
           alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
         const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
         fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
                 inst->src[i].offset % reg_size);
      }

      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         unsigned stride;
         /* ARF/FIXED_GRF encode the stride as a hardware hstride exponent;
          * decode it to an element stride for printing.
          */
         if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
            unsigned hstride = inst->src[i].hstride;
            stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
         } else {
            stride = inst->src[i].stride;
         }
         if (stride != 1)
            fprintf(file, "<%u>", stride);

         fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
      }

      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, " ");

   /* Trailing execution-control annotations. */
   if (inst->force_writemask_all)
      fprintf(file, "NoMask ");

   if (inst->exec_size != dispatch_width)
      fprintf(file, "group%d ", inst->group);

   fprintf(file, "\n");
}
6853b8e80941Smrg
/**
 * Lay out the gen6+ fragment shader thread payload:  record in prog_data
 * which optional payload sections are present, then assign consecutive
 * payload register indices for each dispatch_width/payload_width group.
 * The order of the payload.num_regs increments below defines the register
 * layout, so it must not be reordered.
 */
void
fs_visitor::setup_fs_payload_gen6()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
   /* Payload sections are laid out per payload_width-wide group; dispatch
    * widths above 16 get multiple groups (the j loops below).
    */
   const unsigned payload_width = MIN2(16, dispatch_width);
   assert(dispatch_width % payload_width == 0);
   assert(devinfo->gen >= 6);

   /* Source depth and W are delivered together when gl_FragCoord is read. */
   prog_data->uses_src_depth = prog_data->uses_src_w =
      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;

   prog_data->uses_sample_mask =
      (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;

   /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
    *
    *    "MSDISPMODE_PERSAMPLE is required in order to select
    *    POSOFFSET_SAMPLE"
    *
    * So we can only really get sample positions if we are doing real
    * per-sample dispatch.  If we need gl_SamplePosition and we don't have
    * persample dispatch, we hard-code it to 0.5.
    */
   prog_data->uses_pos_offset = prog_data->persample_dispatch &&
      (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);

   /* R0: PS thread payload header. */
   payload.num_regs++;

   for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
      /* R1: masks, pixel X/Y coordinates. */
      payload.subspan_coord_reg[j] = payload.num_regs++;
   }

   for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
      /* R3-26: barycentric interpolation coordinates.  These appear in the
       * same order that they appear in the brw_barycentric_mode enum.  Each
       * set of coordinates occupies 2 registers if dispatch width == 8 and 4
       * registers if dispatch width == 16.  Coordinates only appear if they
       * were enabled using the "Barycentric Interpolation Mode" bits in
       * WM_STATE.
       */
      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
         if (prog_data->barycentric_interp_modes & (1 << i)) {
            payload.barycentric_coord_reg[i][j] = payload.num_regs;
            payload.num_regs += payload_width / 4;
         }
      }

      /* R27-28: interpolated depth if uses source depth */
      if (prog_data->uses_src_depth) {
         payload.source_depth_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R29-30: interpolated W set if GEN6_WM_USES_SOURCE_W. */
      if (prog_data->uses_src_w) {
         payload.source_w_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }

      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg[j] = payload.num_regs;
         payload.num_regs++;
      }

      /* R32-33: MSAA input coverage mask */
      if (prog_data->uses_sample_mask) {
         assert(devinfo->gen >= 7);
         payload.sample_mask_in_reg[j] = payload.num_regs;
         payload.num_regs += payload_width / 8;
      }
   }

   /* Shader writes gl_FragDepth:  the computed depth must be fed back to
    * the render target write.
    */
   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}
6934b8e80941Smrg
6935b8e80941Smrgvoid
6936b8e80941Smrgfs_visitor::setup_vs_payload()
6937b8e80941Smrg{
6938b8e80941Smrg   /* R0: thread header, R1: urb handles */
6939b8e80941Smrg   payload.num_regs = 2;
6940b8e80941Smrg}
6941b8e80941Smrg
6942b8e80941Smrgvoid
6943b8e80941Smrgfs_visitor::setup_gs_payload()
6944b8e80941Smrg{
6945b8e80941Smrg   assert(stage == MESA_SHADER_GEOMETRY);
6946b8e80941Smrg
6947b8e80941Smrg   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
6948b8e80941Smrg   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
6949b8e80941Smrg
6950b8e80941Smrg   /* R0: thread header, R1: output URB handles */
6951b8e80941Smrg   payload.num_regs = 2;
6952b8e80941Smrg
6953b8e80941Smrg   if (gs_prog_data->include_primitive_id) {
6954b8e80941Smrg      /* R2: Primitive ID 0..7 */
6955b8e80941Smrg      payload.num_regs++;
6956b8e80941Smrg   }
6957b8e80941Smrg
6958b8e80941Smrg   /* Always enable VUE handles so we can safely use pull model if needed.
6959b8e80941Smrg    *
6960b8e80941Smrg    * The push model for a GS uses a ton of register space even for trivial
6961b8e80941Smrg    * scenarios with just a few inputs, so just make things easier and a bit
6962b8e80941Smrg    * safer by always having pull model available.
6963b8e80941Smrg    */
6964b8e80941Smrg   gs_prog_data->base.include_vue_handles = true;
6965b8e80941Smrg
6966b8e80941Smrg   /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
6967b8e80941Smrg   payload.num_regs += nir->info.gs.vertices_in;
6968b8e80941Smrg
6969b8e80941Smrg   /* Use a maximum of 24 registers for push-model inputs. */
6970b8e80941Smrg   const unsigned max_push_components = 24;
6971b8e80941Smrg
6972b8e80941Smrg   /* If pushing our inputs would take too many registers, reduce the URB read
6973b8e80941Smrg    * length (which is in HWords, or 8 registers), and resort to pulling.
6974b8e80941Smrg    *
6975b8e80941Smrg    * Note that the GS reads <URB Read Length> HWords for every vertex - so we
6976b8e80941Smrg    * have to multiply by VerticesIn to obtain the total storage requirement.
6977b8e80941Smrg    */
6978b8e80941Smrg   if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
6979b8e80941Smrg       max_push_components) {
6980b8e80941Smrg      vue_prog_data->urb_read_length =
6981b8e80941Smrg         ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
6982b8e80941Smrg   }
6983b8e80941Smrg}
6984b8e80941Smrg
6985b8e80941Smrgvoid
6986b8e80941Smrgfs_visitor::setup_cs_payload()
6987b8e80941Smrg{
6988b8e80941Smrg   assert(devinfo->gen >= 7);
6989b8e80941Smrg   payload.num_regs = 1;
6990b8e80941Smrg}
6991b8e80941Smrg
6992b8e80941Smrgvoid
6993b8e80941Smrgfs_visitor::calculate_register_pressure()
6994b8e80941Smrg{
6995b8e80941Smrg   invalidate_live_intervals();
6996b8e80941Smrg   calculate_live_intervals();
6997b8e80941Smrg
6998b8e80941Smrg   unsigned num_instructions = 0;
6999b8e80941Smrg   foreach_block(block, cfg)
7000b8e80941Smrg      num_instructions += block->instructions.length();
7001b8e80941Smrg
7002b8e80941Smrg   regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
7003b8e80941Smrg
7004b8e80941Smrg   for (unsigned reg = 0; reg < alloc.count; reg++) {
7005b8e80941Smrg      for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
7006b8e80941Smrg         regs_live_at_ip[ip] += alloc.sizes[reg];
7007b8e80941Smrg   }
7008b8e80941Smrg}
7009b8e80941Smrg
7010b8e80941Smrgvoid
7011b8e80941Smrgfs_visitor::optimize()
7012b8e80941Smrg{
7013b8e80941Smrg   /* Start by validating the shader we currently have. */
7014b8e80941Smrg   validate();
7015b8e80941Smrg
7016b8e80941Smrg   /* bld is the common builder object pointing at the end of the program we
7017b8e80941Smrg    * used to translate it into i965 IR.  For the optimization and lowering
7018b8e80941Smrg    * passes coming next, any code added after the end of the program without
7019b8e80941Smrg    * having explicitly called fs_builder::at() clearly points at a mistake.
7020b8e80941Smrg    * Ideally optimization passes wouldn't be part of the visitor so they
7021b8e80941Smrg    * wouldn't have access to bld at all, but they do, so just in case some
7022b8e80941Smrg    * pass forgets to ask for a location explicitly set it to NULL here to
7023b8e80941Smrg    * make it trip.  The dispatch width is initialized to a bogus value to
7024b8e80941Smrg    * make sure that optimizations set the execution controls explicitly to
7025b8e80941Smrg    * match the code they are manipulating instead of relying on the defaults.
7026b8e80941Smrg    */
7027b8e80941Smrg   bld = fs_builder(this, 64);
7028b8e80941Smrg
7029b8e80941Smrg   assign_constant_locations();
7030b8e80941Smrg   lower_constant_loads();
7031b8e80941Smrg
7032b8e80941Smrg   validate();
7033b8e80941Smrg
7034b8e80941Smrg   split_virtual_grfs();
7035b8e80941Smrg   validate();
7036b8e80941Smrg
7037b8e80941Smrg#define OPT(pass, args...) ({                                           \
7038b8e80941Smrg      pass_num++;                                                       \
7039b8e80941Smrg      bool this_progress = pass(args);                                  \
7040b8e80941Smrg                                                                        \
7041b8e80941Smrg      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
7042b8e80941Smrg         char filename[64];                                             \
7043b8e80941Smrg         snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass,              \
7044b8e80941Smrg                  stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
7045b8e80941Smrg                                                                        \
7046b8e80941Smrg         backend_shader::dump_instructions(filename);                   \
7047b8e80941Smrg      }                                                                 \
7048b8e80941Smrg                                                                        \
7049b8e80941Smrg      validate();                                                       \
7050b8e80941Smrg                                                                        \
7051b8e80941Smrg      progress = progress || this_progress;                             \
7052b8e80941Smrg      this_progress;                                                    \
7053b8e80941Smrg   })
7054b8e80941Smrg
7055b8e80941Smrg   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
7056b8e80941Smrg      char filename[64];
7057b8e80941Smrg      snprintf(filename, 64, "%s%d-%s-00-00-start",
7058b8e80941Smrg               stage_abbrev, dispatch_width, nir->info.name);
7059b8e80941Smrg
7060b8e80941Smrg      backend_shader::dump_instructions(filename);
7061b8e80941Smrg   }
7062b8e80941Smrg
7063b8e80941Smrg   bool progress = false;
7064b8e80941Smrg   int iteration = 0;
7065b8e80941Smrg   int pass_num = 0;
7066b8e80941Smrg
7067b8e80941Smrg   /* Before anything else, eliminate dead code.  The results of some NIR
7068b8e80941Smrg    * instructions may effectively be calculated twice.  Once when the
7069b8e80941Smrg    * instruction is encountered, and again when the user of that result is
7070b8e80941Smrg    * encountered.  Wipe those away before algebraic optimizations and
7071b8e80941Smrg    * especially copy propagation can mix things up.
7072b8e80941Smrg    */
7073b8e80941Smrg   OPT(dead_code_eliminate);
7074b8e80941Smrg
7075b8e80941Smrg   OPT(remove_extra_rounding_modes);
7076b8e80941Smrg
7077b8e80941Smrg   do {
7078b8e80941Smrg      progress = false;
7079b8e80941Smrg      pass_num = 0;
7080b8e80941Smrg      iteration++;
7081b8e80941Smrg
7082b8e80941Smrg      OPT(remove_duplicate_mrf_writes);
7083b8e80941Smrg
7084b8e80941Smrg      OPT(opt_algebraic);
7085b8e80941Smrg      OPT(opt_cse);
7086b8e80941Smrg      OPT(opt_copy_propagation);
7087b8e80941Smrg      OPT(opt_predicated_break, this);
7088b8e80941Smrg      OPT(opt_cmod_propagation);
7089b8e80941Smrg      OPT(dead_code_eliminate);
7090b8e80941Smrg      OPT(opt_peephole_sel);
7091b8e80941Smrg      OPT(dead_control_flow_eliminate, this);
7092b8e80941Smrg      OPT(opt_register_renaming);
7093b8e80941Smrg      OPT(opt_saturate_propagation);
7094b8e80941Smrg      OPT(register_coalesce);
7095b8e80941Smrg      OPT(compute_to_mrf);
7096b8e80941Smrg      OPT(eliminate_find_live_channel);
7097b8e80941Smrg
7098b8e80941Smrg      OPT(compact_virtual_grfs);
7099b8e80941Smrg   } while (progress);
7100b8e80941Smrg
7101b8e80941Smrg   if (OPT(lower_linterp)) {
7102b8e80941Smrg      OPT(opt_copy_propagation);
7103b8e80941Smrg      OPT(dead_code_eliminate);
7104b8e80941Smrg   }
7105b8e80941Smrg
7106b8e80941Smrg   /* Do this after cmod propagation has had every possible opportunity to
7107b8e80941Smrg    * propagate results into SEL instructions.
7108b8e80941Smrg    */
7109b8e80941Smrg   if (OPT(opt_peephole_csel))
7110b8e80941Smrg      OPT(dead_code_eliminate);
7111b8e80941Smrg
7112b8e80941Smrg   progress = false;
7113b8e80941Smrg   pass_num = 0;
7114b8e80941Smrg
7115b8e80941Smrg   if (OPT(lower_pack)) {
7116b8e80941Smrg      OPT(register_coalesce);
7117b8e80941Smrg      OPT(dead_code_eliminate);
7118b8e80941Smrg   }
7119b8e80941Smrg
7120b8e80941Smrg   OPT(lower_simd_width);
7121b8e80941Smrg
7122b8e80941Smrg   /* After SIMD lowering just in case we had to unroll the EOT send. */
7123b8e80941Smrg   OPT(opt_sampler_eot);
7124b8e80941Smrg
7125b8e80941Smrg   OPT(lower_logical_sends);
7126b8e80941Smrg
7127b8e80941Smrg   if (progress) {
7128b8e80941Smrg      OPT(opt_copy_propagation);
7129b8e80941Smrg      /* Only run after logical send lowering because it's easier to implement
7130b8e80941Smrg       * in terms of physical sends.
7131b8e80941Smrg       */
7132b8e80941Smrg      if (OPT(opt_zero_samples))
7133b8e80941Smrg         OPT(opt_copy_propagation);
7134b8e80941Smrg      /* Run after logical send lowering to give it a chance to CSE the
7135b8e80941Smrg       * LOAD_PAYLOAD instructions created to construct the payloads of
7136b8e80941Smrg       * e.g. texturing messages in cases where it wasn't possible to CSE the
7137b8e80941Smrg       * whole logical instruction.
7138b8e80941Smrg       */
7139b8e80941Smrg      OPT(opt_cse);
7140b8e80941Smrg      OPT(register_coalesce);
7141b8e80941Smrg      OPT(compute_to_mrf);
7142b8e80941Smrg      OPT(dead_code_eliminate);
7143b8e80941Smrg      OPT(remove_duplicate_mrf_writes);
7144b8e80941Smrg      OPT(opt_peephole_sel);
7145b8e80941Smrg   }
7146b8e80941Smrg
7147b8e80941Smrg   OPT(opt_redundant_discard_jumps);
7148b8e80941Smrg
7149b8e80941Smrg   if (OPT(lower_load_payload)) {
7150b8e80941Smrg      split_virtual_grfs();
7151b8e80941Smrg      OPT(register_coalesce);
7152b8e80941Smrg      OPT(lower_simd_width);
7153b8e80941Smrg      OPT(compute_to_mrf);
7154b8e80941Smrg      OPT(dead_code_eliminate);
7155b8e80941Smrg   }
7156b8e80941Smrg
7157b8e80941Smrg   OPT(opt_combine_constants);
7158b8e80941Smrg   OPT(lower_integer_multiplication);
7159b8e80941Smrg
7160b8e80941Smrg   if (devinfo->gen <= 5 && OPT(lower_minmax)) {
7161b8e80941Smrg      OPT(opt_cmod_propagation);
7162b8e80941Smrg      OPT(opt_cse);
7163b8e80941Smrg      OPT(opt_copy_propagation);
7164b8e80941Smrg      OPT(dead_code_eliminate);
7165b8e80941Smrg   }
7166b8e80941Smrg
7167b8e80941Smrg   if (OPT(lower_regioning)) {
7168b8e80941Smrg      OPT(opt_copy_propagation);
7169b8e80941Smrg      OPT(dead_code_eliminate);
7170b8e80941Smrg      OPT(lower_simd_width);
7171b8e80941Smrg   }
7172b8e80941Smrg
7173b8e80941Smrg   OPT(fixup_sends_duplicate_payload);
7174b8e80941Smrg
7175b8e80941Smrg   lower_uniform_pull_constant_loads();
7176b8e80941Smrg
7177b8e80941Smrg   validate();
7178b8e80941Smrg}
7179b8e80941Smrg
7180b8e80941Smrg/**
7181b8e80941Smrg * From the Skylake PRM Vol. 2a docs for sends:
7182b8e80941Smrg *
7183b8e80941Smrg *    "It is required that the second block of GRFs does not overlap with the
7184b8e80941Smrg *    first block."
7185b8e80941Smrg *
7186b8e80941Smrg * There are plenty of cases where we may accidentally violate this due to
7187b8e80941Smrg * having, for instance, both sources be the constant 0.  This little pass
7188b8e80941Smrg * just adds a new vgrf for the second payload and copies it over.
7189b8e80941Smrg */
7190b8e80941Smrgbool
7191b8e80941Smrgfs_visitor::fixup_sends_duplicate_payload()
7192b8e80941Smrg{
7193b8e80941Smrg   bool progress = false;
7194b8e80941Smrg
7195b8e80941Smrg   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
7196b8e80941Smrg      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
7197b8e80941Smrg          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
7198b8e80941Smrg                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
7199b8e80941Smrg         fs_reg tmp = fs_reg(VGRF, alloc.allocate(inst->ex_mlen),
7200b8e80941Smrg                             BRW_REGISTER_TYPE_UD);
7201b8e80941Smrg         /* Sadly, we've lost all notion of channels and bit sizes at this
7202b8e80941Smrg          * point.  Just WE_all it.
7203b8e80941Smrg          */
7204b8e80941Smrg         const fs_builder ibld = bld.at(block, inst).exec_all().group(16, 0);
7205b8e80941Smrg         fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
7206b8e80941Smrg         fs_reg copy_dst = tmp;
7207b8e80941Smrg         for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
7208b8e80941Smrg            if (inst->ex_mlen == i + 1) {
7209b8e80941Smrg               /* Only one register left; do SIMD8 */
7210b8e80941Smrg               ibld.group(8, 0).MOV(copy_dst, copy_src);
7211b8e80941Smrg            } else {
7212b8e80941Smrg               ibld.MOV(copy_dst, copy_src);
7213b8e80941Smrg            }
7214b8e80941Smrg            copy_src = offset(copy_src, ibld, 1);
7215b8e80941Smrg            copy_dst = offset(copy_dst, ibld, 1);
7216b8e80941Smrg         }
7217b8e80941Smrg         inst->src[3] = tmp;
7218b8e80941Smrg         progress = true;
7219b8e80941Smrg      }
7220b8e80941Smrg   }
7221b8e80941Smrg
7222b8e80941Smrg   if (progress)
7223b8e80941Smrg      invalidate_live_intervals();
7224b8e80941Smrg
7225b8e80941Smrg   return progress;
7226b8e80941Smrg}
7227b8e80941Smrg
7228b8e80941Smrg/**
7229b8e80941Smrg * Three source instruction must have a GRF/MRF destination register.
7230b8e80941Smrg * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
7231b8e80941Smrg */
7232b8e80941Smrgvoid
7233b8e80941Smrgfs_visitor::fixup_3src_null_dest()
7234b8e80941Smrg{
7235b8e80941Smrg   bool progress = false;
7236b8e80941Smrg
7237b8e80941Smrg   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
7238b8e80941Smrg      if (inst->is_3src(devinfo) && inst->dst.is_null()) {
7239b8e80941Smrg         inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
7240b8e80941Smrg                            inst->dst.type);
7241b8e80941Smrg         progress = true;
7242b8e80941Smrg      }
7243b8e80941Smrg   }
7244b8e80941Smrg
7245b8e80941Smrg   if (progress)
7246b8e80941Smrg      invalidate_live_intervals();
7247b8e80941Smrg}
7248b8e80941Smrg
7249b8e80941Smrgvoid
7250b8e80941Smrgfs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
7251b8e80941Smrg{
7252b8e80941Smrg   bool allocated_without_spills;
7253b8e80941Smrg
7254b8e80941Smrg   static const enum instruction_scheduler_mode pre_modes[] = {
7255b8e80941Smrg      SCHEDULE_PRE,
7256b8e80941Smrg      SCHEDULE_PRE_NON_LIFO,
7257b8e80941Smrg      SCHEDULE_PRE_LIFO,
7258b8e80941Smrg   };
7259b8e80941Smrg
7260b8e80941Smrg   bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
7261b8e80941Smrg
7262b8e80941Smrg   /* Try each scheduling heuristic to see if it can successfully register
7263b8e80941Smrg    * allocate without spilling.  They should be ordered by decreasing
7264b8e80941Smrg    * performance but increasing likelihood of allocating.
7265b8e80941Smrg    */
7266b8e80941Smrg   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
7267b8e80941Smrg      schedule_instructions(pre_modes[i]);
7268b8e80941Smrg
7269b8e80941Smrg      if (0) {
7270b8e80941Smrg         assign_regs_trivial();
7271b8e80941Smrg         allocated_without_spills = true;
7272b8e80941Smrg      } else {
7273b8e80941Smrg         allocated_without_spills = assign_regs(false, spill_all);
7274b8e80941Smrg      }
7275b8e80941Smrg      if (allocated_without_spills)
7276b8e80941Smrg         break;
7277b8e80941Smrg   }
7278b8e80941Smrg
7279b8e80941Smrg   if (!allocated_without_spills) {
7280b8e80941Smrg      if (!allow_spilling)
7281b8e80941Smrg         fail("Failure to register allocate and spilling is not allowed.");
7282b8e80941Smrg
7283b8e80941Smrg      /* We assume that any spilling is worse than just dropping back to
7284b8e80941Smrg       * SIMD8.  There's probably actually some intermediate point where
7285b8e80941Smrg       * SIMD16 with a couple of spills is still better.
7286b8e80941Smrg       */
7287b8e80941Smrg      if (dispatch_width > min_dispatch_width) {
7288b8e80941Smrg         fail("Failure to register allocate.  Reduce number of "
7289b8e80941Smrg              "live scalar values to avoid this.");
7290b8e80941Smrg      } else {
7291b8e80941Smrg         compiler->shader_perf_log(log_data,
7292b8e80941Smrg                                   "%s shader triggered register spilling.  "
7293b8e80941Smrg                                   "Try reducing the number of live scalar "
7294b8e80941Smrg                                   "values to improve performance.\n",
7295b8e80941Smrg                                   stage_name);
7296b8e80941Smrg      }
7297b8e80941Smrg
7298b8e80941Smrg      /* Since we're out of heuristics, just go spill registers until we
7299b8e80941Smrg       * get an allocation.
7300b8e80941Smrg       */
7301b8e80941Smrg      while (!assign_regs(true, spill_all)) {
7302b8e80941Smrg         if (failed)
7303b8e80941Smrg            break;
7304b8e80941Smrg      }
7305b8e80941Smrg   }
7306b8e80941Smrg
7307b8e80941Smrg   /* This must come after all optimization and register allocation, since
7308b8e80941Smrg    * it inserts dead code that happens to have side effects, and it does
7309b8e80941Smrg    * so based on the actual physical registers in use.
7310b8e80941Smrg    */
7311b8e80941Smrg   insert_gen4_send_dependency_workarounds();
7312b8e80941Smrg
7313b8e80941Smrg   if (failed)
7314b8e80941Smrg      return;
7315b8e80941Smrg
7316b8e80941Smrg   opt_bank_conflicts();
7317b8e80941Smrg
7318b8e80941Smrg   schedule_instructions(SCHEDULE_POST);
7319b8e80941Smrg
7320b8e80941Smrg   if (last_scratch > 0) {
7321b8e80941Smrg      MAYBE_UNUSED unsigned max_scratch_size = 2 * 1024 * 1024;
7322b8e80941Smrg
7323b8e80941Smrg      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
7324b8e80941Smrg
7325b8e80941Smrg      if (stage == MESA_SHADER_COMPUTE) {
7326b8e80941Smrg         if (devinfo->is_haswell) {
7327b8e80941Smrg            /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
7328b8e80941Smrg             * field documentation, Haswell supports a minimum of 2kB of
7329b8e80941Smrg             * scratch space for compute shaders, unlike every other stage
7330b8e80941Smrg             * and platform.
7331b8e80941Smrg             */
7332b8e80941Smrg            prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
7333b8e80941Smrg         } else if (devinfo->gen <= 7) {
7334b8e80941Smrg            /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
7335b8e80941Smrg             * field documentation, platforms prior to Haswell measure scratch
7336b8e80941Smrg             * size linearly with a range of [1kB, 12kB] and 1kB granularity.
7337b8e80941Smrg             */
7338b8e80941Smrg            prog_data->total_scratch = ALIGN(last_scratch, 1024);
7339b8e80941Smrg            max_scratch_size = 12 * 1024;
7340b8e80941Smrg         }
7341b8e80941Smrg      }
7342b8e80941Smrg
7343b8e80941Smrg      /* We currently only support up to 2MB of scratch space.  If we
7344b8e80941Smrg       * need to support more eventually, the documentation suggests
7345b8e80941Smrg       * that we could allocate a larger buffer, and partition it out
7346b8e80941Smrg       * ourselves.  We'd just have to undo the hardware's address
7347b8e80941Smrg       * calculation by subtracting (FFTID * Per Thread Scratch Space)
7348b8e80941Smrg       * and then add FFTID * (Larger Per Thread Scratch Space).
7349b8e80941Smrg       *
7350b8e80941Smrg       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
7351b8e80941Smrg       * Thread Group Tracking > Local Memory/Scratch Space.
7352b8e80941Smrg       */
7353b8e80941Smrg      assert(prog_data->total_scratch < max_scratch_size);
7354b8e80941Smrg   }
7355b8e80941Smrg}
7356b8e80941Smrg
7357b8e80941Smrgbool
7358b8e80941Smrgfs_visitor::run_vs()
7359b8e80941Smrg{
7360b8e80941Smrg   assert(stage == MESA_SHADER_VERTEX);
7361b8e80941Smrg
7362b8e80941Smrg   setup_vs_payload();
7363b8e80941Smrg
7364b8e80941Smrg   if (shader_time_index >= 0)
7365b8e80941Smrg      emit_shader_time_begin();
7366b8e80941Smrg
7367b8e80941Smrg   emit_nir_code();
7368b8e80941Smrg
7369b8e80941Smrg   if (failed)
7370b8e80941Smrg      return false;
7371b8e80941Smrg
7372b8e80941Smrg   compute_clip_distance();
7373b8e80941Smrg
7374b8e80941Smrg   emit_urb_writes();
7375b8e80941Smrg
7376b8e80941Smrg   if (shader_time_index >= 0)
7377b8e80941Smrg      emit_shader_time_end();
7378b8e80941Smrg
7379b8e80941Smrg   calculate_cfg();
7380b8e80941Smrg
7381b8e80941Smrg   optimize();
7382b8e80941Smrg
7383b8e80941Smrg   assign_curb_setup();
7384b8e80941Smrg   assign_vs_urb_setup();
7385b8e80941Smrg
7386b8e80941Smrg   fixup_3src_null_dest();
7387b8e80941Smrg   allocate_registers(8, true);
7388b8e80941Smrg
7389b8e80941Smrg   return !failed;
7390b8e80941Smrg}
7391b8e80941Smrg
7392b8e80941Smrgbool
7393b8e80941Smrgfs_visitor::run_tcs_single_patch()
7394b8e80941Smrg{
7395b8e80941Smrg   assert(stage == MESA_SHADER_TESS_CTRL);
7396b8e80941Smrg
7397b8e80941Smrg   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
7398b8e80941Smrg
7399b8e80941Smrg   /* r1-r4 contain the ICP handles. */
7400b8e80941Smrg   payload.num_regs = 5;
7401b8e80941Smrg
7402b8e80941Smrg   if (shader_time_index >= 0)
7403b8e80941Smrg      emit_shader_time_begin();
7404b8e80941Smrg
7405b8e80941Smrg   /* Initialize gl_InvocationID */
7406b8e80941Smrg   fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
7407b8e80941Smrg   fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
7408b8e80941Smrg   bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
7409b8e80941Smrg   bld.MOV(channels_ud, channels_uw);
7410b8e80941Smrg
7411b8e80941Smrg   if (tcs_prog_data->instances == 1) {
7412b8e80941Smrg      invocation_id = channels_ud;
7413b8e80941Smrg   } else {
7414b8e80941Smrg      const unsigned invocation_id_mask = devinfo->gen >= 11 ?
7415b8e80941Smrg         INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
7416b8e80941Smrg      const unsigned invocation_id_shift = devinfo->gen >= 11 ? 16 : 17;
7417b8e80941Smrg
7418b8e80941Smrg      invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
7419b8e80941Smrg
7420b8e80941Smrg      /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
7421b8e80941Smrg      fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
7422b8e80941Smrg      fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
7423b8e80941Smrg      bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
7424b8e80941Smrg              brw_imm_ud(invocation_id_mask));
7425b8e80941Smrg      bld.SHR(instance_times_8, t, brw_imm_ud(invocation_id_shift - 3));
7426b8e80941Smrg
7427b8e80941Smrg      bld.ADD(invocation_id, instance_times_8, channels_ud);
7428b8e80941Smrg   }
7429b8e80941Smrg
7430b8e80941Smrg   /* Fix the disptach mask */
7431b8e80941Smrg   if (nir->info.tess.tcs_vertices_out % 8) {
7432b8e80941Smrg      bld.CMP(bld.null_reg_ud(), invocation_id,
7433b8e80941Smrg              brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
7434b8e80941Smrg      bld.IF(BRW_PREDICATE_NORMAL);
7435b8e80941Smrg   }
7436b8e80941Smrg
7437b8e80941Smrg   emit_nir_code();
7438b8e80941Smrg
7439b8e80941Smrg   if (nir->info.tess.tcs_vertices_out % 8) {
7440b8e80941Smrg      bld.emit(BRW_OPCODE_ENDIF);
7441b8e80941Smrg   }
7442b8e80941Smrg
7443b8e80941Smrg   /* Emit EOT write; set TR DS Cache bit */
7444b8e80941Smrg   fs_reg srcs[3] = {
7445b8e80941Smrg      fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
7446b8e80941Smrg      fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
7447b8e80941Smrg      fs_reg(brw_imm_ud(0)),
7448b8e80941Smrg   };
7449b8e80941Smrg   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
7450b8e80941Smrg   bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
7451b8e80941Smrg
7452b8e80941Smrg   fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
7453b8e80941Smrg                            bld.null_reg_ud(), payload);
7454b8e80941Smrg   inst->mlen = 3;
7455b8e80941Smrg   inst->eot = true;
7456b8e80941Smrg
7457b8e80941Smrg   if (shader_time_index >= 0)
7458b8e80941Smrg      emit_shader_time_end();
7459b8e80941Smrg
7460b8e80941Smrg   if (failed)
7461b8e80941Smrg      return false;
7462b8e80941Smrg
7463b8e80941Smrg   calculate_cfg();
7464b8e80941Smrg
7465b8e80941Smrg   optimize();
7466b8e80941Smrg
7467b8e80941Smrg   assign_curb_setup();
7468b8e80941Smrg   assign_tcs_single_patch_urb_setup();
7469b8e80941Smrg
7470b8e80941Smrg   fixup_3src_null_dest();
7471b8e80941Smrg   allocate_registers(8, true);
7472b8e80941Smrg
7473b8e80941Smrg   return !failed;
7474b8e80941Smrg}
7475b8e80941Smrg
7476b8e80941Smrgbool
7477b8e80941Smrgfs_visitor::run_tes()
7478b8e80941Smrg{
7479b8e80941Smrg   assert(stage == MESA_SHADER_TESS_EVAL);
7480b8e80941Smrg
7481b8e80941Smrg   /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
7482b8e80941Smrg   payload.num_regs = 5;
7483b8e80941Smrg
7484b8e80941Smrg   if (shader_time_index >= 0)
7485b8e80941Smrg      emit_shader_time_begin();
7486b8e80941Smrg
7487b8e80941Smrg   emit_nir_code();
7488b8e80941Smrg
7489b8e80941Smrg   if (failed)
7490b8e80941Smrg      return false;
7491b8e80941Smrg
7492b8e80941Smrg   emit_urb_writes();
7493b8e80941Smrg
7494b8e80941Smrg   if (shader_time_index >= 0)
7495b8e80941Smrg      emit_shader_time_end();
7496b8e80941Smrg
7497b8e80941Smrg   calculate_cfg();
7498b8e80941Smrg
7499b8e80941Smrg   optimize();
7500b8e80941Smrg
7501b8e80941Smrg   assign_curb_setup();
7502b8e80941Smrg   assign_tes_urb_setup();
7503b8e80941Smrg
7504b8e80941Smrg   fixup_3src_null_dest();
7505b8e80941Smrg   allocate_registers(8, true);
7506b8e80941Smrg
7507b8e80941Smrg   return !failed;
7508b8e80941Smrg}
7509b8e80941Smrg
7510b8e80941Smrgbool
7511b8e80941Smrgfs_visitor::run_gs()
7512b8e80941Smrg{
7513b8e80941Smrg   assert(stage == MESA_SHADER_GEOMETRY);
7514b8e80941Smrg
7515b8e80941Smrg   setup_gs_payload();
7516b8e80941Smrg
7517b8e80941Smrg   this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
7518b8e80941Smrg
7519b8e80941Smrg   if (gs_compile->control_data_header_size_bits > 0) {
7520b8e80941Smrg      /* Create a VGRF to store accumulated control data bits. */
7521b8e80941Smrg      this->control_data_bits = vgrf(glsl_type::uint_type);
7522b8e80941Smrg
7523b8e80941Smrg      /* If we're outputting more than 32 control data bits, then EmitVertex()
7524b8e80941Smrg       * will set control_data_bits to 0 after emitting the first vertex.
7525b8e80941Smrg       * Otherwise, we need to initialize it to 0 here.
7526b8e80941Smrg       */
7527b8e80941Smrg      if (gs_compile->control_data_header_size_bits <= 32) {
7528b8e80941Smrg         const fs_builder abld = bld.annotate("initialize control data bits");
7529b8e80941Smrg         abld.MOV(this->control_data_bits, brw_imm_ud(0u));
7530b8e80941Smrg      }
7531b8e80941Smrg   }
7532b8e80941Smrg
7533b8e80941Smrg   if (shader_time_index >= 0)
7534b8e80941Smrg      emit_shader_time_begin();
7535b8e80941Smrg
7536b8e80941Smrg   emit_nir_code();
7537b8e80941Smrg
7538b8e80941Smrg   emit_gs_thread_end();
7539b8e80941Smrg
7540b8e80941Smrg   if (shader_time_index >= 0)
7541b8e80941Smrg      emit_shader_time_end();
7542b8e80941Smrg
7543b8e80941Smrg   if (failed)
7544b8e80941Smrg      return false;
7545b8e80941Smrg
7546b8e80941Smrg   calculate_cfg();
7547b8e80941Smrg
7548b8e80941Smrg   optimize();
7549b8e80941Smrg
7550b8e80941Smrg   assign_curb_setup();
7551b8e80941Smrg   assign_gs_urb_setup();
7552b8e80941Smrg
7553b8e80941Smrg   fixup_3src_null_dest();
7554b8e80941Smrg   allocate_registers(8, true);
7555b8e80941Smrg
7556b8e80941Smrg   return !failed;
7557b8e80941Smrg}
7558b8e80941Smrg
7559b8e80941Smrg/* From the SKL PRM, Volume 16, Workarounds:
7560b8e80941Smrg *
7561b8e80941Smrg *   0877  3D   Pixel Shader Hang possible when pixel shader dispatched with
7562b8e80941Smrg *              only header phases (R0-R2)
7563b8e80941Smrg *
7564b8e80941Smrg *   WA: Enable a non-header phase (e.g. push constant) when dispatch would
7565b8e80941Smrg *       have been header only.
7566b8e80941Smrg *
7567b8e80941Smrg * Instead of enabling push constants one can alternatively enable one of the
7568b8e80941Smrg * inputs. Here one simply chooses "layer" which shouldn't impose much
7569b8e80941Smrg * overhead.
7570b8e80941Smrg */
7571b8e80941Smrgstatic void
7572b8e80941Smrggen9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
7573b8e80941Smrg{
7574b8e80941Smrg   if (wm_prog_data->num_varying_inputs)
7575b8e80941Smrg      return;
7576b8e80941Smrg
7577b8e80941Smrg   if (wm_prog_data->base.curb_read_length)
7578b8e80941Smrg      return;
7579b8e80941Smrg
7580b8e80941Smrg   wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
7581b8e80941Smrg   wm_prog_data->num_varying_inputs = 1;
7582b8e80941Smrg}
7583b8e80941Smrg
7584b8e80941Smrgbool
7585b8e80941Smrgfs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
7586b8e80941Smrg{
7587b8e80941Smrg   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
7588b8e80941Smrg   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
7589b8e80941Smrg
7590b8e80941Smrg   assert(stage == MESA_SHADER_FRAGMENT);
7591b8e80941Smrg
7592b8e80941Smrg   if (devinfo->gen >= 6)
7593b8e80941Smrg      setup_fs_payload_gen6();
7594b8e80941Smrg   else
7595b8e80941Smrg      setup_fs_payload_gen4();
7596b8e80941Smrg
7597b8e80941Smrg   if (0) {
7598b8e80941Smrg      emit_dummy_fs();
7599b8e80941Smrg   } else if (do_rep_send) {
7600b8e80941Smrg      assert(dispatch_width == 16);
7601b8e80941Smrg      emit_repclear_shader();
7602b8e80941Smrg   } else {
7603b8e80941Smrg      if (shader_time_index >= 0)
7604b8e80941Smrg         emit_shader_time_begin();
7605b8e80941Smrg
7606b8e80941Smrg      calculate_urb_setup();
7607b8e80941Smrg      if (nir->info.inputs_read > 0 ||
7608b8e80941Smrg          (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
7609b8e80941Smrg         if (devinfo->gen < 6)
7610b8e80941Smrg            emit_interpolation_setup_gen4();
7611b8e80941Smrg         else
7612b8e80941Smrg            emit_interpolation_setup_gen6();
7613b8e80941Smrg      }
7614b8e80941Smrg
7615b8e80941Smrg      /* We handle discards by keeping track of the still-live pixels in f0.1.
7616b8e80941Smrg       * Initialize it with the dispatched pixels.
7617b8e80941Smrg       */
7618b8e80941Smrg      if (wm_prog_data->uses_kill) {
7619b8e80941Smrg         const fs_reg dispatch_mask =
7620b8e80941Smrg            devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0);
7621b8e80941Smrg         bld.exec_all().group(1, 0)
7622b8e80941Smrg            .MOV(retype(brw_flag_reg(0, 1), BRW_REGISTER_TYPE_UW),
7623b8e80941Smrg                 retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
7624b8e80941Smrg      }
7625b8e80941Smrg
7626b8e80941Smrg      emit_nir_code();
7627b8e80941Smrg
7628b8e80941Smrg      if (failed)
7629b8e80941Smrg	 return false;
7630b8e80941Smrg
7631b8e80941Smrg      if (wm_prog_data->uses_kill)
7632b8e80941Smrg         bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
7633b8e80941Smrg
7634b8e80941Smrg      if (wm_key->alpha_test_func)
7635b8e80941Smrg         emit_alpha_test();
7636b8e80941Smrg
7637b8e80941Smrg      emit_fb_writes();
7638b8e80941Smrg
7639b8e80941Smrg      if (shader_time_index >= 0)
7640b8e80941Smrg         emit_shader_time_end();
7641b8e80941Smrg
7642b8e80941Smrg      calculate_cfg();
7643b8e80941Smrg
7644b8e80941Smrg      optimize();
7645b8e80941Smrg
7646b8e80941Smrg      assign_curb_setup();
7647b8e80941Smrg
7648b8e80941Smrg      if (devinfo->gen >= 9)
7649b8e80941Smrg         gen9_ps_header_only_workaround(wm_prog_data);
7650b8e80941Smrg
7651b8e80941Smrg      assign_urb_setup();
7652b8e80941Smrg
7653b8e80941Smrg      fixup_3src_null_dest();
7654b8e80941Smrg      allocate_registers(8, allow_spilling);
7655b8e80941Smrg
7656b8e80941Smrg      if (failed)
7657b8e80941Smrg         return false;
7658b8e80941Smrg   }
7659b8e80941Smrg
7660b8e80941Smrg   return !failed;
7661b8e80941Smrg}
7662b8e80941Smrg
7663b8e80941Smrgbool
7664b8e80941Smrgfs_visitor::run_cs(unsigned min_dispatch_width)
7665b8e80941Smrg{
7666b8e80941Smrg   assert(stage == MESA_SHADER_COMPUTE);
7667b8e80941Smrg   assert(dispatch_width >= min_dispatch_width);
7668b8e80941Smrg
7669b8e80941Smrg   setup_cs_payload();
7670b8e80941Smrg
7671b8e80941Smrg   if (shader_time_index >= 0)
7672b8e80941Smrg      emit_shader_time_begin();
7673b8e80941Smrg
7674b8e80941Smrg   if (devinfo->is_haswell && prog_data->total_shared > 0) {
7675b8e80941Smrg      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
7676b8e80941Smrg      const fs_builder abld = bld.exec_all().group(1, 0);
7677b8e80941Smrg      abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
7678b8e80941Smrg               suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
7679b8e80941Smrg   }
7680b8e80941Smrg
7681b8e80941Smrg   emit_nir_code();
7682b8e80941Smrg
7683b8e80941Smrg   if (failed)
7684b8e80941Smrg      return false;
7685b8e80941Smrg
7686b8e80941Smrg   emit_cs_terminate();
7687b8e80941Smrg
7688b8e80941Smrg   if (shader_time_index >= 0)
7689b8e80941Smrg      emit_shader_time_end();
7690b8e80941Smrg
7691b8e80941Smrg   calculate_cfg();
7692b8e80941Smrg
7693b8e80941Smrg   optimize();
7694b8e80941Smrg
7695b8e80941Smrg   assign_curb_setup();
7696b8e80941Smrg
7697b8e80941Smrg   fixup_3src_null_dest();
7698b8e80941Smrg   allocate_registers(min_dispatch_width, true);
7699b8e80941Smrg
7700b8e80941Smrg   if (failed)
7701b8e80941Smrg      return false;
7702b8e80941Smrg
7703b8e80941Smrg   return !failed;
7704b8e80941Smrg}
7705b8e80941Smrg
7706b8e80941Smrg/**
7707b8e80941Smrg * Return a bitfield where bit n is set if barycentric interpolation mode n
7708b8e80941Smrg * (see enum brw_barycentric_mode) is needed by the fragment shader.
7709b8e80941Smrg *
7710b8e80941Smrg * We examine the load_barycentric intrinsics rather than looking at input
7711b8e80941Smrg * variables so that we catch interpolateAtCentroid() messages too, which
7712b8e80941Smrg * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
7713b8e80941Smrg */
7714b8e80941Smrgstatic unsigned
7715b8e80941Smrgbrw_compute_barycentric_interp_modes(const struct gen_device_info *devinfo,
7716b8e80941Smrg                                     const nir_shader *shader)
7717b8e80941Smrg{
7718b8e80941Smrg   unsigned barycentric_interp_modes = 0;
7719b8e80941Smrg
7720b8e80941Smrg   nir_foreach_function(f, shader) {
7721b8e80941Smrg      if (!f->impl)
7722b8e80941Smrg         continue;
7723b8e80941Smrg
7724b8e80941Smrg      nir_foreach_block(block, f->impl) {
7725b8e80941Smrg         nir_foreach_instr(instr, block) {
7726b8e80941Smrg            if (instr->type != nir_instr_type_intrinsic)
7727b8e80941Smrg               continue;
7728b8e80941Smrg
7729b8e80941Smrg            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
7730b8e80941Smrg            if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
7731b8e80941Smrg               continue;
7732b8e80941Smrg
7733b8e80941Smrg            /* Ignore WPOS; it doesn't require interpolation. */
7734b8e80941Smrg            if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
7735b8e80941Smrg               continue;
7736b8e80941Smrg
7737b8e80941Smrg            intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
7738b8e80941Smrg            enum glsl_interp_mode interp = (enum glsl_interp_mode)
7739b8e80941Smrg               nir_intrinsic_interp_mode(intrin);
7740b8e80941Smrg            nir_intrinsic_op bary_op = intrin->intrinsic;
7741b8e80941Smrg            enum brw_barycentric_mode bary =
7742b8e80941Smrg               brw_barycentric_mode(interp, bary_op);
7743b8e80941Smrg
7744b8e80941Smrg            barycentric_interp_modes |= 1 << bary;
7745b8e80941Smrg
7746b8e80941Smrg            if (devinfo->needs_unlit_centroid_workaround &&
7747b8e80941Smrg                bary_op == nir_intrinsic_load_barycentric_centroid)
7748b8e80941Smrg               barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
7749b8e80941Smrg         }
7750b8e80941Smrg      }
7751b8e80941Smrg   }
7752b8e80941Smrg
7753b8e80941Smrg   return barycentric_interp_modes;
7754b8e80941Smrg}
7755b8e80941Smrg
7756b8e80941Smrgstatic void
7757b8e80941Smrgbrw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
7758b8e80941Smrg                        const nir_shader *shader)
7759b8e80941Smrg{
7760b8e80941Smrg   prog_data->flat_inputs = 0;
7761b8e80941Smrg
7762b8e80941Smrg   nir_foreach_variable(var, &shader->inputs) {
7763b8e80941Smrg      unsigned slots = glsl_count_attribute_slots(var->type, false);
7764b8e80941Smrg      for (unsigned s = 0; s < slots; s++) {
7765b8e80941Smrg         int input_index = prog_data->urb_setup[var->data.location + s];
7766b8e80941Smrg
7767b8e80941Smrg         if (input_index < 0)
7768b8e80941Smrg            continue;
7769b8e80941Smrg
7770b8e80941Smrg         /* flat shading */
7771b8e80941Smrg         if (var->data.interpolation == INTERP_MODE_FLAT)
7772b8e80941Smrg            prog_data->flat_inputs |= 1 << input_index;
7773b8e80941Smrg      }
7774b8e80941Smrg   }
7775b8e80941Smrg}
7776b8e80941Smrg
7777b8e80941Smrgstatic uint8_t
7778b8e80941Smrgcomputed_depth_mode(const nir_shader *shader)
7779b8e80941Smrg{
7780b8e80941Smrg   if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
7781b8e80941Smrg      switch (shader->info.fs.depth_layout) {
7782b8e80941Smrg      case FRAG_DEPTH_LAYOUT_NONE:
7783b8e80941Smrg      case FRAG_DEPTH_LAYOUT_ANY:
7784b8e80941Smrg         return BRW_PSCDEPTH_ON;
7785b8e80941Smrg      case FRAG_DEPTH_LAYOUT_GREATER:
7786b8e80941Smrg         return BRW_PSCDEPTH_ON_GE;
7787b8e80941Smrg      case FRAG_DEPTH_LAYOUT_LESS:
7788b8e80941Smrg         return BRW_PSCDEPTH_ON_LE;
7789b8e80941Smrg      case FRAG_DEPTH_LAYOUT_UNCHANGED:
7790b8e80941Smrg         return BRW_PSCDEPTH_OFF;
7791b8e80941Smrg      }
7792b8e80941Smrg   }
7793b8e80941Smrg   return BRW_PSCDEPTH_OFF;
7794b8e80941Smrg}
7795b8e80941Smrg
7796b8e80941Smrg/**
7797b8e80941Smrg * Move load_interpolated_input with simple (payload-based) barycentric modes
7798b8e80941Smrg * to the top of the program so we don't emit multiple PLNs for the same input.
7799b8e80941Smrg *
7800b8e80941Smrg * This works around CSE not being able to handle non-dominating cases
7801b8e80941Smrg * such as:
7802b8e80941Smrg *
7803b8e80941Smrg *    if (...) {
7804b8e80941Smrg *       interpolate input
7805b8e80941Smrg *    } else {
7806b8e80941Smrg *       interpolate the same exact input
7807b8e80941Smrg *    }
7808b8e80941Smrg *
7809b8e80941Smrg * This should be replaced by global value numbering someday.
7810b8e80941Smrg */
7811b8e80941Smrgstatic bool
7812b8e80941Smrgmove_interpolation_to_top(nir_shader *nir)
7813b8e80941Smrg{
7814b8e80941Smrg   bool progress = false;
7815b8e80941Smrg
7816b8e80941Smrg   nir_foreach_function(f, nir) {
7817b8e80941Smrg      if (!f->impl)
7818b8e80941Smrg         continue;
7819b8e80941Smrg
7820b8e80941Smrg      nir_block *top = nir_start_block(f->impl);
7821b8e80941Smrg      exec_node *cursor_node = NULL;
7822b8e80941Smrg
7823b8e80941Smrg      nir_foreach_block(block, f->impl) {
7824b8e80941Smrg         if (block == top)
7825b8e80941Smrg            continue;
7826b8e80941Smrg
7827b8e80941Smrg         nir_foreach_instr_safe(instr, block) {
7828b8e80941Smrg            if (instr->type != nir_instr_type_intrinsic)
7829b8e80941Smrg               continue;
7830b8e80941Smrg
7831b8e80941Smrg            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
7832b8e80941Smrg            if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
7833b8e80941Smrg               continue;
7834b8e80941Smrg            nir_intrinsic_instr *bary_intrinsic =
7835b8e80941Smrg               nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
7836b8e80941Smrg            nir_intrinsic_op op = bary_intrinsic->intrinsic;
7837b8e80941Smrg
7838b8e80941Smrg            /* Leave interpolateAtSample/Offset() where they are. */
7839b8e80941Smrg            if (op == nir_intrinsic_load_barycentric_at_sample ||
7840b8e80941Smrg                op == nir_intrinsic_load_barycentric_at_offset)
7841b8e80941Smrg               continue;
7842b8e80941Smrg
7843b8e80941Smrg            nir_instr *move[3] = {
7844b8e80941Smrg               &bary_intrinsic->instr,
7845b8e80941Smrg               intrin->src[1].ssa->parent_instr,
7846b8e80941Smrg               instr
7847b8e80941Smrg            };
7848b8e80941Smrg
7849b8e80941Smrg            for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
7850b8e80941Smrg               if (move[i]->block != top) {
7851b8e80941Smrg                  move[i]->block = top;
7852b8e80941Smrg                  exec_node_remove(&move[i]->node);
7853b8e80941Smrg                  if (cursor_node) {
7854b8e80941Smrg                     exec_node_insert_after(cursor_node, &move[i]->node);
7855b8e80941Smrg                  } else {
7856b8e80941Smrg                     exec_list_push_head(&top->instr_list, &move[i]->node);
7857b8e80941Smrg                  }
7858b8e80941Smrg                  cursor_node = &move[i]->node;
7859b8e80941Smrg                  progress = true;
7860b8e80941Smrg               }
7861b8e80941Smrg            }
7862b8e80941Smrg         }
7863b8e80941Smrg      }
7864b8e80941Smrg      nir_metadata_preserve(f->impl, (nir_metadata)
7865b8e80941Smrg                            ((unsigned) nir_metadata_block_index |
7866b8e80941Smrg                             (unsigned) nir_metadata_dominance));
7867b8e80941Smrg   }
7868b8e80941Smrg
7869b8e80941Smrg   return progress;
7870b8e80941Smrg}
7871b8e80941Smrg
7872b8e80941Smrg/**
7873b8e80941Smrg * Demote per-sample barycentric intrinsics to centroid.
7874b8e80941Smrg *
7875b8e80941Smrg * Useful when rendering to a non-multisampled buffer.
7876b8e80941Smrg */
7877b8e80941Smrgstatic bool
7878b8e80941Smrgdemote_sample_qualifiers(nir_shader *nir)
7879b8e80941Smrg{
7880b8e80941Smrg   bool progress = true;
7881b8e80941Smrg
7882b8e80941Smrg   nir_foreach_function(f, nir) {
7883b8e80941Smrg      if (!f->impl)
7884b8e80941Smrg         continue;
7885b8e80941Smrg
7886b8e80941Smrg      nir_builder b;
7887b8e80941Smrg      nir_builder_init(&b, f->impl);
7888b8e80941Smrg
7889b8e80941Smrg      nir_foreach_block(block, f->impl) {
7890b8e80941Smrg         nir_foreach_instr_safe(instr, block) {
7891b8e80941Smrg            if (instr->type != nir_instr_type_intrinsic)
7892b8e80941Smrg               continue;
7893b8e80941Smrg
7894b8e80941Smrg            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
7895b8e80941Smrg            if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample &&
7896b8e80941Smrg                intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)
7897b8e80941Smrg               continue;
7898b8e80941Smrg
7899b8e80941Smrg            b.cursor = nir_before_instr(instr);
7900b8e80941Smrg            nir_ssa_def *centroid =
7901b8e80941Smrg               nir_load_barycentric(&b, nir_intrinsic_load_barycentric_centroid,
7902b8e80941Smrg                                    nir_intrinsic_interp_mode(intrin));
7903b8e80941Smrg            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
7904b8e80941Smrg                                     nir_src_for_ssa(centroid));
7905b8e80941Smrg            nir_instr_remove(instr);
7906b8e80941Smrg            progress = true;
7907b8e80941Smrg         }
7908b8e80941Smrg      }
7909b8e80941Smrg
7910b8e80941Smrg      nir_metadata_preserve(f->impl, (nir_metadata)
7911b8e80941Smrg                            ((unsigned) nir_metadata_block_index |
7912b8e80941Smrg                             (unsigned) nir_metadata_dominance));
7913b8e80941Smrg   }
7914b8e80941Smrg
7915b8e80941Smrg   return progress;
7916b8e80941Smrg}
7917b8e80941Smrg
7918b8e80941Smrg/**
7919b8e80941Smrg * Pre-gen6, the register file of the EUs was shared between threads,
7920b8e80941Smrg * and each thread used some subset allocated on a 16-register block
7921b8e80941Smrg * granularity.  The unit states wanted these block counts.
7922b8e80941Smrg */
7923b8e80941Smrgstatic inline int
7924b8e80941Smrgbrw_register_blocks(int reg_count)
7925b8e80941Smrg{
7926b8e80941Smrg   return ALIGN(reg_count, 16) / 16 - 1;
7927b8e80941Smrg}
7928b8e80941Smrg
7929b8e80941Smrgconst unsigned *
7930b8e80941Smrgbrw_compile_fs(const struct brw_compiler *compiler, void *log_data,
7931b8e80941Smrg               void *mem_ctx,
7932b8e80941Smrg               const struct brw_wm_prog_key *key,
7933b8e80941Smrg               struct brw_wm_prog_data *prog_data,
7934b8e80941Smrg               nir_shader *shader,
7935b8e80941Smrg               struct gl_program *prog,
7936b8e80941Smrg               int shader_time_index8, int shader_time_index16,
7937b8e80941Smrg               int shader_time_index32, bool allow_spilling,
7938b8e80941Smrg               bool use_rep_send, struct brw_vue_map *vue_map,
7939b8e80941Smrg               char **error_str)
7940b8e80941Smrg{
7941b8e80941Smrg   const struct gen_device_info *devinfo = compiler->devinfo;
7942b8e80941Smrg
7943b8e80941Smrg   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
7944b8e80941Smrg   brw_nir_lower_fs_inputs(shader, devinfo, key);
7945b8e80941Smrg   brw_nir_lower_fs_outputs(shader);
7946b8e80941Smrg
7947b8e80941Smrg   if (devinfo->gen < 6)
7948b8e80941Smrg      brw_setup_vue_interpolation(vue_map, shader, prog_data);
7949b8e80941Smrg
7950b8e80941Smrg   if (!key->multisample_fbo)
7951b8e80941Smrg      NIR_PASS_V(shader, demote_sample_qualifiers);
7952b8e80941Smrg   NIR_PASS_V(shader, move_interpolation_to_top);
7953b8e80941Smrg   shader = brw_postprocess_nir(shader, compiler, true);
7954b8e80941Smrg
7955b8e80941Smrg   /* key->alpha_test_func means simulating alpha testing via discards,
7956b8e80941Smrg    * so the shader definitely kills pixels.
7957b8e80941Smrg    */
7958b8e80941Smrg   prog_data->uses_kill = shader->info.fs.uses_discard ||
7959b8e80941Smrg      key->alpha_test_func;
7960b8e80941Smrg   prog_data->uses_omask = key->multisample_fbo &&
7961b8e80941Smrg      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
7962b8e80941Smrg   prog_data->computed_depth_mode = computed_depth_mode(shader);
7963b8e80941Smrg   prog_data->computed_stencil =
7964b8e80941Smrg      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
7965b8e80941Smrg
7966b8e80941Smrg   prog_data->persample_dispatch =
7967b8e80941Smrg      key->multisample_fbo &&
7968b8e80941Smrg      (key->persample_interp ||
7969b8e80941Smrg       (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
7970b8e80941Smrg                                            SYSTEM_BIT_SAMPLE_POS)) ||
7971b8e80941Smrg       shader->info.fs.uses_sample_qualifier ||
7972b8e80941Smrg       shader->info.outputs_read);
7973b8e80941Smrg
7974b8e80941Smrg   prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;
7975b8e80941Smrg
7976b8e80941Smrg   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
7977b8e80941Smrg   prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
7978b8e80941Smrg   prog_data->inner_coverage = shader->info.fs.inner_coverage;
7979b8e80941Smrg
7980b8e80941Smrg   prog_data->barycentric_interp_modes =
7981b8e80941Smrg      brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
7982b8e80941Smrg
7983b8e80941Smrg   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
7984b8e80941Smrg
7985b8e80941Smrg   fs_visitor v8(compiler, log_data, mem_ctx, key,
7986b8e80941Smrg                 &prog_data->base, prog, shader, 8,
7987b8e80941Smrg                 shader_time_index8);
7988b8e80941Smrg   if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
7989b8e80941Smrg      if (error_str)
7990b8e80941Smrg         *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);
7991b8e80941Smrg
7992b8e80941Smrg      return NULL;
7993b8e80941Smrg   } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
7994b8e80941Smrg      simd8_cfg = v8.cfg;
7995b8e80941Smrg      prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
7996b8e80941Smrg      prog_data->reg_blocks_8 = brw_register_blocks(v8.grf_used);
7997b8e80941Smrg   }
7998b8e80941Smrg
7999b8e80941Smrg   if (v8.max_dispatch_width >= 16 &&
8000b8e80941Smrg       likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
8001b8e80941Smrg      /* Try a SIMD16 compile */
8002b8e80941Smrg      fs_visitor v16(compiler, log_data, mem_ctx, key,
8003b8e80941Smrg                     &prog_data->base, prog, shader, 16,
8004b8e80941Smrg                     shader_time_index16);
8005b8e80941Smrg      v16.import_uniforms(&v8);
8006b8e80941Smrg      if (!v16.run_fs(allow_spilling, use_rep_send)) {
8007b8e80941Smrg         compiler->shader_perf_log(log_data,
8008b8e80941Smrg                                   "SIMD16 shader failed to compile: %s",
8009b8e80941Smrg                                   v16.fail_msg);
8010b8e80941Smrg      } else {
8011b8e80941Smrg         simd16_cfg = v16.cfg;
8012b8e80941Smrg         prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
8013b8e80941Smrg         prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used);
8014b8e80941Smrg      }
8015b8e80941Smrg   }
8016b8e80941Smrg
8017b8e80941Smrg   /* Currently, the compiler only supports SIMD32 on SNB+ */
8018b8e80941Smrg   if (v8.max_dispatch_width >= 32 && !use_rep_send &&
8019b8e80941Smrg       compiler->devinfo->gen >= 6 &&
8020b8e80941Smrg       unlikely(INTEL_DEBUG & DEBUG_DO32)) {
8021b8e80941Smrg      /* Try a SIMD32 compile */
8022b8e80941Smrg      fs_visitor v32(compiler, log_data, mem_ctx, key,
8023b8e80941Smrg                     &prog_data->base, prog, shader, 32,
8024b8e80941Smrg                     shader_time_index32);
8025b8e80941Smrg      v32.import_uniforms(&v8);
8026b8e80941Smrg      if (!v32.run_fs(allow_spilling, false)) {
8027b8e80941Smrg         compiler->shader_perf_log(log_data,
8028b8e80941Smrg                                   "SIMD32 shader failed to compile: %s",
8029b8e80941Smrg                                   v32.fail_msg);
8030b8e80941Smrg      } else {
8031b8e80941Smrg         simd32_cfg = v32.cfg;
8032b8e80941Smrg         prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
8033b8e80941Smrg         prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
8034b8e80941Smrg      }
8035b8e80941Smrg   }
8036b8e80941Smrg
8037b8e80941Smrg   /* When the caller requests a repclear shader, they want SIMD16-only */
8038b8e80941Smrg   if (use_rep_send)
8039b8e80941Smrg      simd8_cfg = NULL;
8040b8e80941Smrg
8041b8e80941Smrg   /* Prior to Iron Lake, the PS had a single shader offset with a jump table
8042b8e80941Smrg    * at the top to select the shader.  We've never implemented that.
8043b8e80941Smrg    * Instead, we just give them exactly one shader and we pick the widest one
8044b8e80941Smrg    * available.
8045b8e80941Smrg    */
8046b8e80941Smrg   if (compiler->devinfo->gen < 5) {
8047b8e80941Smrg      if (simd32_cfg || simd16_cfg)
8048b8e80941Smrg         simd8_cfg = NULL;
8049b8e80941Smrg      if (simd32_cfg)
8050b8e80941Smrg         simd16_cfg = NULL;
8051b8e80941Smrg   }
8052b8e80941Smrg
8053b8e80941Smrg   /* If computed depth is enabled SNB only allows SIMD8. */
8054b8e80941Smrg   if (compiler->devinfo->gen == 6 &&
8055b8e80941Smrg       prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF)
8056b8e80941Smrg      assert(simd16_cfg == NULL && simd32_cfg == NULL);
8057b8e80941Smrg
8058b8e80941Smrg   if (compiler->devinfo->gen <= 5 && !simd8_cfg) {
8059b8e80941Smrg      /* Iron lake and earlier only have one Dispatch GRF start field.  Make
8060b8e80941Smrg       * the data available in the base prog data struct for convenience.
8061b8e80941Smrg       */
8062b8e80941Smrg      if (simd16_cfg) {
8063b8e80941Smrg         prog_data->base.dispatch_grf_start_reg =
8064b8e80941Smrg            prog_data->dispatch_grf_start_reg_16;
8065b8e80941Smrg      } else if (simd32_cfg) {
8066b8e80941Smrg         prog_data->base.dispatch_grf_start_reg =
8067b8e80941Smrg            prog_data->dispatch_grf_start_reg_32;
8068b8e80941Smrg      }
8069b8e80941Smrg   }
8070b8e80941Smrg
8071b8e80941Smrg   if (prog_data->persample_dispatch) {
8072b8e80941Smrg      /* Starting with SandyBridge (where we first get MSAA), the different
8073b8e80941Smrg       * pixel dispatch combinations are grouped into classifications A
8074b8e80941Smrg       * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1).  On all hardware
8075b8e80941Smrg       * generations, the only configurations supporting persample dispatch
8076b8e80941Smrg       * are are this in which only one dispatch width is enabled.
8077b8e80941Smrg       */
8078b8e80941Smrg      if (simd32_cfg || simd16_cfg)
8079b8e80941Smrg         simd8_cfg = NULL;
8080b8e80941Smrg      if (simd32_cfg)
8081b8e80941Smrg         simd16_cfg = NULL;
8082b8e80941Smrg   }
8083b8e80941Smrg
8084b8e80941Smrg   /* We have to compute the flat inputs after the visitor is finished running
8085b8e80941Smrg    * because it relies on prog_data->urb_setup which is computed in
8086b8e80941Smrg    * fs_visitor::calculate_urb_setup().
8087b8e80941Smrg    */
8088b8e80941Smrg   brw_compute_flat_inputs(prog_data, shader);
8089b8e80941Smrg
8090b8e80941Smrg   fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
8091b8e80941Smrg                  v8.promoted_constants, v8.runtime_check_aads_emit,
8092b8e80941Smrg                  MESA_SHADER_FRAGMENT);
8093b8e80941Smrg
8094b8e80941Smrg   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
8095b8e80941Smrg      g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
8096b8e80941Smrg                                     shader->info.label ?
8097b8e80941Smrg                                        shader->info.label : "unnamed",
8098b8e80941Smrg                                     shader->info.name));
8099b8e80941Smrg   }
8100b8e80941Smrg
8101b8e80941Smrg   if (simd8_cfg) {
8102b8e80941Smrg      prog_data->dispatch_8 = true;
8103b8e80941Smrg      g.generate_code(simd8_cfg, 8);
8104b8e80941Smrg   }
8105b8e80941Smrg
8106b8e80941Smrg   if (simd16_cfg) {
8107b8e80941Smrg      prog_data->dispatch_16 = true;
8108b8e80941Smrg      prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
8109b8e80941Smrg   }
8110b8e80941Smrg
8111b8e80941Smrg   if (simd32_cfg) {
8112b8e80941Smrg      prog_data->dispatch_32 = true;
8113b8e80941Smrg      prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32);
8114b8e80941Smrg   }
8115b8e80941Smrg
8116b8e80941Smrg   return g.get_assembly();
8117b8e80941Smrg}
8118b8e80941Smrg
8119b8e80941Smrgfs_reg *
8120b8e80941Smrgfs_visitor::emit_cs_work_group_id_setup()
8121b8e80941Smrg{
8122b8e80941Smrg   assert(stage == MESA_SHADER_COMPUTE);
8123b8e80941Smrg
8124b8e80941Smrg   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
8125b8e80941Smrg
8126b8e80941Smrg   struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
8127b8e80941Smrg   struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
8128b8e80941Smrg   struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
8129b8e80941Smrg
8130b8e80941Smrg   bld.MOV(*reg, r0_1);
8131b8e80941Smrg   bld.MOV(offset(*reg, bld, 1), r0_6);
8132b8e80941Smrg   bld.MOV(offset(*reg, bld, 2), r0_7);
8133b8e80941Smrg
8134b8e80941Smrg   return reg;
8135b8e80941Smrg}
8136b8e80941Smrg
8137b8e80941Smrgstatic void
8138b8e80941Smrgfill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
8139b8e80941Smrg{
8140b8e80941Smrg   block->dwords = dwords;
8141b8e80941Smrg   block->regs = DIV_ROUND_UP(dwords, 8);
8142b8e80941Smrg   block->size = block->regs * 32;
8143b8e80941Smrg}
8144b8e80941Smrg
8145b8e80941Smrgstatic void
8146b8e80941Smrgcs_fill_push_const_info(const struct gen_device_info *devinfo,
8147b8e80941Smrg                        struct brw_cs_prog_data *cs_prog_data)
8148b8e80941Smrg{
8149b8e80941Smrg   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
8150b8e80941Smrg   int subgroup_id_index = get_subgroup_id_param_index(prog_data);
8151b8e80941Smrg   bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell;
8152b8e80941Smrg
8153b8e80941Smrg   /* The thread ID should be stored in the last param dword */
8154b8e80941Smrg   assert(subgroup_id_index == -1 ||
8155b8e80941Smrg          subgroup_id_index == (int)prog_data->nr_params - 1);
8156b8e80941Smrg
8157b8e80941Smrg   unsigned cross_thread_dwords, per_thread_dwords;
8158b8e80941Smrg   if (!cross_thread_supported) {
8159b8e80941Smrg      cross_thread_dwords = 0u;
8160b8e80941Smrg      per_thread_dwords = prog_data->nr_params;
8161b8e80941Smrg   } else if (subgroup_id_index >= 0) {
8162b8e80941Smrg      /* Fill all but the last register with cross-thread payload */
8163b8e80941Smrg      cross_thread_dwords = 8 * (subgroup_id_index / 8);
8164b8e80941Smrg      per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
8165b8e80941Smrg      assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
8166b8e80941Smrg   } else {
8167b8e80941Smrg      /* Fill all data using cross-thread payload */
8168b8e80941Smrg      cross_thread_dwords = prog_data->nr_params;
8169b8e80941Smrg      per_thread_dwords = 0u;
8170b8e80941Smrg   }
8171b8e80941Smrg
8172b8e80941Smrg   fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
8173b8e80941Smrg   fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
8174b8e80941Smrg
8175b8e80941Smrg   unsigned total_dwords =
8176b8e80941Smrg      (cs_prog_data->push.per_thread.size * cs_prog_data->threads +
8177b8e80941Smrg       cs_prog_data->push.cross_thread.size) / 4;
8178b8e80941Smrg   fill_push_const_block_info(&cs_prog_data->push.total, total_dwords);
8179b8e80941Smrg
8180b8e80941Smrg   assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
8181b8e80941Smrg          cs_prog_data->push.per_thread.size == 0);
8182b8e80941Smrg   assert(cs_prog_data->push.cross_thread.dwords +
8183b8e80941Smrg          cs_prog_data->push.per_thread.dwords ==
8184b8e80941Smrg             prog_data->nr_params);
8185b8e80941Smrg}
8186b8e80941Smrg
8187b8e80941Smrgstatic void
8188b8e80941Smrgcs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
8189b8e80941Smrg{
8190b8e80941Smrg   cs_prog_data->simd_size = size;
8191b8e80941Smrg   unsigned group_size = cs_prog_data->local_size[0] *
8192b8e80941Smrg      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
8193b8e80941Smrg   cs_prog_data->threads = (group_size + size - 1) / size;
8194b8e80941Smrg}
8195b8e80941Smrg
8196b8e80941Smrgstatic nir_shader *
8197b8e80941Smrgcompile_cs_to_nir(const struct brw_compiler *compiler,
8198b8e80941Smrg                  void *mem_ctx,
8199b8e80941Smrg                  const struct brw_cs_prog_key *key,
8200b8e80941Smrg                  const nir_shader *src_shader,
8201b8e80941Smrg                  unsigned dispatch_width)
8202b8e80941Smrg{
8203b8e80941Smrg   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
8204b8e80941Smrg   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
8205b8e80941Smrg
8206b8e80941Smrg   NIR_PASS_V(shader, brw_nir_lower_cs_intrinsics, dispatch_width);
8207b8e80941Smrg
8208b8e80941Smrg   /* Clean up after the local index and ID calculations. */
8209b8e80941Smrg   NIR_PASS_V(shader, nir_opt_constant_folding);
8210b8e80941Smrg   NIR_PASS_V(shader, nir_opt_dce);
8211b8e80941Smrg
8212b8e80941Smrg   return brw_postprocess_nir(shader, compiler, true);
8213b8e80941Smrg}
8214b8e80941Smrg
8215b8e80941Smrgconst unsigned *
8216b8e80941Smrgbrw_compile_cs(const struct brw_compiler *compiler, void *log_data,
8217b8e80941Smrg               void *mem_ctx,
8218b8e80941Smrg               const struct brw_cs_prog_key *key,
8219b8e80941Smrg               struct brw_cs_prog_data *prog_data,
8220b8e80941Smrg               const nir_shader *src_shader,
8221b8e80941Smrg               int shader_time_index,
8222b8e80941Smrg               char **error_str)
8223b8e80941Smrg{
8224b8e80941Smrg   prog_data->local_size[0] = src_shader->info.cs.local_size[0];
8225b8e80941Smrg   prog_data->local_size[1] = src_shader->info.cs.local_size[1];
8226b8e80941Smrg   prog_data->local_size[2] = src_shader->info.cs.local_size[2];
8227b8e80941Smrg   unsigned local_workgroup_size =
8228b8e80941Smrg      src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
8229b8e80941Smrg      src_shader->info.cs.local_size[2];
8230b8e80941Smrg
8231b8e80941Smrg   unsigned min_dispatch_width =
8232b8e80941Smrg      DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
8233b8e80941Smrg   min_dispatch_width = MAX2(8, min_dispatch_width);
8234b8e80941Smrg   min_dispatch_width = util_next_power_of_two(min_dispatch_width);
8235b8e80941Smrg   assert(min_dispatch_width <= 32);
8236b8e80941Smrg
8237b8e80941Smrg   fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
8238b8e80941Smrg   cfg_t *cfg = NULL;
8239b8e80941Smrg   const char *fail_msg = NULL;
8240b8e80941Smrg   unsigned promoted_constants = 0;
8241b8e80941Smrg
8242b8e80941Smrg   /* Now the main event: Visit the shader IR and generate our CS IR for it.
8243b8e80941Smrg    */
8244b8e80941Smrg   if (min_dispatch_width <= 8) {
8245b8e80941Smrg      nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
8246b8e80941Smrg                                           src_shader, 8);
8247b8e80941Smrg      v8 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
8248b8e80941Smrg                          NULL, /* Never used in core profile */
8249b8e80941Smrg                          nir8, 8, shader_time_index);
8250b8e80941Smrg      if (!v8->run_cs(min_dispatch_width)) {
8251b8e80941Smrg         fail_msg = v8->fail_msg;
8252b8e80941Smrg      } else {
8253b8e80941Smrg         /* We should always be able to do SIMD32 for compute shaders */
8254b8e80941Smrg         assert(v8->max_dispatch_width >= 32);
8255b8e80941Smrg
8256b8e80941Smrg         cfg = v8->cfg;
8257b8e80941Smrg         cs_set_simd_size(prog_data, 8);
8258b8e80941Smrg         cs_fill_push_const_info(compiler->devinfo, prog_data);
8259b8e80941Smrg         promoted_constants = v8->promoted_constants;
8260b8e80941Smrg      }
8261b8e80941Smrg   }
8262b8e80941Smrg
8263b8e80941Smrg   if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
8264b8e80941Smrg       !fail_msg && min_dispatch_width <= 16) {
8265b8e80941Smrg      /* Try a SIMD16 compile */
8266b8e80941Smrg      nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
8267b8e80941Smrg                                            src_shader, 16);
8268b8e80941Smrg      v16 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
8269b8e80941Smrg                           NULL, /* Never used in core profile */
8270b8e80941Smrg                           nir16, 16, shader_time_index);
8271b8e80941Smrg      if (v8)
8272b8e80941Smrg         v16->import_uniforms(v8);
8273b8e80941Smrg
8274b8e80941Smrg      if (!v16->run_cs(min_dispatch_width)) {
8275b8e80941Smrg         compiler->shader_perf_log(log_data,
8276b8e80941Smrg                                   "SIMD16 shader failed to compile: %s",
8277b8e80941Smrg                                   v16->fail_msg);
8278b8e80941Smrg         if (!cfg) {
8279b8e80941Smrg            fail_msg =
8280b8e80941Smrg               "Couldn't generate SIMD16 program and not "
8281b8e80941Smrg               "enough threads for SIMD8";
8282b8e80941Smrg         }
8283b8e80941Smrg      } else {
8284b8e80941Smrg         /* We should always be able to do SIMD32 for compute shaders */
8285b8e80941Smrg         assert(v16->max_dispatch_width >= 32);
8286b8e80941Smrg
8287b8e80941Smrg         cfg = v16->cfg;
8288b8e80941Smrg         cs_set_simd_size(prog_data, 16);
8289b8e80941Smrg         cs_fill_push_const_info(compiler->devinfo, prog_data);
8290b8e80941Smrg         promoted_constants = v16->promoted_constants;
8291b8e80941Smrg      }
8292b8e80941Smrg   }
8293b8e80941Smrg
8294b8e80941Smrg   /* We should always be able to do SIMD32 for compute shaders */
8295b8e80941Smrg   assert(!v16 || v16->max_dispatch_width >= 32);
8296b8e80941Smrg
8297b8e80941Smrg   if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
8298b8e80941Smrg      /* Try a SIMD32 compile */
8299b8e80941Smrg      nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,
8300b8e80941Smrg                                            src_shader, 32);
8301b8e80941Smrg      v32 = new fs_visitor(compiler, log_data, mem_ctx, key, &prog_data->base,
8302b8e80941Smrg                           NULL, /* Never used in core profile */
8303b8e80941Smrg                           nir32, 32, shader_time_index);
8304b8e80941Smrg      if (v8)
8305b8e80941Smrg         v32->import_uniforms(v8);
8306b8e80941Smrg      else if (v16)
8307b8e80941Smrg         v32->import_uniforms(v16);
8308b8e80941Smrg
8309b8e80941Smrg      if (!v32->run_cs(min_dispatch_width)) {
8310b8e80941Smrg         compiler->shader_perf_log(log_data,
8311b8e80941Smrg                                   "SIMD32 shader failed to compile: %s",
8312b8e80941Smrg                                   v32->fail_msg);
8313b8e80941Smrg         if (!cfg) {
8314b8e80941Smrg            fail_msg =
8315b8e80941Smrg               "Couldn't generate SIMD32 program and not "
8316b8e80941Smrg               "enough threads for SIMD16";
8317b8e80941Smrg         }
8318b8e80941Smrg      } else {
8319b8e80941Smrg         cfg = v32->cfg;
8320b8e80941Smrg         cs_set_simd_size(prog_data, 32);
8321b8e80941Smrg         cs_fill_push_const_info(compiler->devinfo, prog_data);
8322b8e80941Smrg         promoted_constants = v32->promoted_constants;
8323b8e80941Smrg      }
8324b8e80941Smrg   }
8325b8e80941Smrg
8326b8e80941Smrg   const unsigned *ret = NULL;
8327b8e80941Smrg   if (unlikely(cfg == NULL)) {
8328b8e80941Smrg      assert(fail_msg);
8329b8e80941Smrg      if (error_str)
8330b8e80941Smrg         *error_str = ralloc_strdup(mem_ctx, fail_msg);
8331b8e80941Smrg   } else {
8332b8e80941Smrg      fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
8333b8e80941Smrg                     promoted_constants, false, MESA_SHADER_COMPUTE);
8334b8e80941Smrg      if (INTEL_DEBUG & DEBUG_CS) {
8335b8e80941Smrg         char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
8336b8e80941Smrg                                      src_shader->info.label ?
8337b8e80941Smrg                                         src_shader->info.label : "unnamed",
8338b8e80941Smrg                                      src_shader->info.name);
8339b8e80941Smrg         g.enable_debug(name);
8340b8e80941Smrg      }
8341b8e80941Smrg
8342b8e80941Smrg      g.generate_code(cfg, prog_data->simd_size);
8343b8e80941Smrg
8344b8e80941Smrg      ret = g.get_assembly();
8345b8e80941Smrg   }
8346b8e80941Smrg
8347b8e80941Smrg   delete v8;
8348b8e80941Smrg   delete v16;
8349b8e80941Smrg   delete v32;
8350b8e80941Smrg
8351b8e80941Smrg   return ret;
8352b8e80941Smrg}
8353b8e80941Smrg
8354b8e80941Smrg/**
8355b8e80941Smrg * Test the dispatch mask packing assumptions of
8356b8e80941Smrg * brw_stage_has_packed_dispatch().  Call this from e.g. the top of
8357b8e80941Smrg * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
8358b8e80941Smrg * executed with an unexpected dispatch mask.
8359b8e80941Smrg */
8360b8e80941Smrgstatic UNUSED void
8361b8e80941Smrgbrw_fs_test_dispatch_packing(const fs_builder &bld)
8362b8e80941Smrg{
8363b8e80941Smrg   const gl_shader_stage stage = bld.shader->stage;
8364b8e80941Smrg
8365b8e80941Smrg   if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
8366b8e80941Smrg                                     bld.shader->stage_prog_data)) {
8367b8e80941Smrg      const fs_builder ubld = bld.exec_all().group(1, 0);
8368b8e80941Smrg      const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
8369b8e80941Smrg      const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
8370b8e80941Smrg                           brw_dmask_reg());
8371b8e80941Smrg
8372b8e80941Smrg      ubld.ADD(tmp, mask, brw_imm_ud(1));
8373b8e80941Smrg      ubld.AND(tmp, mask, tmp);
8374b8e80941Smrg
8375b8e80941Smrg      /* This will loop forever if the dispatch mask doesn't have the expected
8376b8e80941Smrg       * form '2^n-1', in which case tmp will be non-zero.
8377b8e80941Smrg       */
8378b8e80941Smrg      bld.emit(BRW_OPCODE_DO);
8379b8e80941Smrg      bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
8380b8e80941Smrg      set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
8381b8e80941Smrg   }
8382b8e80941Smrg}
8383