brw_fs.cpp revision 7ec681f3
1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24/** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31#include "main/macros.h"
32#include "brw_eu.h"
33#include "brw_fs.h"
34#include "brw_fs_live_variables.h"
35#include "brw_nir.h"
36#include "brw_vec4_gs_visitor.h"
37#include "brw_cfg.h"
38#include "brw_dead_control_flow.h"
39#include "dev/intel_debug.h"
40#include "compiler/glsl_types.h"
41#include "compiler/nir/nir_builder.h"
42#include "program/prog_parameter.h"
43#include "util/u_math.h"
44
45using namespace brw;
46
47static unsigned get_lowered_simd_width(const struct intel_device_info *devinfo,
48                                       const fs_inst *inst);
49
50void
51fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
52              const fs_reg *src, unsigned sources)
53{
54   memset((void*)this, 0, sizeof(*this));
55
56   this->src = new fs_reg[MAX2(sources, 3)];
57   for (unsigned i = 0; i < sources; i++)
58      this->src[i] = src[i];
59
60   this->opcode = opcode;
61   this->dst = dst;
62   this->sources = sources;
63   this->exec_size = exec_size;
64   this->base_mrf = -1;
65
66   assert(dst.file != IMM && dst.file != UNIFORM);
67
68   assert(this->exec_size != 0);
69
70   this->conditional_mod = BRW_CONDITIONAL_NONE;
71
72   /* This will be the case for almost all instructions. */
73   switch (dst.file) {
74   case VGRF:
75   case ARF:
76   case FIXED_GRF:
77   case MRF:
78   case ATTR:
79      this->size_written = dst.component_size(exec_size);
80      break;
81   case BAD_FILE:
82      this->size_written = 0;
83      break;
84   case IMM:
85   case UNIFORM:
86      unreachable("Invalid destination register file");
87   }
88
89   this->writes_accumulator = false;
90}
91
92fs_inst::fs_inst()
93{
94   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
95}
96
97fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
98{
99   init(opcode, exec_size, reg_undef, NULL, 0);
100}
101
102fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
103{
104   init(opcode, exec_size, dst, NULL, 0);
105}
106
107fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
108                 const fs_reg &src0)
109{
110   const fs_reg src[1] = { src0 };
111   init(opcode, exec_size, dst, src, 1);
112}
113
114fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
115                 const fs_reg &src0, const fs_reg &src1)
116{
117   const fs_reg src[2] = { src0, src1 };
118   init(opcode, exec_size, dst, src, 2);
119}
120
121fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
122                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
123{
124   const fs_reg src[3] = { src0, src1, src2 };
125   init(opcode, exec_size, dst, src, 3);
126}
127
128fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
129                 const fs_reg src[], unsigned sources)
130{
131   init(opcode, exec_width, dst, src, sources);
132}
133
134fs_inst::fs_inst(const fs_inst &that)
135{
136   memcpy((void*)this, &that, sizeof(that));
137
138   this->src = new fs_reg[MAX2(that.sources, 3)];
139
140   for (unsigned i = 0; i < that.sources; i++)
141      this->src[i] = that.src[i];
142}
143
144fs_inst::~fs_inst()
145{
146   delete[] this->src;
147}
148
149void
150fs_inst::resize_sources(uint8_t num_sources)
151{
152   if (this->sources != num_sources) {
153      fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
154
155      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
156         src[i] = this->src[i];
157
158      delete[] this->src;
159      this->src = src;
160      this->sources = num_sources;
161   }
162}
163
164void
165fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
166                                       const fs_reg &dst,
167                                       const fs_reg &surf_index,
168                                       const fs_reg &varying_offset,
169                                       uint32_t const_offset,
170                                       uint8_t alignment)
171{
172   /* We have our constant surface use a pitch of 4 bytes, so our index can
173    * be any component of a vector, and then we load 4 contiguous
174    * components starting from that.
175    *
176    * We break down the const_offset to a portion added to the variable offset
177    * and a portion done using fs_reg::offset, which means that if you have
178    * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
179    * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
180    * later notice that those loads are all the same and eliminate the
181    * redundant ones.
182    */
183   fs_reg vec4_offset = vgrf(glsl_type::uint_type);
184   bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
185
186   /* The pull load message will load a vec4 (16 bytes). If we are loading
187    * a double this means we are only loading 2 elements worth of data.
188    * We also want to use a 32-bit data type for the dst of the load operation
189    * so other parts of the driver don't get confused about the size of the
190    * result.
191    */
192   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
193   fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
194                            vec4_result, surf_index, vec4_offset,
195                            brw_imm_ud(alignment));
196   inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
197
198   shuffle_from_32bit_read(bld, dst, vec4_result,
199                           (const_offset & 0xf) / type_sz(dst.type), 1);
200}
201
202/**
203 * A helper for MOV generation for fixing up broken hardware SEND dependency
204 * handling.
205 */
206void
207fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
208{
209   /* The caller always wants uncompressed to emit the minimal extra
210    * dependencies, and to avoid having to deal with aligning its regs to 2.
211    */
212   const fs_builder ubld = bld.annotate("send dependency resolve")
213                              .quarter(0);
214
215   ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
216}
217
218bool
219fs_inst::is_send_from_grf() const
220{
221   switch (opcode) {
222   case SHADER_OPCODE_SEND:
223   case SHADER_OPCODE_SHADER_TIME_ADD:
224   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
225   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
226   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
227   case SHADER_OPCODE_URB_WRITE_SIMD8:
228   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
229   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
230   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
231   case SHADER_OPCODE_URB_READ_SIMD8:
232   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
233   case SHADER_OPCODE_INTERLOCK:
234   case SHADER_OPCODE_MEMORY_FENCE:
235   case SHADER_OPCODE_BARRIER:
236      return true;
237   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
238      return src[1].file == VGRF;
239   case FS_OPCODE_FB_WRITE:
240   case FS_OPCODE_FB_READ:
241      return src[0].file == VGRF;
242   default:
243      if (is_tex())
244         return src[0].file == VGRF;
245
246      return false;
247   }
248}
249
250bool
251fs_inst::is_control_source(unsigned arg) const
252{
253   switch (opcode) {
254   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
255   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
256   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
257      return arg == 0;
258
259   case SHADER_OPCODE_BROADCAST:
260   case SHADER_OPCODE_SHUFFLE:
261   case SHADER_OPCODE_QUAD_SWIZZLE:
262   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
263   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
264   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
265   case SHADER_OPCODE_GET_BUFFER_SIZE:
266      return arg == 1;
267
268   case SHADER_OPCODE_MOV_INDIRECT:
269   case SHADER_OPCODE_CLUSTER_BROADCAST:
270   case SHADER_OPCODE_TEX:
271   case FS_OPCODE_TXB:
272   case SHADER_OPCODE_TXD:
273   case SHADER_OPCODE_TXF:
274   case SHADER_OPCODE_TXF_LZ:
275   case SHADER_OPCODE_TXF_CMS:
276   case SHADER_OPCODE_TXF_CMS_W:
277   case SHADER_OPCODE_TXF_UMS:
278   case SHADER_OPCODE_TXF_MCS:
279   case SHADER_OPCODE_TXL:
280   case SHADER_OPCODE_TXL_LZ:
281   case SHADER_OPCODE_TXS:
282   case SHADER_OPCODE_LOD:
283   case SHADER_OPCODE_TG4:
284   case SHADER_OPCODE_TG4_OFFSET:
285   case SHADER_OPCODE_SAMPLEINFO:
286      return arg == 1 || arg == 2;
287
288   case SHADER_OPCODE_SEND:
289      return arg == 0 || arg == 1;
290
291   default:
292      return false;
293   }
294}
295
296bool
297fs_inst::is_payload(unsigned arg) const
298{
299   switch (opcode) {
300   case FS_OPCODE_FB_WRITE:
301   case FS_OPCODE_FB_READ:
302   case SHADER_OPCODE_URB_WRITE_SIMD8:
303   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
304   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
305   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
306   case SHADER_OPCODE_URB_READ_SIMD8:
307   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
308   case VEC4_OPCODE_UNTYPED_ATOMIC:
309   case VEC4_OPCODE_UNTYPED_SURFACE_READ:
310   case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
311   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
312   case SHADER_OPCODE_SHADER_TIME_ADD:
313   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
314   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
315   case SHADER_OPCODE_INTERLOCK:
316   case SHADER_OPCODE_MEMORY_FENCE:
317   case SHADER_OPCODE_BARRIER:
318      return arg == 0;
319
320   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
321      return arg == 1;
322
323   case SHADER_OPCODE_SEND:
324      return arg == 2 || arg == 3;
325
326   default:
327      if (is_tex())
328         return arg == 0;
329      else
330         return false;
331   }
332}
333
334/**
335 * Returns true if this instruction's sources and destinations cannot
336 * safely be the same register.
337 *
338 * In most cases, a register can be written over safely by the same
339 * instruction that is its last use.  For a single instruction, the
340 * sources are dereferenced before writing of the destination starts
341 * (naturally).
342 *
343 * However, there are a few cases where this can be problematic:
344 *
345 * - Virtual opcodes that translate to multiple instructions in the
346 *   code generator: if src == dst and one instruction writes the
347 *   destination before a later instruction reads the source, then
348 *   src will have been clobbered.
349 *
350 * - SIMD16 compressed instructions with certain regioning (see below).
351 *
352 * The register allocator uses this information to set up conflicts between
353 * GRF sources and the destination.
354 */
355bool
356fs_inst::has_source_and_destination_hazard() const
357{
358   switch (opcode) {
359   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
360      /* Multiple partial writes to the destination */
361      return true;
362   case SHADER_OPCODE_SHUFFLE:
363      /* This instruction returns an arbitrary channel from the source and
364       * gets split into smaller instructions in the generator.  It's possible
365       * that one of the instructions will read from a channel corresponding
366       * to an earlier instruction.
367       */
368   case SHADER_OPCODE_SEL_EXEC:
369      /* This is implemented as
370       *
371       * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
372       * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
373       *
374       * Because the source is only read in the second instruction, the first
375       * may stomp all over it.
376       */
377      return true;
378   case SHADER_OPCODE_QUAD_SWIZZLE:
379      switch (src[1].ud) {
380      case BRW_SWIZZLE_XXXX:
381      case BRW_SWIZZLE_YYYY:
382      case BRW_SWIZZLE_ZZZZ:
383      case BRW_SWIZZLE_WWWW:
384      case BRW_SWIZZLE_XXZZ:
385      case BRW_SWIZZLE_YYWW:
386      case BRW_SWIZZLE_XYXY:
387      case BRW_SWIZZLE_ZWZW:
388         /* These can be implemented as a single Align1 region on all
389          * platforms, so there's never a hazard between source and
390          * destination.  C.f. fs_generator::generate_quad_swizzle().
391          */
392         return false;
393      default:
394         return !is_uniform(src[0]);
395      }
396   default:
397      /* The SIMD16 compressed instruction
398       *
399       * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
400       *
401       * is actually decoded in hardware as:
402       *
403       * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
404       * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
405       *
406       * Which is safe.  However, if we have uniform accesses
407       * happening, we get into trouble:
408       *
409       * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
410       * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
411       *
412       * Now our destination for the first instruction overwrote the
413       * second instruction's src0, and we get garbage for those 8
414       * pixels.  There's a similar issue for the pre-gfx6
415       * pixel_x/pixel_y, which are registers of 16-bit values and thus
416       * would get stomped by the first decode as well.
417       */
418      if (exec_size == 16) {
419         for (int i = 0; i < sources; i++) {
420            if (src[i].file == VGRF && (src[i].stride == 0 ||
421                                        src[i].type == BRW_REGISTER_TYPE_UW ||
422                                        src[i].type == BRW_REGISTER_TYPE_W ||
423                                        src[i].type == BRW_REGISTER_TYPE_UB ||
424                                        src[i].type == BRW_REGISTER_TYPE_B)) {
425               return true;
426            }
427         }
428      }
429      return false;
430   }
431}
432
433bool
434fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
435{
436   if (devinfo->ver == 6 && is_math())
437      return false;
438
439   if (is_send_from_grf())
440      return false;
441
442   /* From Wa_1604601757:
443    *
444    * "When multiplying a DW and any lower precision integer, source modifier
445    *  is not supported."
446    */
447   if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
448                              opcode == BRW_OPCODE_MAD)) {
449      const brw_reg_type exec_type = get_exec_type(this);
450      const unsigned min_type_sz = opcode == BRW_OPCODE_MAD ?
451         MIN2(type_sz(src[1].type), type_sz(src[2].type)) :
452         MIN2(type_sz(src[0].type), type_sz(src[1].type));
453
454      if (brw_reg_type_is_integer(exec_type) &&
455          type_sz(exec_type) >= 4 &&
456          type_sz(exec_type) != min_type_sz)
457         return false;
458   }
459
460   if (!backend_instruction::can_do_source_mods())
461      return false;
462
463   return true;
464}
465
466bool
467fs_inst::can_do_cmod()
468{
469   if (!backend_instruction::can_do_cmod())
470      return false;
471
472   /* The accumulator result appears to get used for the conditional modifier
473    * generation.  When negating a UD value, there is a 33rd bit generated for
474    * the sign in the accumulator value, so now you can't check, for example,
475    * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
476    */
477   for (unsigned i = 0; i < sources; i++) {
478      if (brw_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
479         return false;
480   }
481
482   return true;
483}
484
485bool
486fs_inst::can_change_types() const
487{
488   return dst.type == src[0].type &&
489          !src[0].abs && !src[0].negate && !saturate &&
490          (opcode == BRW_OPCODE_MOV ||
491           (opcode == BRW_OPCODE_SEL &&
492            dst.type == src[1].type &&
493            predicate != BRW_PREDICATE_NONE &&
494            !src[1].abs && !src[1].negate));
495}
496
497void
498fs_reg::init()
499{
500   memset((void*)this, 0, sizeof(*this));
501   type = BRW_REGISTER_TYPE_UD;
502   stride = 1;
503}
504
505/** Generic unset register constructor. */
506fs_reg::fs_reg()
507{
508   init();
509   this->file = BAD_FILE;
510}
511
512fs_reg::fs_reg(struct ::brw_reg reg) :
513   backend_reg(reg)
514{
515   this->offset = 0;
516   this->stride = 1;
517   if (this->file == IMM &&
518       (this->type != BRW_REGISTER_TYPE_V &&
519        this->type != BRW_REGISTER_TYPE_UV &&
520        this->type != BRW_REGISTER_TYPE_VF)) {
521      this->stride = 0;
522   }
523}
524
525bool
526fs_reg::equals(const fs_reg &r) const
527{
528   return (this->backend_reg::equals(r) &&
529           stride == r.stride);
530}
531
532bool
533fs_reg::negative_equals(const fs_reg &r) const
534{
535   return (this->backend_reg::negative_equals(r) &&
536           stride == r.stride);
537}
538
539bool
540fs_reg::is_contiguous() const
541{
542   switch (file) {
543   case ARF:
544   case FIXED_GRF:
545      return hstride == BRW_HORIZONTAL_STRIDE_1 &&
546             vstride == width + hstride;
547   case MRF:
548   case VGRF:
549   case ATTR:
550      return stride == 1;
551   case UNIFORM:
552   case IMM:
553   case BAD_FILE:
554      return true;
555   }
556
557   unreachable("Invalid register file");
558}
559
560unsigned
561fs_reg::component_size(unsigned width) const
562{
563   const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
564                            hstride == 0 ? 0 :
565                            1 << (hstride - 1));
566   return MAX2(width * stride, 1) * type_sz(type);
567}
568
569/**
570 * Create a MOV to read the timestamp register.
571 */
572fs_reg
573fs_visitor::get_timestamp(const fs_builder &bld)
574{
575   assert(devinfo->ver >= 7);
576
577   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
578                                          BRW_ARF_TIMESTAMP,
579                                          0),
580                             BRW_REGISTER_TYPE_UD));
581
582   fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
583
584   /* We want to read the 3 fields we care about even if it's not enabled in
585    * the dispatch.
586    */
587   bld.group(4, 0).exec_all().MOV(dst, ts);
588
589   return dst;
590}
591
592void
593fs_visitor::emit_shader_time_begin()
594{
595   /* We want only the low 32 bits of the timestamp.  Since it's running
596    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
597    * which is plenty of time for our purposes.  It is identical across the
598    * EUs, but since it's tracking GPU core speed it will increment at a
599    * varying rate as render P-states change.
600    */
601   shader_start_time = component(
602      get_timestamp(bld.annotate("shader time start")), 0);
603}
604
/**
 * Emits the end-of-shader timing code just before the final EOT instruction.
 *
 * Reads the timestamp again and, depending on the flag computed from bit 0
 * of timestamp component 2, either accumulates the elapsed time (minus a
 * fixed 2-cycle overhead) into subindex 0 and adds 1 to subindex 1, or adds
 * 1 to subindex 2.
 */
void
fs_visitor::emit_shader_time_end()
{
   /* Insert our code just before the final SEND with EOT. */
   exec_node *end = this->instructions.get_tail();
   assert(end && ((fs_inst *) end)->eot);
   const fs_builder ibld = bld.annotate("shader time end")
                              .exec_all().at(NULL, end);
   const fs_reg timestamp = get_timestamp(ibld);

   /* We only use the low 32 bits of the timestamp - see
    * emit_shader_time_begin().
    *
    * We could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   const fs_reg shader_end_time = component(timestamp, 0);

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   const fs_reg reset = component(timestamp, 2);
   set_condmod(BRW_CONDITIONAL_Z,
               ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
   ibld.IF(BRW_PREDICATE_NORMAL);

   /* diff = shader_end_time - shader_start_time, computed as an ADD with a
    * negated start value on a single channel.
    */
   fs_reg start = shader_start_time;
   start.negate = true;
   const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
                                        BRW_REGISTER_TYPE_UD),
                                 0);
   const fs_builder cbld = ibld.group(1, 0);
   cbld.group(1, 0).ADD(diff, start, shader_end_time);

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   cbld.ADD(diff, diff, brw_imm_ud(-2u));
   /* Subindex 0 receives the elapsed time, subindex 1 is bumped by one. */
   SHADER_TIME_ADD(cbld, 0, diff);
   SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
   ibld.emit(BRW_OPCODE_ELSE);
   /* Otherwise only subindex 2 is bumped by one. */
   SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
   ibld.emit(BRW_OPCODE_ENDIF);
}
651
652void
653fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
654                            int shader_time_subindex,
655                            fs_reg value)
656{
657   int index = shader_time_index * 3 + shader_time_subindex;
658   struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE);
659
660   fs_reg payload;
661   if (dispatch_width == 8)
662      payload = vgrf(glsl_type::uvec2_type);
663   else
664      payload = vgrf(glsl_type::uint_type);
665
666   bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
667}
668
669void
670fs_visitor::vfail(const char *format, va_list va)
671{
672   char *msg;
673
674   if (failed)
675      return;
676
677   failed = true;
678
679   msg = ralloc_vasprintf(mem_ctx, format, va);
680   msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
681         dispatch_width, stage_abbrev, msg);
682
683   this->fail_msg = msg;
684
685   if (unlikely(debug_enabled)) {
686      fprintf(stderr, "%s",  msg);
687   }
688}
689
690void
691fs_visitor::fail(const char *format, ...)
692{
693   va_list va;
694
695   va_start(va, format);
696   vfail(format, va);
697   va_end(va);
698}
699
700/**
701 * Mark this program as impossible to compile with dispatch width greater
702 * than n.
703 *
704 * During the SIMD8 compile (which happens first), we can detect and flag
705 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
706 * SIMD16+ compile altogether.
707 *
708 * During a compile of dispatch width greater than n (if one happens anyway),
709 * this just calls fail().
710 */
711void
712fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
713{
714   if (dispatch_width > n) {
715      fail("%s", msg);
716   } else {
717      max_dispatch_width = MIN2(max_dispatch_width, n);
718      brw_shader_perf_log(compiler, log_data,
719                          "Shader dispatch width limited to SIMD%d: %s\n",
720                          n, msg);
721   }
722}
723
724/**
725 * Returns true if the instruction has a flag that means it won't
726 * update an entire destination register.
727 *
728 * For example, dead code elimination and live variable analysis want to know
729 * when a write to a variable screens off any preceding values that were in
730 * it.
731 */
732bool
733fs_inst::is_partial_write() const
734{
735   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
736           (this->exec_size * type_sz(this->dst.type)) < 32 ||
737           !this->dst.is_contiguous() ||
738           this->dst.offset % REG_SIZE != 0);
739}
740
/**
 * Returns the number of components read from source number \p i.
 *
 * The answer is zero for a source that is not present (BAD_FILE), follows
 * the per-opcode rules below for the listed opcodes, and is one for every
 * other opcode.  Several logical opcodes take the count from an immediate
 * source of the instruction, which is asserted to be an IMM.
 */
unsigned
fs_inst::components_read(unsigned i) const
{
   /* Return zero if the source is not present. */
   if (src[i].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case FS_OPCODE_LINTERP:
      if (i == 0)
         return 2;
      else
         return 1;

   case FS_OPCODE_PIXEL_X:
   case FS_OPCODE_PIXEL_Y:
      assert(i < 2);
      if (i == 0)
         return 2;
      else
         return 1;

   case FS_OPCODE_FB_WRITE_LOGICAL:
      assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
      /* First/second FB write color. */
      if (i < 2)
         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
      else
         return 1;

   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_UMS_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
      assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
      /* Texture coordinates. */
      if (i == TEX_LOGICAL_SRC_COORDINATE)
         return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
      /* Texture derivatives. */
      else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
               opcode == SHADER_OPCODE_TXD_LOGICAL)
         return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
      /* Texture offset. */
      else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
         return 2;
      /* MCS */
      else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
         return 2;
      else
         return 1;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source (ignored for reads). */
      else if (i == SURFACE_LOGICAL_SRC_DATA)
         return 0;
      else
         return 1;

   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source. */
      else if (i == SURFACE_LOGICAL_SRC_DATA)
         return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
      else
         return 1;

   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      assert(src[2].file == IMM);
      return 1;

   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      assert(src[2].file == IMM);
      if (i == 1) { /* data to write */
         /* The immediate in src[2] divided by the execution size gives the
          * component count; it must come out non-zero.
          */
         const unsigned comps = src[2].ud / exec_size;
         assert(comps > 0);
         return comps;
      } else {
         return 1;
      }

   case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL:
   case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      return 1;

   case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      if (i == SURFACE_LOGICAL_SRC_DATA) {
         /* Same computation as the A64 block write above, with the
          * immediate taken from SURFACE_LOGICAL_SRC_IMM_ARG.
          */
         const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
         assert(comps > 0);
         return comps;
      } else {
         return 1;
      }

   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      assert(src[2].file == IMM);
      /* Source 1 reads as many components as the immediate in src[2] says. */
      return i == 1 ? src[2].ud : 1;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
      assert(src[2].file == IMM);
      if (i == 1) {
         /* Data source */
         const unsigned op = src[2].ud;
         switch (op) {
         /* These atomic operations read no data components. */
         case BRW_AOP_INC:
         case BRW_AOP_DEC:
         case BRW_AOP_PREDEC:
            return 0;
         /* Compare-and-write reads two data components. */
         case BRW_AOP_CMPWR:
            return 2;
         default:
            return 1;
         }
      } else {
         return 1;
      }

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL:
      assert(src[2].file == IMM);
      if (i == 1) {
         /* Data source */
         const unsigned op = src[2].ud;
         return op == BRW_AOP_FCMPWR ? 2 : 1;
      } else {
         return 1;
      }

   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      /* Scattered logical opcodes use the following params:
       * src[0] Surface coordinates
       * src[1] Surface operation source (ignored for reads)
       * src[2] Surface
       * src[3] IMM with always 1 dimension.
       * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32
       */
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;

   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      return 1;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source. */
      else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_CMPWR)
         return 2;
      else if (i == SURFACE_LOGICAL_SRC_DATA &&
               (op == BRW_AOP_INC || op == BRW_AOP_DEC || op == BRW_AOP_PREDEC))
         return 0;
      else
         return 1;
   }
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return (i == 0 ? 2 : 1);

   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source. */
      else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_FCMPWR)
         return 2;
      else
         return 1;
   }

   default:
      return 1;
   }
}
954
955unsigned
956fs_inst::size_read(int arg) const
957{
958   switch (opcode) {
959   case SHADER_OPCODE_SEND:
960      if (arg == 2) {
961         return mlen * REG_SIZE;
962      } else if (arg == 3) {
963         return ex_mlen * REG_SIZE;
964      }
965      break;
966
967   case FS_OPCODE_FB_WRITE:
968   case FS_OPCODE_REP_FB_WRITE:
969      if (arg == 0) {
970         if (base_mrf >= 0)
971            return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
972         else
973            return mlen * REG_SIZE;
974      }
975      break;
976
977   case FS_OPCODE_FB_READ:
978   case SHADER_OPCODE_URB_WRITE_SIMD8:
979   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
980   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
981   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
982   case SHADER_OPCODE_URB_READ_SIMD8:
983   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
984   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
985   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
986      if (arg == 0)
987         return mlen * REG_SIZE;
988      break;
989
990   case FS_OPCODE_SET_SAMPLE_ID:
991      if (arg == 1)
992         return 1;
993      break;
994
995   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7:
996      /* The payload is actually stored in src1 */
997      if (arg == 1)
998         return mlen * REG_SIZE;
999      break;
1000
1001   case FS_OPCODE_LINTERP:
1002      if (arg == 1)
1003         return 16;
1004      break;
1005
1006   case SHADER_OPCODE_LOAD_PAYLOAD:
1007      if (arg < this->header_size)
1008         return REG_SIZE;
1009      break;
1010
1011   case CS_OPCODE_CS_TERMINATE:
1012   case SHADER_OPCODE_BARRIER:
1013      return REG_SIZE;
1014
1015   case SHADER_OPCODE_MOV_INDIRECT:
1016      if (arg == 0) {
1017         assert(src[2].file == IMM);
1018         return src[2].ud;
1019      }
1020      break;
1021
1022   default:
1023      if (is_tex() && arg == 0 && src[0].file == VGRF)
1024         return mlen * REG_SIZE;
1025      break;
1026   }
1027
1028   switch (src[arg].file) {
1029   case UNIFORM:
1030   case IMM:
1031      return components_read(arg) * type_sz(src[arg].type);
1032   case BAD_FILE:
1033   case ARF:
1034   case FIXED_GRF:
1035   case VGRF:
1036   case ATTR:
1037      return components_read(arg) * src[arg].component_size(exec_size);
1038   case MRF:
1039      unreachable("MRF registers are not allowed as sources");
1040   }
1041   return 0;
1042}
1043
1044namespace {
1045   unsigned
1046   predicate_width(brw_predicate predicate)
1047   {
1048      switch (predicate) {
1049      case BRW_PREDICATE_NONE:            return 1;
1050      case BRW_PREDICATE_NORMAL:          return 1;
1051      case BRW_PREDICATE_ALIGN1_ANY2H:    return 2;
1052      case BRW_PREDICATE_ALIGN1_ALL2H:    return 2;
1053      case BRW_PREDICATE_ALIGN1_ANY4H:    return 4;
1054      case BRW_PREDICATE_ALIGN1_ALL4H:    return 4;
1055      case BRW_PREDICATE_ALIGN1_ANY8H:    return 8;
1056      case BRW_PREDICATE_ALIGN1_ALL8H:    return 8;
1057      case BRW_PREDICATE_ALIGN1_ANY16H:   return 16;
1058      case BRW_PREDICATE_ALIGN1_ALL16H:   return 16;
1059      case BRW_PREDICATE_ALIGN1_ANY32H:   return 32;
1060      case BRW_PREDICATE_ALIGN1_ALL32H:   return 32;
1061      default: unreachable("Unsupported predicate");
1062      }
1063   }
1064
1065   /* Return the subset of flag registers that an instruction could
1066    * potentially read or write based on the execution controls and flag
1067    * subregister number of the instruction.
1068    */
1069   unsigned
1070   flag_mask(const fs_inst *inst, unsigned width)
1071   {
1072      assert(util_is_power_of_two_nonzero(width));
1073      const unsigned start = (inst->flag_subreg * 16 + inst->group) &
1074                             ~(width - 1);
1075      const unsigned end = start + ALIGN(inst->exec_size, width);
1076      return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
1077   }
1078
1079   unsigned
1080   bit_mask(unsigned n)
1081   {
1082      return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
1083   }
1084
1085   unsigned
1086   flag_mask(const fs_reg &r, unsigned sz)
1087   {
1088      if (r.file == ARF) {
1089         const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
1090         const unsigned end = start + sz;
1091         return bit_mask(end) & ~bit_mask(start);
1092      } else {
1093         return 0;
1094      }
1095   }
1096}
1097
1098unsigned
1099fs_inst::flags_read(const intel_device_info *devinfo) const
1100{
1101   if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
1102       predicate == BRW_PREDICATE_ALIGN1_ALLV) {
1103      /* The vertical predication modes combine corresponding bits from
1104       * f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware.
1105       */
1106      const unsigned shift = devinfo->ver >= 7 ? 4 : 2;
1107      return flag_mask(this, 1) << shift | flag_mask(this, 1);
1108   } else if (predicate) {
1109      return flag_mask(this, predicate_width(predicate));
1110   } else {
1111      unsigned mask = 0;
1112      for (int i = 0; i < sources; i++) {
1113         mask |= flag_mask(src[i], size_read(i));
1114      }
1115      return mask;
1116   }
1117}
1118
1119unsigned
1120fs_inst::flags_written(const intel_device_info *devinfo) const
1121{
1122   /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
1123    * using a separte cmpn and sel instruction.  This lowering occurs in
1124    * fs_vistor::lower_minmax which is called very, very late.
1125    */
1126   if ((conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) &&
1127                            opcode != BRW_OPCODE_CSEL &&
1128                            opcode != BRW_OPCODE_IF &&
1129                            opcode != BRW_OPCODE_WHILE)) ||
1130       opcode == FS_OPCODE_FB_WRITE) {
1131      return flag_mask(this, 1);
1132   } else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
1133              opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
1134      return flag_mask(this, 32);
1135   } else {
1136      return flag_mask(dst, size_written);
1137   }
1138}
1139
1140/**
1141 * Returns how many MRFs an FS opcode will write over.
1142 *
1143 * Note that this is not the 0 or 1 implied writes in an actual gen
1144 * instruction -- the FS opcodes often generate MOVs in addition.
1145 */
1146unsigned
1147fs_inst::implied_mrf_writes() const
1148{
1149   if (mlen == 0)
1150      return 0;
1151
1152   if (base_mrf == -1)
1153      return 0;
1154
1155   switch (opcode) {
1156   case SHADER_OPCODE_RCP:
1157   case SHADER_OPCODE_RSQ:
1158   case SHADER_OPCODE_SQRT:
1159   case SHADER_OPCODE_EXP2:
1160   case SHADER_OPCODE_LOG2:
1161   case SHADER_OPCODE_SIN:
1162   case SHADER_OPCODE_COS:
1163      return 1 * exec_size / 8;
1164   case SHADER_OPCODE_POW:
1165   case SHADER_OPCODE_INT_QUOTIENT:
1166   case SHADER_OPCODE_INT_REMAINDER:
1167      return 2 * exec_size / 8;
1168   case SHADER_OPCODE_TEX:
1169   case FS_OPCODE_TXB:
1170   case SHADER_OPCODE_TXD:
1171   case SHADER_OPCODE_TXF:
1172   case SHADER_OPCODE_TXF_CMS:
1173   case SHADER_OPCODE_TXF_MCS:
1174   case SHADER_OPCODE_TG4:
1175   case SHADER_OPCODE_TG4_OFFSET:
1176   case SHADER_OPCODE_TXL:
1177   case SHADER_OPCODE_TXS:
1178   case SHADER_OPCODE_LOD:
1179   case SHADER_OPCODE_SAMPLEINFO:
1180      return 1;
1181   case FS_OPCODE_FB_WRITE:
1182   case FS_OPCODE_REP_FB_WRITE:
1183      return src[0].file == BAD_FILE ? 0 : 2;
1184   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1185   case SHADER_OPCODE_GFX4_SCRATCH_READ:
1186      return 1;
1187   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
1188      return mlen;
1189   case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
1190      return mlen;
1191   default:
1192      unreachable("not reached");
1193   }
1194}
1195
1196fs_reg
1197fs_visitor::vgrf(const glsl_type *const type)
1198{
1199   int reg_width = dispatch_width / 8;
1200   return fs_reg(VGRF,
1201                 alloc.allocate(glsl_count_dword_slots(type, false) * reg_width),
1202                 brw_type_for_base_type(type));
1203}
1204
1205fs_reg::fs_reg(enum brw_reg_file file, int nr)
1206{
1207   init();
1208   this->file = file;
1209   this->nr = nr;
1210   this->type = BRW_REGISTER_TYPE_F;
1211   this->stride = (file == UNIFORM ? 0 : 1);
1212}
1213
1214fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
1215{
1216   init();
1217   this->file = file;
1218   this->nr = nr;
1219   this->type = type;
1220   this->stride = (file == UNIFORM ? 0 : 1);
1221}
1222
1223/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
1224 * This brings in those uniform definitions
1225 */
1226void
1227fs_visitor::import_uniforms(fs_visitor *v)
1228{
1229   this->push_constant_loc = v->push_constant_loc;
1230   this->pull_constant_loc = v->pull_constant_loc;
1231   this->uniforms = v->uniforms;
1232   this->subgroup_id = v->subgroup_id;
1233   for (unsigned i = 0; i < ARRAY_SIZE(this->group_size); i++)
1234      this->group_size[i] = v->group_size[i];
1235}
1236
1237void
1238fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
1239{
1240   assert(stage == MESA_SHADER_FRAGMENT);
1241
1242   /* gl_FragCoord.x */
1243   bld.MOV(wpos, this->pixel_x);
1244   wpos = offset(wpos, bld, 1);
1245
1246   /* gl_FragCoord.y */
1247   bld.MOV(wpos, this->pixel_y);
1248   wpos = offset(wpos, bld, 1);
1249
1250   /* gl_FragCoord.z */
1251   if (devinfo->ver >= 6) {
1252      bld.MOV(wpos, this->pixel_z);
1253   } else {
1254      bld.emit(FS_OPCODE_LINTERP, wpos,
1255               this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
1256               component(interp_reg(VARYING_SLOT_POS, 2), 0));
1257   }
1258   wpos = offset(wpos, bld, 1);
1259
1260   /* gl_FragCoord.w: Already set up in emit_interpolation */
1261   bld.MOV(wpos, this->wpos_w);
1262}
1263
1264enum brw_barycentric_mode
1265brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
1266{
1267   /* Barycentric modes don't make sense for flat inputs. */
1268   assert(mode != INTERP_MODE_FLAT);
1269
1270   unsigned bary;
1271   switch (op) {
1272   case nir_intrinsic_load_barycentric_pixel:
1273   case nir_intrinsic_load_barycentric_at_offset:
1274      bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
1275      break;
1276   case nir_intrinsic_load_barycentric_centroid:
1277      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
1278      break;
1279   case nir_intrinsic_load_barycentric_sample:
1280   case nir_intrinsic_load_barycentric_at_sample:
1281      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
1282      break;
1283   default:
1284      unreachable("invalid intrinsic");
1285   }
1286
1287   if (mode == INTERP_MODE_NOPERSPECTIVE)
1288      bary += 3;
1289
1290   return (enum brw_barycentric_mode) bary;
1291}
1292
1293/**
1294 * Turn one of the two CENTROID barycentric modes into PIXEL mode.
1295 */
1296static enum brw_barycentric_mode
1297centroid_to_pixel(enum brw_barycentric_mode bary)
1298{
1299   assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
1300          bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
1301   return (enum brw_barycentric_mode) ((unsigned) bary - 1);
1302}
1303
1304fs_reg *
1305fs_visitor::emit_frontfacing_interpolation()
1306{
1307   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1308
1309   if (devinfo->ver >= 12) {
1310      fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));
1311
1312      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_W);
1313      bld.ASR(tmp, g1, brw_imm_d(15));
1314      bld.NOT(*reg, tmp);
1315   } else if (devinfo->ver >= 6) {
1316      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1317       * a boolean result from this (~0/true or 0/false).
1318       *
1319       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1320       * this task in only one instruction:
1321       *    - a negation source modifier will flip the bit; and
1322       *    - a W -> D type conversion will sign extend the bit into the high
1323       *      word of the destination.
1324       *
1325       * An ASR 15 fills the low word of the destination.
1326       */
1327      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1328      g0.negate = true;
1329
1330      bld.ASR(*reg, g0, brw_imm_d(15));
1331   } else {
1332      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1333       * a boolean result from this (1/true or 0/false).
1334       *
1335       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1336       * the negation source modifier to flip it. Unfortunately the SHR
1337       * instruction only operates on UD (or D with an abs source modifier)
1338       * sources without negation.
1339       *
1340       * Instead, use ASR (which will give ~0/true or 0/false).
1341       */
1342      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1343      g1_6.negate = true;
1344
1345      bld.ASR(*reg, g1_6, brw_imm_d(31));
1346   }
1347
1348   return reg;
1349}
1350
1351void
1352fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1353{
1354   assert(stage == MESA_SHADER_FRAGMENT);
1355   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
1356   assert(dst.type == BRW_REGISTER_TYPE_F);
1357
1358   if (wm_prog_data->persample_dispatch) {
1359      /* Convert int_sample_pos to floating point */
1360      bld.MOV(dst, int_sample_pos);
1361      /* Scale to the range [0, 1] */
1362      bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
1363   }
1364   else {
1365      /* From ARB_sample_shading specification:
1366       * "When rendering to a non-multisample buffer, or if multisample
1367       *  rasterization is disabled, gl_SamplePosition will always be
1368       *  (0.5, 0.5).
1369       */
1370      bld.MOV(dst, brw_imm_f(0.5f));
1371   }
1372}
1373
1374fs_reg *
1375fs_visitor::emit_samplepos_setup()
1376{
1377   assert(devinfo->ver >= 6);
1378
1379   const fs_builder abld = bld.annotate("compute sample position");
1380   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1381   fs_reg pos = *reg;
1382   fs_reg int_sample_x = vgrf(glsl_type::int_type);
1383   fs_reg int_sample_y = vgrf(glsl_type::int_type);
1384
1385   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1386    * mode will be enabled.
1387    *
1388    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1389    * R31.1:0         Position Offset X/Y for Slot[3:0]
1390    * R31.3:2         Position Offset X/Y for Slot[7:4]
1391    * .....
1392    *
1393    * The X, Y sample positions come in as bytes in  thread payload. So, read
1394    * the positions using vstride=16, width=8, hstride=2.
1395    */
1396   const fs_reg sample_pos_reg =
1397      fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W);
1398
1399   /* Compute gl_SamplePosition.x */
1400   abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0));
1401   compute_sample_position(offset(pos, abld, 0), int_sample_x);
1402
1403   /* Compute gl_SamplePosition.y */
1404   abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1));
1405   compute_sample_position(offset(pos, abld, 1), int_sample_y);
1406   return reg;
1407}
1408
/**
 * Emits the computation of gl_SampleID and returns a newly allocated uint
 * register holding it.
 *
 * Fragment shader only, Gfx6+.  Three strategies are used:
 *  - without a multisample FBO the ID is the constant zero;
 *  - on Gfx8+ the per-slot IDs are unpacked from the thread payload;
 *  - otherwise the ID is rebuilt from the Starting Sample Pair Index in
 *    R0.0 with the help of FS_OPCODE_SET_SAMPLE_ID.
 */
fs_reg *
fs_visitor::emit_sampleid_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(devinfo->ver >= 6);

   const fs_builder abld = bld.annotate("compute sample id");
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type));

   if (!key->multisample_fbo) {
      /* As per GL_ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SampleID will always be zero."
       */
      abld.MOV(*reg, brw_imm_d(0));
   } else if (devinfo->ver >= 8) {
      /* Sample ID comes in as 4-bit numbers in g1.0:
       *
       *    15:12 Slot 3 SampleID (only used in SIMD16)
       *     11:8 Slot 2 SampleID (only used in SIMD16)
       *      7:4 Slot 1 SampleID
       *      3:0 Slot 0 SampleID
       *
       * Each slot corresponds to four channels, so we want to replicate each
       * half-byte value to 4 channels in a row:
       *
       *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
       *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
       *
       *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
       *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
       *
       * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
       * channels to read the first byte (7:0), and the second group of 8
       * channels to read the second byte (15:8).  Then, we shift right by
       * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
       * values into place.  Finally, we AND with 0xf to keep the low nibble.
       *
       *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
       *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
       *
       * TODO: These payload bits exist on Gfx7 too, but they appear to always
       *       be zero, so this code fails to work.  We should find out why.
       */
      const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW);

      /* One SHR per group of (up to) 16 channels; group i reads its IDs
       * from the start of g(1 + i).
       */
      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
         const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
         hbld.SHR(offset(tmp, hbld, i),
                  stride(retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UB),
                         1, 8, 0),
                  brw_imm_v(0x44440000));
      }

      abld.AND(*reg, tmp, brw_imm_w(0xf));
   } else {
      /* t1 is a single (scalar) component, t2 holds a short sequence. */
      const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0);
      const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW);

      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
       * 8x multisampling, subspan 0 will represent sample N (where N
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
       * 7. We can find the value of N by looking at R0.0 bits 7:6
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
       * (since samples are always delivered in pairs). That is, we
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
       * populating a temporary variable with the sequence (0, 1, 2, 3),
       * and then reading from it using vstride=1, width=4, hstride=0.
       * These computations hold good for 4x multisampling as well.
       *
       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
       * the first four slots are sample 0 of subspan 0; the next four
       * are sample 1 of subspan 0; the third group is sample 0 of
       * subspan 1, and finally sample 1 of subspan 1.
       */

      /* SKL+ has an extra bit for the Starting Sample Pair Index to
       * accommodate 16x MSAA.
       *
       * NOTE(review): this branch is only reached when devinfo->ver < 8 and
       * the mask below is 0xc0 (bits 7:6 only), so the SKL+ remark looks
       * stale here -- confirm.
       */
      abld.exec_all().group(1, 0)
          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
               brw_imm_ud(0xc0));
      abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));

      /* This works for SIMD8-SIMD16.  It also works for SIMD32 but only if we
       * can assume 4x MSAA.  Disallow it on IVB+
       *
       * FINISHME: One day, we could come up with a way to do this that
       * actually works on gfx7.
       */
      if (devinfo->ver >= 7)
         limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gfx7");
      abld.exec_all().group(8, 0).MOV(t2, brw_imm_v(0x32103210));

      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
      abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
   }

   return reg;
}
1515
1516fs_reg *
1517fs_visitor::emit_samplemaskin_setup()
1518{
1519   assert(stage == MESA_SHADER_FRAGMENT);
1520   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
1521   assert(devinfo->ver >= 6);
1522
1523   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1524
1525   /* The HW doesn't provide us with expected values. */
1526   assert(!wm_prog_data->per_coarse_pixel_dispatch);
1527
1528   fs_reg coverage_mask =
1529      fetch_payload_reg(bld, payload.sample_mask_in_reg, BRW_REGISTER_TYPE_D);
1530
1531   if (wm_prog_data->persample_dispatch) {
1532      /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
1533       * and a mask representing which sample is being processed by the
1534       * current shader invocation.
1535       *
1536       * From the OES_sample_variables specification:
1537       * "When per-sample shading is active due to the use of a fragment input
1538       *  qualified by "sample" or due to the use of the gl_SampleID or
1539       *  gl_SamplePosition variables, only the bit for the current sample is
1540       *  set in gl_SampleMaskIn."
1541       */
1542      const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
1543
1544      if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
1545         nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
1546
1547      fs_reg one = vgrf(glsl_type::int_type);
1548      fs_reg enabled_mask = vgrf(glsl_type::int_type);
1549      abld.MOV(one, brw_imm_d(1));
1550      abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
1551      abld.AND(*reg, enabled_mask, coverage_mask);
1552   } else {
1553      /* In per-pixel mode, the coverage mask is sufficient. */
1554      *reg = coverage_mask;
1555   }
1556   return reg;
1557}
1558
1559fs_reg *
1560fs_visitor::emit_shading_rate_setup()
1561{
1562   assert(devinfo->ver >= 11);
1563
1564   const fs_builder abld = bld.annotate("compute fragment shading rate");
1565
1566   fs_reg *reg = new(this->mem_ctx) fs_reg(bld.vgrf(BRW_REGISTER_TYPE_UD));
1567
1568   struct brw_wm_prog_data *wm_prog_data =
1569      brw_wm_prog_data(bld.shader->stage_prog_data);
1570
1571   /* Coarse pixel shading size fields overlap with other fields of not in
1572    * coarse pixel dispatch mode, so report 0 when that's not the case.
1573    */
1574   if (wm_prog_data->per_coarse_pixel_dispatch) {
1575      /* The shading rates provided in the shader are the actual 2D shading
1576       * rate while the SPIR-V built-in is the enum value that has the shading
1577       * rate encoded as a bitfield.  Fortunately, the bitfield value is just
1578       * the shading rate divided by two and shifted.
1579       */
1580
1581      /* r1.0 - 0:7 ActualCoarsePixelShadingSize.X */
1582      fs_reg actual_x = fs_reg(retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UB));
1583      /* r1.0 - 15:8 ActualCoarsePixelShadingSize.Y */
1584      fs_reg actual_y = byte_offset(actual_x, 1);
1585
1586      fs_reg int_rate_x = bld.vgrf(BRW_REGISTER_TYPE_UD);
1587      fs_reg int_rate_y = bld.vgrf(BRW_REGISTER_TYPE_UD);
1588
1589      abld.SHR(int_rate_y, actual_y, brw_imm_ud(1));
1590      abld.SHR(int_rate_x, actual_x, brw_imm_ud(1));
1591      abld.SHL(int_rate_x, int_rate_x, brw_imm_ud(2));
1592      abld.OR(*reg, int_rate_x, int_rate_y);
1593   } else {
1594      abld.MOV(*reg, brw_imm_ud(0));
1595   }
1596
1597   return reg;
1598}
1599
1600fs_reg
1601fs_visitor::resolve_source_modifiers(const fs_reg &src)
1602{
1603   if (!src.abs && !src.negate)
1604      return src;
1605
1606   fs_reg temp = bld.vgrf(src.type);
1607   bld.MOV(temp, src);
1608
1609   return temp;
1610}
1611
1612void
1613fs_visitor::emit_gs_thread_end()
1614{
1615   assert(stage == MESA_SHADER_GEOMETRY);
1616
1617   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
1618
1619   if (gs_compile->control_data_header_size_bits > 0) {
1620      emit_gs_control_data_bits(this->final_gs_vertex_count);
1621   }
1622
1623   const fs_builder abld = bld.annotate("thread end");
1624   fs_inst *inst;
1625
1626   if (gs_prog_data->static_vertex_count != -1) {
1627      foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
1628         if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
1629             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
1630             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
1631             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
1632            prev->eot = true;
1633
1634            /* Delete now dead instructions. */
1635            foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
1636               if (dead == prev)
1637                  break;
1638               dead->remove();
1639            }
1640            return;
1641         } else if (prev->is_control_flow() || prev->has_side_effects()) {
1642            break;
1643         }
1644      }
1645      fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
1646      abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
1647      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
1648      inst->mlen = 1;
1649   } else {
1650      fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
1651      fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
1652      sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
1653      sources[1] = this->final_gs_vertex_count;
1654      abld.LOAD_PAYLOAD(payload, sources, 2, 2);
1655      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
1656      inst->mlen = 2;
1657   }
1658   inst->eot = true;
1659   inst->offset = 0;
1660}
1661
/**
 * Lays out the push constants (regular uniforms followed by up to four
 * pushed UBO ranges) and rewrites every UNIFORM-file source in the program
 * into the fixed hardware register holding the corresponding value.
 *
 * Besides the rewrite, this function:
 *  - sets prog_data->curb_read_length;
 *  - for compute shaders on verx10 >= 125 that do not use inline data, emits
 *    SEND instructions at the start of the program that load the uniform
 *    push data;
 *  - emits code that masks push registers flagged in zero_push_reg, when
 *    any of those is actually used;
 *  - sets first_non_payload_grf to the first register after the push data.
 */
void
fs_visitor::assign_curb_setup()
{
   /* Push lengths are counted in groups of 8 parameters. */
   unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);

   /* UBO ranges follow the regular uniforms; ubo_push_start[] holds the
    * starting position of each range, 8 entries per length unit.
    */
   unsigned ubo_push_length = 0;
   unsigned ubo_push_start[4];
   for (int i = 0; i < 4; i++) {
      ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
      ubo_push_length += stage_prog_data->ubo_ranges[i].length;
   }

   prog_data->curb_read_length = uniform_push_length + ubo_push_length;

   /* Bitmask of push registers referenced by some instruction. */
   uint64_t used = 0;

   if (stage == MESA_SHADER_COMPUTE &&
       brw_cs_prog_data(prog_data)->uses_inline_data) {
      /* With COMPUTE_WALKER, we can push up to one register worth of data via
       * the inline data parameter in the COMPUTE_WALKER command itself.
       *
       * TODO: Support inline data and push at the same time.
       */
      assert(devinfo->verx10 >= 125);
      assert(uniform_push_length <= 1);
   } else if (stage == MESA_SHADER_COMPUTE && devinfo->verx10 >= 125) {
      /* Emit the loads at the very start of the program. */
      fs_builder ubld = bld.exec_all().group(8, 0).at(
         cfg->first_block(), cfg->first_block()->start());

      /* The base address for our push data is passed in as R0.0[31:6].  We
       * have to mask off the bottom 6 bits.
       */
      fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.group(1, 0).AND(base_addr,
                           retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
                           brw_imm_ud(INTEL_MASK(31, 6)));

      /* Message header: all zero except component 2, which receives the
       * base address shifted right by 4.
       */
      fs_reg header0 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.MOV(header0, brw_imm_ud(0));
      ubld.group(1, 0).SHR(component(header0, 2), base_addr, brw_imm_ud(4));

      /* On Gfx12-HP we load constants at the start of the program using A32
       * stateless messages.
       */
      for (unsigned i = 0; i < uniform_push_length;) {
         /* Limit ourselves to HW limit of 8 Owords (8 * 16bytes = 128 bytes
          * or 4 registers).
          */
         unsigned num_regs = MIN2(uniform_push_length - i, 4);
         assert(num_regs > 0);
         /* Round down to a power of two. */
         num_regs = 1 << util_logbase2(num_regs);

         /* The first load reuses header0; later ones get their own header
          * whose component 2 is advanced by i * 2 relative to header0.
          */
         fs_reg header;
         if (i == 0) {
            header = header0;
         } else {
            header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
            ubld.MOV(header, brw_imm_ud(0));
            ubld.group(1, 0).ADD(component(header, 2),
                                 component(header0, 2),
                                 brw_imm_ud(i * 2));
         }

         fs_reg srcs[4] = {
            brw_imm_ud(0), /* desc */
            brw_imm_ud(0), /* ex_desc */
            header, /* payload */
            fs_reg(), /* payload2 */
         };

         /* The data lands directly in the push register it stands for. */
         fs_reg dest = retype(brw_vec8_grf(payload.num_regs + i, 0),
                              BRW_REGISTER_TYPE_UD);

         /* This instruction has to be run SIMD16 if we're filling more than a
          * single register.
          */
         unsigned send_width = MIN2(16, num_regs * 8);

         fs_inst *send = ubld.group(send_width, 0).emit(SHADER_OPCODE_SEND,
                                                        dest, srcs, 4);
         send->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
         send->desc = brw_dp_desc(devinfo, GFX8_BTI_STATELESS_NON_COHERENT,
                                  GFX7_DATAPORT_DC_OWORD_BLOCK_READ,
                                  BRW_DATAPORT_OWORD_BLOCK_OWORDS(num_regs * 2));
         send->header_size = 1;
         send->mlen = 1;
         send->size_written = num_regs * REG_SIZE;
         send->send_is_volatile = true;

         i += num_regs;
      }

      /* Instructions were inserted. */
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
	 if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
            int constant_nr;
            if (inst->src[i].nr >= UBO_START) {
               /* constant_nr is in 32-bit units, the rest are in bytes */
               constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
                             inst->src[i].offset / 4;
            } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            /* Record which push register (8 constants each) is referenced;
             * the bitmask has room for 64 of them.
             */
            assert(constant_nr / 8 < 64);
            used |= BITFIELD64_BIT(constant_nr / 8);

	    struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
						  constant_nr / 8,
						  constant_nr % 8);
            brw_reg.abs = inst->src[i].abs;
            brw_reg.negate = inst->src[i].negate;

            /* Keep the source's type and its sub-dword byte offset. */
            assert(inst->src[i].stride == 0);
            inst->src[i] = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].offset % 4);
	 }
      }
   }

   /* Only registers that are both used and flagged in zero_push_reg need
    * the masking code below.
    */
   uint64_t want_zero = used & stage_prog_data->zero_push_reg;
   if (want_zero) {
      assert(!compiler->compact_params);
      fs_builder ubld = bld.exec_all().group(8, 0).at(
         cfg->first_block(), cfg->first_block()->start());

      /* push_reg_mask_param is in 32-bit units */
      unsigned mask_param = stage_prog_data->push_reg_mask_param;
      struct brw_reg mask = brw_vec1_grf(payload.num_regs + mask_param / 8,
                                                            mask_param % 8);

      fs_reg b32;
      for (unsigned i = 0; i < 64; i++) {
         /* At the start of each group of 16 registers that contains at
          * least one register to mask, rebuild b32 from the matching 16
          * bits of the mask parameter: the two SHLs move one mask bit per
          * channel into the top bit of a word, and the ASR by 15 spreads
          * that bit over the whole dword.
          */
         if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
            fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2);
            ubld.SHL(horiz_offset(shifted, 8),
                     byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8),
                     brw_imm_v(0x01234567));
            ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));

            fs_builder ubld16 = ubld.group(16, 0);
            b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D);
            ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
         }

         if (want_zero & BITFIELD64_BIT(i)) {
            assert(i < prog_data->curb_read_length);
            struct brw_reg push_reg =
               retype(brw_vec8_grf(payload.num_regs + i, 0),
                      BRW_REGISTER_TYPE_D);

            /* AND the push register with its component of b32. */
            ubld.AND(push_reg, push_reg, component(b32, i % 16));
         }
      }

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
   this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
}
1836
1837/*
1838 * Build up an array of indices into the urb_setup array that
1839 * references the active entries of the urb_setup array.
1840 * Used to accelerate walking the active entries of the urb_setup array
1841 * on each upload.
1842 */
1843void
1844brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
1845{
1846   /* Make sure uint8_t is sufficient */
1847   STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
1848   uint8_t index = 0;
1849   for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
1850      if (wm_prog_data->urb_setup[attr] >= 0) {
1851         wm_prog_data->urb_setup_attribs[index++] = attr;
1852      }
1853   }
1854   wm_prog_data->urb_setup_attribs_count = index;
1855}
1856
1857static void
1858calculate_urb_setup(const struct intel_device_info *devinfo,
1859                    const struct brw_wm_prog_key *key,
1860                    struct brw_wm_prog_data *prog_data,
1861                    const nir_shader *nir)
1862{
1863   memset(prog_data->urb_setup, -1,
1864          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1865
1866   int urb_next = 0;
1867   /* Figure out where each of the incoming setup attributes lands. */
1868   if (devinfo->ver >= 6) {
1869      if (util_bitcount64(nir->info.inputs_read &
1870                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
1871         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1872          * first 16 varying inputs, so we can put them wherever we want.
1873          * Just put them in order.
1874          *
1875          * This is useful because it means that (a) inputs not used by the
1876          * fragment shader won't take up valuable register space, and (b) we
1877          * won't have to recompile the fragment shader if it gets paired with
1878          * a different vertex (or geometry) shader.
1879          */
1880         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1881            if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
1882                BITFIELD64_BIT(i)) {
1883               prog_data->urb_setup[i] = urb_next++;
1884            }
1885         }
1886      } else {
1887         /* We have enough input varyings that the SF/SBE pipeline stage can't
1888          * arbitrarily rearrange them to suit our whim; we have to put them
1889          * in an order that matches the output of the previous pipeline stage
1890          * (geometry or vertex shader).
1891          */
1892
1893         /* Re-compute the VUE map here in the case that the one coming from
1894          * geometry has more than one position slot (used for Primitive
1895          * Replication).
1896          */
1897         struct brw_vue_map prev_stage_vue_map;
1898         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
1899                             key->input_slots_valid,
1900                             nir->info.separate_shader, 1);
1901
1902         int first_slot =
1903            brw_compute_first_urb_slot_required(nir->info.inputs_read,
1904                                                &prev_stage_vue_map);
1905
1906         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1907         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1908              slot++) {
1909            int varying = prev_stage_vue_map.slot_to_varying[slot];
1910            if (varying != BRW_VARYING_SLOT_PAD &&
1911                (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
1912                 BITFIELD64_BIT(varying))) {
1913               prog_data->urb_setup[varying] = slot - first_slot;
1914            }
1915         }
1916         urb_next = prev_stage_vue_map.num_slots - first_slot;
1917      }
1918   } else {
1919      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1920      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1921         /* Point size is packed into the header, not as a general attribute */
1922         if (i == VARYING_SLOT_PSIZ)
1923            continue;
1924
1925	 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1926	    /* The back color slot is skipped when the front color is
1927	     * also written to.  In addition, some slots can be
1928	     * written in the vertex shader and not read in the
1929	     * fragment shader.  So the register number must always be
1930	     * incremented, mapped or not.
1931	     */
1932	    if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1933	       prog_data->urb_setup[i] = urb_next;
1934            urb_next++;
1935	 }
1936      }
1937
1938      /*
1939       * It's a FS only attribute, and we did interpolation for this attribute
1940       * in SF thread. So, count it here, too.
1941       *
1942       * See compile_sf_prog() for more info.
1943       */
1944      if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1945         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1946   }
1947
1948   prog_data->num_varying_inputs = urb_next;
1949   prog_data->inputs = nir->info.inputs_read;
1950
1951   brw_compute_urb_setup_index(prog_data);
1952}
1953
1954void
1955fs_visitor::assign_urb_setup()
1956{
1957   assert(stage == MESA_SHADER_FRAGMENT);
1958   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
1959
1960   int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1961
1962   /* Offset all the urb_setup[] index by the actual position of the
1963    * setup regs, now that the location of the constants has been chosen.
1964    */
1965   foreach_block_and_inst(block, fs_inst, inst, cfg) {
1966      for (int i = 0; i < inst->sources; i++) {
1967         if (inst->src[i].file == ATTR) {
1968            /* ATTR regs in the FS are in units of logical scalar inputs each
1969             * of which consumes half of a GRF register.
1970             */
1971            assert(inst->src[i].offset < REG_SIZE / 2);
1972            const unsigned grf = urb_start + inst->src[i].nr / 2;
1973            const unsigned offset = (inst->src[i].nr % 2) * (REG_SIZE / 2) +
1974                                    inst->src[i].offset;
1975            const unsigned width = inst->src[i].stride == 0 ?
1976                                   1 : MIN2(inst->exec_size, 8);
1977            struct brw_reg reg = stride(
1978               byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
1979                           offset),
1980               width * inst->src[i].stride,
1981               width, inst->src[i].stride);
1982            reg.abs = inst->src[i].abs;
1983            reg.negate = inst->src[i].negate;
1984            inst->src[i] = reg;
1985         }
1986      }
1987   }
1988
1989   /* Each attribute is 4 setup channels, each of which is half a reg. */
1990   this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
1991}
1992
1993void
1994fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
1995{
1996   for (int i = 0; i < inst->sources; i++) {
1997      if (inst->src[i].file == ATTR) {
1998         int grf = payload.num_regs +
1999                   prog_data->curb_read_length +
2000                   inst->src[i].nr +
2001                   inst->src[i].offset / REG_SIZE;
2002
2003         /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
2004          *
2005          * VertStride must be used to cross GRF register boundaries. This
2006          * rule implies that elements within a 'Width' cannot cross GRF
2007          * boundaries.
2008          *
2009          * So, for registers that are large enough, we have to split the exec
2010          * size in two and trust the compression state to sort it out.
2011          */
2012         unsigned total_size = inst->exec_size *
2013                               inst->src[i].stride *
2014                               type_sz(inst->src[i].type);
2015
2016         assert(total_size <= 2 * REG_SIZE);
2017         const unsigned exec_size =
2018            (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
2019
2020         unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
2021         struct brw_reg reg =
2022            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
2023                               inst->src[i].offset % REG_SIZE),
2024                   exec_size * inst->src[i].stride,
2025                   width, inst->src[i].stride);
2026         reg.abs = inst->src[i].abs;
2027         reg.negate = inst->src[i].negate;
2028
2029         inst->src[i] = reg;
2030      }
2031   }
2032}
2033
2034void
2035fs_visitor::assign_vs_urb_setup()
2036{
2037   struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
2038
2039   assert(stage == MESA_SHADER_VERTEX);
2040
2041   /* Each attribute is 4 regs. */
2042   this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
2043
2044   assert(vs_prog_data->base.urb_read_length <= 15);
2045
2046   /* Rewrite all ATTR file references to the hw grf that they land in. */
2047   foreach_block_and_inst(block, fs_inst, inst, cfg) {
2048      convert_attr_sources_to_hw_regs(inst);
2049   }
2050}
2051
2052void
2053fs_visitor::assign_tcs_urb_setup()
2054{
2055   assert(stage == MESA_SHADER_TESS_CTRL);
2056
2057   /* Rewrite all ATTR file references to HW_REGs. */
2058   foreach_block_and_inst(block, fs_inst, inst, cfg) {
2059      convert_attr_sources_to_hw_regs(inst);
2060   }
2061}
2062
2063void
2064fs_visitor::assign_tes_urb_setup()
2065{
2066   assert(stage == MESA_SHADER_TESS_EVAL);
2067
2068   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
2069
2070   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
2071
2072   /* Rewrite all ATTR file references to HW_REGs. */
2073   foreach_block_and_inst(block, fs_inst, inst, cfg) {
2074      convert_attr_sources_to_hw_regs(inst);
2075   }
2076}
2077
2078void
2079fs_visitor::assign_gs_urb_setup()
2080{
2081   assert(stage == MESA_SHADER_GEOMETRY);
2082
2083   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
2084
2085   first_non_payload_grf +=
2086      8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
2087
2088   foreach_block_and_inst(block, fs_inst, inst, cfg) {
2089      /* Rewrite all ATTR file references to GRFs. */
2090      convert_attr_sources_to_hw_regs(inst);
2091   }
2092}
2093
2094
2095/**
2096 * Split large virtual GRFs into separate components if we can.
2097 *
2098 * This is mostly duplicated with what brw_fs_vector_splitting does,
2099 * but that's really conservative because it's afraid of doing
2100 * splitting that doesn't result in real progress after the rest of
2101 * the optimization phases, which would cause infinite looping in
2102 * optimization.  We can do it once here, safely.  This also has the
2103 * opportunity to split interpolated values, or maybe even uniforms,
2104 * which we don't have at the IR level.
2105 *
2106 * We want to split, because virtual GRFs are what we register
2107 * allocate and spill (due to contiguousness requirements for some
2108 * instructions), and they're what we naturally generate in the
2109 * codegen process, but most virtual GRFs don't actually need to be
2110 * contiguous sets of GRFs.  If we split, we'll end up with reduced
2111 * live intervals and better dead code elimination and coalescing.
2112 */
2113void
2114fs_visitor::split_virtual_grfs()
2115{
2116   /* Compact the register file so we eliminate dead vgrfs.  This
2117    * only defines split points for live registers, so if we have
2118    * too large dead registers they will hit assertions later.
2119    */
2120   compact_virtual_grfs();
2121
2122   int num_vars = this->alloc.count;
2123
2124   /* Count the total number of registers */
2125   int reg_count = 0;
2126   int vgrf_to_reg[num_vars];
2127   for (int i = 0; i < num_vars; i++) {
2128      vgrf_to_reg[i] = reg_count;
2129      reg_count += alloc.sizes[i];
2130   }
2131
2132   /* An array of "split points".  For each register slot, this indicates
2133    * if this slot can be separated from the previous slot.  Every time an
2134    * instruction uses multiple elements of a register (as a source or
2135    * destination), we mark the used slots as inseparable.  Then we go
2136    * through and split the registers into the smallest pieces we can.
2137    */
2138   bool *split_points = new bool[reg_count];
2139   memset(split_points, 0, reg_count * sizeof(*split_points));
2140
2141   /* Mark all used registers as fully splittable */
2142   foreach_block_and_inst(block, fs_inst, inst, cfg) {
2143      if (inst->dst.file == VGRF) {
2144         int reg = vgrf_to_reg[inst->dst.nr];
2145         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
2146            split_points[reg + j] = true;
2147      }
2148
2149      for (int i = 0; i < inst->sources; i++) {
2150         if (inst->src[i].file == VGRF) {
2151            int reg = vgrf_to_reg[inst->src[i].nr];
2152            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
2153               split_points[reg + j] = true;
2154         }
2155      }
2156   }
2157
2158   foreach_block_and_inst(block, fs_inst, inst, cfg) {
2159      /* We fix up undef instructions later */
2160      if (inst->opcode == SHADER_OPCODE_UNDEF) {
2161         /* UNDEF instructions are currently only used to undef entire
2162          * registers.  We need this invariant later when we split them.
2163          */
2164         assert(inst->dst.file == VGRF);
2165         assert(inst->dst.offset == 0);
2166         assert(inst->size_written == alloc.sizes[inst->dst.nr] * REG_SIZE);
2167         continue;
2168      }
2169
2170      if (inst->dst.file == VGRF) {
2171         int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
2172         for (unsigned j = 1; j < regs_written(inst); j++)
2173            split_points[reg + j] = false;
2174      }
2175      for (int i = 0; i < inst->sources; i++) {
2176         if (inst->src[i].file == VGRF) {
2177            int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
2178            for (unsigned j = 1; j < regs_read(inst, i); j++)
2179               split_points[reg + j] = false;
2180         }
2181      }
2182   }
2183
2184   int *new_virtual_grf = new int[reg_count];
2185   int *new_reg_offset = new int[reg_count];
2186
2187   int reg = 0;
2188   for (int i = 0; i < num_vars; i++) {
2189      /* The first one should always be 0 as a quick sanity check. */
2190      assert(split_points[reg] == false);
2191
2192      /* j = 0 case */
2193      new_reg_offset[reg] = 0;
2194      reg++;
2195      int offset = 1;
2196
2197      /* j > 0 case */
2198      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
2199         /* If this is a split point, reset the offset to 0 and allocate a
2200          * new virtual GRF for the previous offset many registers
2201          */
2202         if (split_points[reg]) {
2203            assert(offset <= MAX_VGRF_SIZE);
2204            int grf = alloc.allocate(offset);
2205            for (int k = reg - offset; k < reg; k++)
2206               new_virtual_grf[k] = grf;
2207            offset = 0;
2208         }
2209         new_reg_offset[reg] = offset;
2210         offset++;
2211         reg++;
2212      }
2213
2214      /* The last one gets the original register number */
2215      assert(offset <= MAX_VGRF_SIZE);
2216      alloc.sizes[i] = offset;
2217      for (int k = reg - offset; k < reg; k++)
2218         new_virtual_grf[k] = i;
2219   }
2220   assert(reg == reg_count);
2221
2222   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2223      if (inst->opcode == SHADER_OPCODE_UNDEF) {
2224         const fs_builder ibld(this, block, inst);
2225         assert(inst->size_written % REG_SIZE == 0);
2226         unsigned reg_offset = 0;
2227         while (reg_offset < inst->size_written / REG_SIZE) {
2228            reg = vgrf_to_reg[inst->dst.nr] + reg_offset;
2229            ibld.UNDEF(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type));
2230            reg_offset += alloc.sizes[new_virtual_grf[reg]];
2231         }
2232         inst->remove(block);
2233         continue;
2234      }
2235
2236      if (inst->dst.file == VGRF) {
2237         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
2238         inst->dst.nr = new_virtual_grf[reg];
2239         inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
2240                            inst->dst.offset % REG_SIZE;
2241         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2242      }
2243      for (int i = 0; i < inst->sources; i++) {
2244	 if (inst->src[i].file == VGRF) {
2245            reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
2246            inst->src[i].nr = new_virtual_grf[reg];
2247            inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
2248                                  inst->src[i].offset % REG_SIZE;
2249            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
2250         }
2251      }
2252   }
2253   invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
2254
2255   delete[] split_points;
2256   delete[] new_virtual_grf;
2257   delete[] new_reg_offset;
2258}
2259
2260/**
2261 * Remove unused virtual GRFs and compact the vgrf_* arrays.
2262 *
2263 * During code generation, we create tons of temporary variables, many of
2264 * which get immediately killed and are never used again.  Yet, in later
2265 * optimization and analysis passes, such as compute_live_intervals, we need
2266 * to loop over all the virtual GRFs.  Compacting them can save a lot of
2267 * overhead.
2268 */
2269bool
2270fs_visitor::compact_virtual_grfs()
2271{
2272   bool progress = false;
2273   int *remap_table = new int[this->alloc.count];
2274   memset(remap_table, -1, this->alloc.count * sizeof(int));
2275
2276   /* Mark which virtual GRFs are used. */
2277   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2278      if (inst->dst.file == VGRF)
2279         remap_table[inst->dst.nr] = 0;
2280
2281      for (int i = 0; i < inst->sources; i++) {
2282         if (inst->src[i].file == VGRF)
2283            remap_table[inst->src[i].nr] = 0;
2284      }
2285   }
2286
2287   /* Compact the GRF arrays. */
2288   int new_index = 0;
2289   for (unsigned i = 0; i < this->alloc.count; i++) {
2290      if (remap_table[i] == -1) {
2291         /* We just found an unused register.  This means that we are
2292          * actually going to compact something.
2293          */
2294         progress = true;
2295      } else {
2296         remap_table[i] = new_index;
2297         alloc.sizes[new_index] = alloc.sizes[i];
2298         invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
2299         ++new_index;
2300      }
2301   }
2302
2303   this->alloc.count = new_index;
2304
2305   /* Patch all the instructions to use the newly renumbered registers */
2306   foreach_block_and_inst(block, fs_inst, inst, cfg) {
2307      if (inst->dst.file == VGRF)
2308         inst->dst.nr = remap_table[inst->dst.nr];
2309
2310      for (int i = 0; i < inst->sources; i++) {
2311         if (inst->src[i].file == VGRF)
2312            inst->src[i].nr = remap_table[inst->src[i].nr];
2313      }
2314   }
2315
2316   /* Patch all the references to delta_xy, since they're used in register
2317    * allocation.  If they're unused, switch them to BAD_FILE so we don't
2318    * think some random VGRF is delta_xy.
2319    */
2320   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
2321      if (delta_xy[i].file == VGRF) {
2322         if (remap_table[delta_xy[i].nr] != -1) {
2323            delta_xy[i].nr = remap_table[delta_xy[i].nr];
2324         } else {
2325            delta_xy[i].file = BAD_FILE;
2326         }
2327      }
2328   }
2329
2330   delete[] remap_table;
2331
2332   return progress;
2333}
2334
2335static int
2336get_subgroup_id_param_index(const intel_device_info *devinfo,
2337                            const brw_stage_prog_data *prog_data)
2338{
2339   if (prog_data->nr_params == 0)
2340      return -1;
2341
2342   if (devinfo->verx10 >= 125)
2343      return -1;
2344
2345   /* The local thread id is always the last parameter in the list */
2346   uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
2347   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
2348      return prog_data->nr_params - 1;
2349
2350   return -1;
2351}
2352
/**
 * Struct for handling complex alignments.
 *
 * A complex alignment is stored as multiplier and an offset.  A value is
 * considered to be aligned if it is {offset} larger than a multiple of {mul}.
 * For instance, with an alignment of {8, 6}, cplx_align_apply would do the
 * following:
 *
 *  N  | cplx_align_apply({8, 6}, N)
 * ----+-----------------------------
 *  4  | 6
 *  6  | 6
 *  8  | 14
 *  10 | 14
 *  12 | 14
 *  14 | 14
 *  16 | 22
 *
 * Both members are 4-bit fields, so neither can hold a value above 15.
 */
struct cplx_align {
   unsigned mul:4;
   unsigned offset:4;
};

/* Upper bound on the alignment accepted by mark_uniform_slots_read(). */
#define CPLX_ALIGN_MAX_MUL 8
2377
2378static void
2379cplx_align_assert_sane(struct cplx_align a)
2380{
2381   assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
2382   assert(a.offset < a.mul);
2383}
2384
2385/**
2386 * Combines two alignments to produce a least multiple of sorts.
2387 *
2388 * The returned alignment is the smallest (in terms of multiplier) such that
2389 * anything aligned to both a and b will be aligned to the new alignment.
2390 * This function will assert-fail if a and b are not compatible, i.e. if the
2391 * offset parameters are such that no common alignment is possible.
2392 */
2393static struct cplx_align
2394cplx_align_combine(struct cplx_align a, struct cplx_align b)
2395{
2396   cplx_align_assert_sane(a);
2397   cplx_align_assert_sane(b);
2398
2399   /* Assert that the alignments agree. */
2400   assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));
2401
2402   return a.mul > b.mul ? a : b;
2403}
2404
2405/**
2406 * Apply a complex alignment
2407 *
2408 * This function will return the smallest number greater than or equal to
2409 * offset that is aligned to align.
2410 */
2411static unsigned
2412cplx_align_apply(struct cplx_align align, unsigned offset)
2413{
2414   return ALIGN(offset - align.offset, align.mul) + align.offset;
2415}
2416
2417#define UNIFORM_SLOT_SIZE 4
2418
2419struct uniform_slot_info {
2420   /** True if the given uniform slot is live */
2421   unsigned is_live:1;
2422
2423   /** True if this slot and the next slot must remain contiguous */
2424   unsigned contiguous:1;
2425
2426   struct cplx_align align;
2427};
2428
2429static void
2430mark_uniform_slots_read(struct uniform_slot_info *slots,
2431                        unsigned num_slots, unsigned alignment)
2432{
2433   assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));
2434   assert(alignment <= CPLX_ALIGN_MAX_MUL);
2435
2436   /* We can't align a slot to anything less than the slot size */
2437   alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);
2438
2439   struct cplx_align align = {alignment, 0};
2440   cplx_align_assert_sane(align);
2441
2442   for (unsigned i = 0; i < num_slots; i++) {
2443      slots[i].is_live = true;
2444      if (i < num_slots - 1)
2445         slots[i].contiguous = true;
2446
2447      align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);
2448      if (slots[i].align.mul == 0) {
2449         slots[i].align = align;
2450      } else {
2451         slots[i].align = cplx_align_combine(slots[i].align, align);
2452      }
2453   }
2454}
2455
2456/**
2457 * Assign UNIFORM file registers to either push constants or pull constants.
2458 *
2459 * We allow a fragment shader to have more than the specified minimum
2460 * maximum number of fragment shader uniform components (64).  If
2461 * there are too many of these, they'd fill up all of register space.
2462 * So, this will push some of them out to the pull constant buffer and
2463 * update the program to load them.
2464 */
2465void
2466fs_visitor::assign_constant_locations()
2467{
2468   /* Only the first compile gets to decide on locations. */
2469   if (push_constant_loc) {
2470      assert(pull_constant_loc);
2471      return;
2472   }
2473
2474   if (compiler->compact_params) {
2475      struct uniform_slot_info slots[uniforms + 1];
2476      memset(slots, 0, sizeof(slots));
2477
2478      foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2479         for (int i = 0 ; i < inst->sources; i++) {
2480            if (inst->src[i].file != UNIFORM)
2481               continue;
2482
2483            /* NIR tightly packs things so the uniform number might not be
2484             * aligned (if we have a double right after a float, for
2485             * instance).  This is fine because the process of re-arranging
2486             * them will ensure that things are properly aligned.  The offset
2487             * into that uniform, however, must be aligned.
2488             *
2489             * In Vulkan, we have explicit offsets but everything is crammed
2490             * into a single "variable" so inst->src[i].nr will always be 0.
2491             * Everything will be properly aligned relative to that one base.
2492             */
2493            assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
2494
2495            unsigned u = inst->src[i].nr +
2496                         inst->src[i].offset / UNIFORM_SLOT_SIZE;
2497
2498            if (u >= uniforms)
2499               continue;
2500
2501            unsigned slots_read;
2502            if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
2503               slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
2504            } else {
2505               unsigned bytes_read = inst->components_read(i) *
2506                                     type_sz(inst->src[i].type);
2507               slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
2508            }
2509
2510            assert(u + slots_read <= uniforms);
2511            mark_uniform_slots_read(&slots[u], slots_read,
2512                                    type_sz(inst->src[i].type));
2513         }
2514      }
2515
2516      int subgroup_id_index = get_subgroup_id_param_index(devinfo,
2517                                                          stage_prog_data);
2518
2519      /* Only allow 16 registers (128 uniform components) as push constants.
2520       *
2521       * Just demote the end of the list.  We could probably do better
2522       * here, demoting things that are rarely used in the program first.
2523       *
2524       * If changing this value, note the limitation about total_regs in
2525       * brw_curbe.c.
2526       */
2527      unsigned int max_push_components = 16 * 8;
2528      if (subgroup_id_index >= 0)
2529         max_push_components--; /* Save a slot for the thread ID */
2530
2531      /* We push small arrays, but no bigger than 16 floats.  This is big
2532       * enough for a vec4 but hopefully not large enough to push out other
2533       * stuff.  We should probably use a better heuristic at some point.
2534       */
2535      const unsigned int max_chunk_size = 16;
2536
2537      unsigned int num_push_constants = 0;
2538      unsigned int num_pull_constants = 0;
2539
2540      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2541      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2542
2543      /* Default to -1 meaning no location */
2544      memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
2545      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
2546
2547      int chunk_start = -1;
2548      struct cplx_align align;
2549      for (unsigned u = 0; u < uniforms; u++) {
2550         if (!slots[u].is_live) {
2551            assert(chunk_start == -1);
2552            continue;
2553         }
2554
2555         /* Skip subgroup_id_index to put it in the last push register. */
2556         if (subgroup_id_index == (int)u)
2557            continue;
2558
2559         if (chunk_start == -1) {
2560            chunk_start = u;
2561            align = slots[u].align;
2562         } else {
2563            /* Offset into the chunk */
2564            unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
2565
2566            /* Shift the slot alignment down by the chunk offset so it is
2567             * comparable with the base chunk alignment.
2568             */
2569            struct cplx_align slot_align = slots[u].align;
2570            slot_align.offset =
2571               (slot_align.offset - chunk_offset) & (align.mul - 1);
2572
2573            align = cplx_align_combine(align, slot_align);
2574         }
2575
2576         /* Sanity check the alignment */
2577         cplx_align_assert_sane(align);
2578
2579         if (slots[u].contiguous)
2580            continue;
2581
2582         /* Adjust the alignment to be in terms of slots, not bytes */
2583         assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
2584         assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
2585         align.mul /= UNIFORM_SLOT_SIZE;
2586         align.offset /= UNIFORM_SLOT_SIZE;
2587
2588         unsigned push_start_align = cplx_align_apply(align, num_push_constants);
2589         unsigned chunk_size = u - chunk_start + 1;
2590         if ((!compiler->supports_pull_constants && u < UBO_START) ||
2591             (chunk_size < max_chunk_size &&
2592              push_start_align + chunk_size <= max_push_components)) {
2593            /* Align up the number of push constants */
2594            num_push_constants = push_start_align;
2595            for (unsigned i = 0; i < chunk_size; i++)
2596               push_constant_loc[chunk_start + i] = num_push_constants++;
2597         } else {
2598            /* We need to pull this one */
2599            num_pull_constants = cplx_align_apply(align, num_pull_constants);
2600            for (unsigned i = 0; i < chunk_size; i++)
2601               pull_constant_loc[chunk_start + i] = num_pull_constants++;
2602         }
2603
2604         /* Reset the chunk and start again */
2605         chunk_start = -1;
2606      }
2607
2608      /* Add the CS local thread ID uniform at the end of the push constants */
2609      if (subgroup_id_index >= 0)
2610         push_constant_loc[subgroup_id_index] = num_push_constants++;
2611
2612      /* As the uniforms are going to be reordered, stash the old array and
2613       * create two new arrays for push/pull params.
2614       */
2615      uint32_t *param = stage_prog_data->param;
2616      stage_prog_data->nr_params = num_push_constants;
2617      if (num_push_constants) {
2618         stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
2619                                                num_push_constants);
2620      } else {
2621         stage_prog_data->param = NULL;
2622      }
2623      assert(stage_prog_data->nr_pull_params == 0);
2624      assert(stage_prog_data->pull_param == NULL);
2625      if (num_pull_constants > 0) {
2626         stage_prog_data->nr_pull_params = num_pull_constants;
2627         stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
2628                                                     num_pull_constants);
2629      }
2630
2631      /* Up until now, the param[] array has been indexed by reg + offset
2632       * of UNIFORM registers.  Move pull constants into pull_param[] and
2633       * condense param[] to only contain the uniforms we chose to push.
2634       *
2635       * NOTE: Because we are condensing the params[] array, we know that
2636       * push_constant_loc[i] <= i and we can do it in one smooth loop without
2637       * having to make a copy.
2638       */
2639      for (unsigned int i = 0; i < uniforms; i++) {
2640         uint32_t value = param[i];
2641         if (pull_constant_loc[i] != -1) {
2642            stage_prog_data->pull_param[pull_constant_loc[i]] = value;
2643         } else if (push_constant_loc[i] != -1) {
2644            stage_prog_data->param[push_constant_loc[i]] = value;
2645         }
2646      }
2647      ralloc_free(param);
2648   } else {
2649      /* If we don't want to compact anything, just set up dummy push/pull
2650       * arrays.  All the rest of the compiler cares about are these arrays.
2651       */
2652      push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2653      pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2654
2655      for (unsigned u = 0; u < uniforms; u++)
2656         push_constant_loc[u] = u;
2657
2658      memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
2659   }
2660
2661   /* Now that we know how many regular uniforms we'll push, reduce the
2662    * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
2663    */
2664   /* For gen4/5:
2665    * Only allow 16 registers (128 uniform components) as push constants.
2666    *
2667    * If changing this value, note the limitation about total_regs in
2668    * brw_curbe.c/crocus_state.c
2669    */
2670   const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;
2671   unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
2672   for (int i = 0; i < 4; i++) {
2673      struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
2674
2675      if (push_length + range->length > max_push_length)
2676         range->length = max_push_length - push_length;
2677
2678      push_length += range->length;
2679   }
2680   assert(push_length <= max_push_length);
2681}
2682
2683bool
2684fs_visitor::get_pull_locs(const fs_reg &src,
2685                          unsigned *out_surf_index,
2686                          unsigned *out_pull_index)
2687{
2688   assert(src.file == UNIFORM);
2689
2690   if (src.nr >= UBO_START) {
2691      const struct brw_ubo_range *range =
2692         &prog_data->ubo_ranges[src.nr - UBO_START];
2693
2694      /* If this access is in our (reduced) range, use the push data. */
2695      if (src.offset / 32 < range->length)
2696         return false;
2697
2698      *out_surf_index = prog_data->binding_table.ubo_start + range->block;
2699      *out_pull_index = (32 * range->start + src.offset) / 4;
2700
2701      prog_data->has_ubo_pull = true;
2702      return true;
2703   }
2704
2705   const unsigned location = src.nr + src.offset / 4;
2706
2707   if (location < uniforms && pull_constant_loc[location] != -1) {
2708      /* A regular uniform push constant */
2709      *out_surf_index = stage_prog_data->binding_table.pull_constants_start;
2710      *out_pull_index = pull_constant_loc[location];
2711
2712      prog_data->has_ubo_pull = true;
2713      return true;
2714   }
2715
2716   return false;
2717}
2718
2719/**
2720 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2721 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2722 */
2723void
2724fs_visitor::lower_constant_loads()
2725{
2726   unsigned index, pull_index;
2727
2728   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2729      /* Set up the annotation tracking for new generated instructions. */
2730      const fs_builder ibld(this, block, inst);
2731
2732      for (int i = 0; i < inst->sources; i++) {
2733	 if (inst->src[i].file != UNIFORM)
2734	    continue;
2735
2736         /* We'll handle this case later */
2737         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
2738            continue;
2739
2740         if (!get_pull_locs(inst->src[i], &index, &pull_index))
2741	    continue;
2742
2743         assert(inst->src[i].stride == 0);
2744
2745         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
2746         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
2747         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
2748         const unsigned base = pull_index * 4;
2749
2750         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
2751                   dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
2752
2753         /* Rewrite the instruction to use the temporary VGRF. */
2754         inst->src[i].file = VGRF;
2755         inst->src[i].nr = dst.nr;
2756         inst->src[i].offset = (base & (block_sz - 1)) +
2757                               inst->src[i].offset % 4;
2758      }
2759
2760      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
2761          inst->src[0].file == UNIFORM) {
2762
2763         if (!get_pull_locs(inst->src[0], &index, &pull_index))
2764            continue;
2765
2766         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
2767                                    brw_imm_ud(index),
2768                                    inst->src[1],
2769                                    pull_index * 4, 4);
2770         inst->remove(block);
2771      }
2772   }
2773   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
2774}
2775
/**
 * Per-instruction peephole pass.
 *
 * Walks every instruction and, depending on its opcode, either simplifies it
 * in place (typically into a MOV, NOT or ADD) or replaces it with a pair of
 * 32-bit instructions when a 64-bit MOV/SEL is found on a device that has
 * neither 64-bit float nor 64-bit integer support.
 *
 * Note that some cases leave the switch with `continue' rather than `break';
 * those skip the operand swap performed at the bottom of the loop.
 *
 * Returns true if any instruction was changed, in which case the affected
 * analyses have been invalidated.
 */
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         /* No 64-bit support at all: split the 64-bit MOV into two MOVs of
          * the UD halves (high half first) and drop the original.
          */
         if (!devinfo->has_64bit_float &&
             !devinfo->has_64bit_int &&
             (inst->dst.type == BRW_REGISTER_TYPE_DF ||
              inst->dst.type == BRW_REGISTER_TYPE_UQ ||
              inst->dst.type == BRW_REGISTER_TYPE_Q)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs);
            assert(!inst->src[0].negate);
            const brw::fs_builder ibld(this, block, inst);

            ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
                     subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1));
            ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
                     subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0));

            inst->remove(block);
            progress = true;
         }

         /* A MOV to the null register with a Z/NZ conditional mod only
          * tests the source against zero, so the abs/negate source modifiers
          * are dropped.
          */
         if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
              inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
             inst->dst.is_null() &&
             (inst->src[0].abs || inst->src[0].negate)) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            progress = true;
            break;
         }

         /* Everything below only applies to MOVs of an immediate. */
         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            /* Full mixed-type saturates don't happen.  However, we can end up
             * with things like:
             *
             *    mov.sat(8) g21<1>DF       -1F
             *
             * Other mixed-size-but-same-base-type cases may also be possible.
             */
            if (inst->dst.type != inst->src[0].type &&
                inst->dst.type != BRW_REGISTER_TYPE_DF &&
                inst->src[0].type != BRW_REGISTER_TYPE_F)
               assert(!"unimplemented: saturate mixed types");

            /* The saturate flag is only cleared when
             * brw_saturate_immediate() returns true for this immediate.
             */
            if (brw_saturate_immediate(inst->src[0].type,
                                       &inst->src[0].as_brw_reg())) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case BRW_OPCODE_MUL:
         /* `continue' (not `break'): moves on to the next instruction
          * without running the operand swap below.
          */
         if (inst->src[1].file != IMM)
            continue;

         /* Floating-point immediates are left alone; only integer-typed
          * multiplies reach the simplifications below.
          */
         if (brw_reg_type_is_floating_point(inst->src[1].type))
            break;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * -1.0 = -a */
         if (inst->src[1].is_negative_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         /* `continue' (not `break'): moves on to the next instruction
          * without running the operand swap below.
          */
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0 = a, for integer immediates only. */
         if (brw_reg_type_is_integer(inst->src[1].type) &&
             inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* Both operands are immediates: fold the sum into a single MOV.
          * Only float immediates are expected here (see the assert).
          */
         if (inst->src[0].file == IMM) {
            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].f += inst->src[1].f;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_OR:
         /* a | a = a and a | 0 = a */
         if (inst->src[0].equals(inst->src[1]) ||
             inst->src[1].is_zero()) {
            /* On Gfx8+, the OR instruction can have a source modifier that
             * performs logical not on the operand.  Cases of 'OR r0, ~r1, 0'
             * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
             */
            if (inst->src[0].negate) {
               inst->opcode = BRW_OPCODE_NOT;
               inst->src[0].negate = false;
            } else {
               inst->opcode = BRW_OPCODE_MOV;
            }
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_CMP:
         /* An equality / inequality comparison against zero: the abs/negate
          * modifiers on the first source are dropped.
          */
         if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
              inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
             inst->src[1].is_zero() &&
             (inst->src[0].abs || inst->src[0].negate)) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_SEL:
         /* No 64-bit support at all: split the 64-bit SEL into two SELs on
          * the UD halves, each carrying the original predicate, and drop the
          * original.
          */
         if (!devinfo->has_64bit_float &&
             !devinfo->has_64bit_int &&
             (inst->dst.type == BRW_REGISTER_TYPE_DF ||
              inst->dst.type == BRW_REGISTER_TYPE_UQ ||
              inst->dst.type == BRW_REGISTER_TYPE_Q)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs && !inst->src[0].negate);
            assert(!inst->src[1].abs && !inst->src[1].negate);
            const brw::fs_builder ibld(this, block, inst);

            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
                                   subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
                                   subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)));
            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
                                   subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
                                   subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1)));

            inst->remove(block);
            progress = true;
         }
         /* Selecting between two identical sources is just a MOV, and no
          * longer needs a predicate.
          */
         if (inst->src[0].equals(inst->src[1])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            inst->predicate = BRW_PREDICATE_NONE;
            inst->predicate_inverse = false;
            progress = true;
         } else if (inst->saturate && inst->src[1].file == IMM) {
            /* Saturating SEL with a float immediate: with an L/LE mod and an
             * immediate >= 1.0, or a G/GE mod and an immediate <= 0.0, the
             * instruction is turned into a plain (still saturating) MOV of
             * the first source.
             */
            switch (inst->conditional_mod) {
            case BRW_CONDITIONAL_LE:
            case BRW_CONDITIONAL_L:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].f >= 1.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               break;
            case BRW_CONDITIONAL_GE:
            case BRW_CONDITIONAL_G:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].f <= 0.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               /* No break here: control falls into `default', which does
                * nothing but break.
                */
            default:
               break;
            }
         }
         break;
      case BRW_OPCODE_MAD:
         /* Only all-float MADs are considered. */
         if (inst->src[0].type != BRW_REGISTER_TYPE_F ||
             inst->src[1].type != BRW_REGISTER_TYPE_F ||
             inst->src[2].type != BRW_REGISTER_TYPE_F)
            break;
         /* When src[1] or src[2] is one, the MAD becomes an ADD of src[0]
          * and the other of those two sources.
          */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->src[1] = inst->src[2];
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[2].is_one()) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->src[2] = reg_undef;
            progress = true;
         }
         break;
      case SHADER_OPCODE_BROADCAST:
         /* Broadcasting a uniform value, or a channel selected by an
          * immediate index, is a MOV with force_writemask_all set.
          */
         if (is_uniform(inst->src[0])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         } else if (inst->src[1].file == IMM) {
            inst->opcode = BRW_OPCODE_MOV;
            /* It's possible that the selected component will be too large and
             * overflow the register.  This can happen if someone does a
             * readInvocation() from GLSL or SPIR-V and provides an OOB
             * invocationIndex.  If this happens and we some how manage
             * to constant fold it in and get here, then component() may cause
             * us to start reading outside of the VGRF which will lead to an
             * assert later.  Instead, just let it wrap around if it goes over
             * exec_size.
             */
            const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
            inst->src[0] = component(inst->src[0], comp);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      case SHADER_OPCODE_SHUFFLE:
         /* Same idea as BROADCAST, but force_writemask_all is left alone and
          * the immediate index is used as-is (no wrap-around).
          */
         if (is_uniform(inst->src[0])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            progress = true;
         } else if (inst->src[1].file == IMM) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = component(inst->src[0],
                                     inst->src[1].ud);
            inst->sources = 1;
            progress = true;
         }
         break;

      default:
	 break;
      }

      /* Swap if src[0] is immediate. */
      /* NOTE: `progress' is cumulative over the whole pass, so once any
       * earlier instruction made progress this check runs for every later
       * commutative instruction that reaches this point.
       */
      if (progress && inst->is_commutative()) {
         if (inst->src[0].file == IMM) {
            fs_reg tmp = inst->src[1];
            inst->src[1] = inst->src[0];
            inst->src[0] = tmp;
         }
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                          DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}
3054
3055/**
3056 * Optimize sample messages that have constant zero values for the trailing
3057 * texture coordinates. We can just reduce the message length for these
3058 * instructions instead of reserving a register for it. Trailing parameters
3059 * that aren't sent default to zero anyway. This will cause the dead code
3060 * eliminator to remove the MOV instruction that would otherwise be emitted to
3061 * set up the zero value.
3062 */
3063bool
3064fs_visitor::opt_zero_samples()
3065{
3066   /* Gfx4 infers the texturing opcode based on the message length so we can't
3067    * change it.  Gfx12.5 has restrictions on the number of coordinate
3068    * parameters that have to be provided for some texture types
3069    * (Wa_14013363432).
3070    */
3071   if (devinfo->ver < 5 || devinfo->verx10 == 125)
3072      return false;
3073
3074   bool progress = false;
3075
3076   foreach_block_and_inst(block, fs_inst, inst, cfg) {
3077      if (!inst->is_tex())
3078         continue;
3079
3080      fs_inst *load_payload = (fs_inst *) inst->prev;
3081
3082      if (load_payload->is_head_sentinel() ||
3083          load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3084         continue;
3085
3086      /* We don't want to remove the message header or the first parameter.
3087       * Removing the first parameter is not allowed, see the Haswell PRM
3088       * volume 7, page 149:
3089       *
3090       *     "Parameter 0 is required except for the sampleinfo message, which
3091       *      has no parameter 0"
3092       */
3093      while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
3094             load_payload->src[(inst->mlen - inst->header_size) /
3095                               (inst->exec_size / 8) +
3096                               inst->header_size - 1].is_zero()) {
3097         inst->mlen -= inst->exec_size / 8;
3098         progress = true;
3099      }
3100   }
3101
3102   if (progress)
3103      invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
3104
3105   return progress;
3106}
3107
3108bool
3109fs_visitor::opt_register_renaming()
3110{
3111   bool progress = false;
3112   int depth = 0;
3113
3114   unsigned remap[alloc.count];
3115   memset(remap, ~0u, sizeof(unsigned) * alloc.count);
3116
3117   foreach_block_and_inst(block, fs_inst, inst, cfg) {
3118      if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
3119         depth++;
3120      } else if (inst->opcode == BRW_OPCODE_ENDIF ||
3121                 inst->opcode == BRW_OPCODE_WHILE) {
3122         depth--;
3123      }
3124
3125      /* Rewrite instruction sources. */
3126      for (int i = 0; i < inst->sources; i++) {
3127         if (inst->src[i].file == VGRF &&
3128             remap[inst->src[i].nr] != ~0u &&
3129             remap[inst->src[i].nr] != inst->src[i].nr) {
3130            inst->src[i].nr = remap[inst->src[i].nr];
3131            progress = true;
3132         }
3133      }
3134
3135      const unsigned dst = inst->dst.nr;
3136
3137      if (depth == 0 &&
3138          inst->dst.file == VGRF &&
3139          alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
3140          !inst->is_partial_write()) {
3141         if (remap[dst] == ~0u) {
3142            remap[dst] = dst;
3143         } else {
3144            remap[dst] = alloc.allocate(regs_written(inst));
3145            inst->dst.nr = remap[dst];
3146            progress = true;
3147         }
3148      } else if (inst->dst.file == VGRF &&
3149                 remap[dst] != ~0u &&
3150                 remap[dst] != dst) {
3151         inst->dst.nr = remap[dst];
3152         progress = true;
3153      }
3154   }
3155
3156   if (progress) {
3157      invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
3158                          DEPENDENCY_VARIABLES);
3159
3160      for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
3161         if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != ~0u) {
3162            delta_xy[i].nr = remap[delta_xy[i].nr];
3163         }
3164      }
3165   }
3166
3167   return progress;
3168}
3169
3170/**
3171 * Remove redundant or useless halts.
3172 *
3173 * For example, we can eliminate halts in the following sequence:
3174 *
3175 * halt        (redundant with the next halt)
3176 * halt        (useless; jumps to the next instruction)
3177 * halt-target
3178 */
3179bool
3180fs_visitor::opt_redundant_halt()
3181{
3182   bool progress = false;
3183
3184   unsigned halt_count = 0;
3185   fs_inst *halt_target = NULL;
3186   bblock_t *halt_target_block = NULL;
3187   foreach_block_and_inst(block, fs_inst, inst, cfg) {
3188      if (inst->opcode == BRW_OPCODE_HALT)
3189         halt_count++;
3190
3191      if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
3192         halt_target = inst;
3193         halt_target_block = block;
3194         break;
3195      }
3196   }
3197
3198   if (!halt_target) {
3199      assert(halt_count == 0);
3200      return false;
3201   }
3202
3203   /* Delete any HALTs immediately before the halt target. */
3204   for (fs_inst *prev = (fs_inst *) halt_target->prev;
3205        !prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
3206        prev = (fs_inst *) halt_target->prev) {
3207      prev->remove(halt_target_block);
3208      halt_count--;
3209      progress = true;
3210   }
3211
3212   if (halt_count == 0) {
3213      halt_target->remove(halt_target_block);
3214      progress = true;
3215   }
3216
3217   if (progress)
3218      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3219
3220   return progress;
3221}
3222
3223/**
3224 * Compute a bitmask with GRF granularity with a bit set for each GRF starting
3225 * from \p r.offset which overlaps the region starting at \p s.offset and
3226 * spanning \p ds bytes.
3227 */
3228static inline unsigned
3229mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
3230{
3231   const int rel_offset = reg_offset(s) - reg_offset(r);
3232   const int shift = rel_offset / REG_SIZE;
3233   const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
3234   assert(reg_space(r) == reg_space(s) &&
3235          shift >= 0 && shift < int(8 * sizeof(unsigned)));
3236   return ((1 << n) - 1) << shift;
3237}
3238
/**
 * Try to get rid of GRF-to-MRF MOVs by making the instructions that produced
 * the GRF value write the MRF directly.
 *
 * For every candidate MOV, a first backward scan within the block checks
 * that all GRFs read by the MOV have generating instructions that can be
 * retargeted; a second backward scan then rewrites their destinations and
 * the MOV is removed.
 *
 * Does nothing on devinfo->ver >= 7.  Returns true if any MOV was removed.
 */
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   /* No MRFs on Gen >= 7. */
   if (devinfo->ver >= 7)
      return false;

   const fs_live_variables &live = live_analysis.require();

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      /* Running instruction index, compared against live.vgrf_end[] below. */
      int ip = next_ip;
      next_ip++;

      /* Only consider a complete, same-type, modifier-free MOV from a
       * contiguous, register-aligned VGRF region into an MRF.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->is_partial_write() ||
	  inst->dst.file != MRF || inst->src[0].file != VGRF ||
	  inst->dst.type != inst->src[0].type ||
	  inst->src[0].abs || inst->src[0].negate ||
          !inst->src[0].is_contiguous() ||
          inst->src[0].offset % REG_SIZE != 0)
	 continue;

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (live.vgrf_end[inst->src[0].nr] > ip)
	 continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go rewrite the
       * things that computed the value of all GRFs of the source region.  The
       * regs_left bitset keeps track of the registers we haven't yet found a
       * generating instruction for.
       */
      unsigned regs_left = (1 << regs_read(inst, 0)) - 1;

      /* First pass: look for the generating instructions without changing
       * anything.  Every early `break' that leaves regs_left non-zero makes
       * us give up on this MOV.
       */
      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
	    /* Found the last thing to write our reg we want to turn
	     * into a compute-to-MRF.
	     */

	    /* If this one instruction didn't populate all the
	     * channels, bail.  We might be able to rewrite everything
	     * that writes that reg, but it would require smarter
	     * tracking.
	     */
	    if (scan_inst->is_partial_write())
	       break;

            /* Handling things not fully contained in the source of the copy
             * would need us to understand coalescing out more than one MOV at
             * a time.
             */
            if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
                                     inst->src[0], inst->size_read(0)))
               break;

	    /* SEND instructions can't have MRF as a destination. */
	    if (scan_inst->mlen)
	       break;

	    if (devinfo->ver == 6) {
	       /* gfx6 math instructions must have the destination be
		* GRF, so no compute-to-MRF for them.
		*/
	       if (scan_inst->is_math()) {
		  break;
	       }
	    }

            /* Clear the bits for any registers this instruction overwrites. */
            regs_left &= ~mask_relative_to(
               inst->src[0], scan_inst->dst, scan_inst->size_written);
            if (!regs_left)
               break;
	 }

	 /* We don't handle control flow here.  Most computation of
	  * values that end up in MRFs are shortly before the MRF
	  * write anyway.
	  */
	 if (block->start() == scan_inst)
	    break;

	 /* You can't read from an MRF, so if someone else reads our
	  * MRF's source GRF that we wanted to rewrite, that stops us.
	  */
	 bool interfered = false;
	 for (int i = 0; i < scan_inst->sources; i++) {
            if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
                                inst->src[0], inst->size_read(0))) {
	       interfered = true;
	    }
	 }
	 if (interfered)
	    break;

         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->dst, inst->size_written)) {
	    /* If somebody else writes our MRF here, we can't
	     * compute-to-MRF before that.
	     */
            break;
         }

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
             regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
                             inst->dst, inst->size_written)) {
	    /* Found a SEND instruction, which means that there are
	     * live values in MRFs from base_mrf to base_mrf +
	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
	     * above it.
	     */
            break;
         }
      }

      /* Some GRF of the source region has no usable generating instruction. */
      if (regs_left)
         continue;

      /* Found all generating instructions of our MRF's source value, so it
       * should be safe to rewrite them to point to the MRF directly.
       */
      regs_left = (1 << regs_read(inst, 0)) - 1;

      /* Second pass: revisit the same instructions and retarget them. */
      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
            /* Clear the bits for any registers this instruction overwrites. */
            regs_left &= ~mask_relative_to(
               inst->src[0], scan_inst->dst, scan_inst->size_written);

            /* Byte offset of this write relative to the start of the MOV's
             * source region.
             */
            const unsigned rel_offset = reg_offset(scan_inst->dst) -
                                        reg_offset(inst->src[0]);

            if (inst->dst.nr & BRW_MRF_COMPR4) {
               /* Apply the same address transformation done by the hardware
                * for COMPR4 MRF writes.
                */
               assert(rel_offset < 2 * REG_SIZE);
               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;

               /* Clear the COMPR4 bit if the generating instruction is not
                * compressed.
                */
               if (scan_inst->size_written < 2 * REG_SIZE)
                  scan_inst->dst.nr &= ~BRW_MRF_COMPR4;

            } else {
               /* Calculate the MRF number the result of this instruction is
                * ultimately written to.
                */
               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
            }

            scan_inst->dst.file = MRF;
            scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
            /* The removed MOV's saturate is carried over to the producer. */
            scan_inst->saturate |= inst->saturate;
            if (!regs_left)
               break;
         }
      }

      assert(!regs_left);
      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}
3416
3417/**
3418 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
3419 * flow.  We could probably do better here with some form of divergence
3420 * analysis.
3421 */
3422bool
3423fs_visitor::eliminate_find_live_channel()
3424{
3425   bool progress = false;
3426   unsigned depth = 0;
3427
3428   if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
3429      /* The optimization below assumes that channel zero is live on thread
3430       * dispatch, which may not be the case if the fixed function dispatches
3431       * threads sparsely.
3432       */
3433      return false;
3434   }
3435
3436   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
3437      switch (inst->opcode) {
3438      case BRW_OPCODE_IF:
3439      case BRW_OPCODE_DO:
3440         depth++;
3441         break;
3442
3443      case BRW_OPCODE_ENDIF:
3444      case BRW_OPCODE_WHILE:
3445         depth--;
3446         break;
3447
3448      case BRW_OPCODE_HALT:
3449         /* This can potentially make control flow non-uniform until the end
3450          * of the program.
3451          */
3452         return progress;
3453
3454      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
3455         if (depth == 0) {
3456            inst->opcode = BRW_OPCODE_MOV;
3457            inst->src[0] = brw_imm_ud(0u);
3458            inst->sources = 1;
3459            inst->force_writemask_all = true;
3460            progress = true;
3461         }
3462         break;
3463
3464      default:
3465         break;
3466      }
3467   }
3468
3469   if (progress)
3470      invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
3471
3472   return progress;
3473}
3474
3475/**
3476 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
3477 * instructions to FS_OPCODE_REP_FB_WRITE.
3478 */
3479void
3480fs_visitor::emit_repclear_shader()
3481{
3482   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3483   int base_mrf = 0;
3484   int color_mrf = base_mrf + 2;
3485   fs_inst *mov;
3486
3487   if (uniforms > 0) {
3488      mov = bld.exec_all().group(4, 0)
3489               .MOV(brw_message_reg(color_mrf),
3490                    fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
3491   } else {
3492      struct brw_reg reg =
3493         brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_UD,
3494                 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
3495                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
3496
3497      mov = bld.exec_all().group(4, 0)
3498               .MOV(brw_uvec_mrf(4, color_mrf, 0), fs_reg(reg));
3499   }
3500
3501   fs_inst *write = NULL;
3502   if (key->nr_color_regions == 1) {
3503      write = bld.emit(FS_OPCODE_REP_FB_WRITE);
3504      write->saturate = key->clamp_fragment_color;
3505      write->base_mrf = color_mrf;
3506      write->target = 0;
3507      write->header_size = 0;
3508      write->mlen = 1;
3509   } else {
3510      assume(key->nr_color_regions > 0);
3511
3512      struct brw_reg header =
3513         retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD);
3514      bld.exec_all().group(16, 0)
3515         .MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
3516
3517      for (int i = 0; i < key->nr_color_regions; ++i) {
3518         if (i > 0) {
3519            bld.exec_all().group(1, 0)
3520               .MOV(component(header, 2), brw_imm_ud(i));
3521         }
3522
3523         write = bld.emit(FS_OPCODE_REP_FB_WRITE);
3524         write->saturate = key->clamp_fragment_color;
3525         write->base_mrf = base_mrf;
3526         write->target = i;
3527         write->header_size = 2;
3528         write->mlen = 3;
3529      }
3530   }
3531   write->eot = true;
3532   write->last_rt = true;
3533
3534   calculate_cfg();
3535
3536   assign_constant_locations();
3537   assign_curb_setup();
3538
3539   /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
3540   if (uniforms > 0) {
3541      assert(mov->src[0].file == FIXED_GRF);
3542      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
3543   }
3544
3545   lower_scoreboard();
3546}
3547
3548/**
3549 * Walks through basic blocks, looking for repeated MRF writes and
3550 * removing the later ones.
3551 */
3552bool
3553fs_visitor::remove_duplicate_mrf_writes()
3554{
3555   fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->ver)];
3556   bool progress = false;
3557
3558   /* Need to update the MRF tracking for compressed instructions. */
3559   if (dispatch_width >= 16)
3560      return false;
3561
3562   memset(last_mrf_move, 0, sizeof(last_mrf_move));
3563
3564   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3565      if (inst->is_control_flow()) {
3566	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3567      }
3568
3569      if (inst->opcode == BRW_OPCODE_MOV &&
3570	  inst->dst.file == MRF) {
3571         fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
3572	 if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV &&
3573             inst->dst.equals(prev_inst->dst) &&
3574             inst->src[0].equals(prev_inst->src[0]) &&
3575             inst->saturate == prev_inst->saturate &&
3576             inst->predicate == prev_inst->predicate &&
3577             inst->conditional_mod == prev_inst->conditional_mod &&
3578             inst->exec_size == prev_inst->exec_size) {
3579	    inst->remove(block);
3580	    progress = true;
3581	    continue;
3582	 }
3583      }
3584
3585      /* Clear out the last-write records for MRFs that were overwritten. */
3586      if (inst->dst.file == MRF) {
3587         last_mrf_move[inst->dst.nr] = NULL;
3588      }
3589
3590      if (inst->mlen > 0 && inst->base_mrf != -1) {
3591	 /* Found a SEND instruction, which will include two or fewer
3592	  * implied MRF writes.  We could do better here.
3593	  */
3594	 for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
3595	    last_mrf_move[inst->base_mrf + i] = NULL;
3596	 }
3597      }
3598
3599      /* Clear out any MRF move records whose sources got overwritten. */
3600      for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
3601         if (last_mrf_move[i] &&
3602             regions_overlap(inst->dst, inst->size_written,
3603                             last_mrf_move[i]->src[0],
3604                             last_mrf_move[i]->size_read(0))) {
3605            last_mrf_move[i] = NULL;
3606         }
3607      }
3608
3609      if (inst->opcode == BRW_OPCODE_MOV &&
3610	  inst->dst.file == MRF &&
3611	  inst->src[0].file != ARF &&
3612	  !inst->is_partial_write()) {
3613         last_mrf_move[inst->dst.nr] = inst;
3614      }
3615   }
3616
3617   if (progress)
3618      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3619
3620   return progress;
3621}
3622
3623/**
3624 * Rounding modes for conversion instructions are included for each
3625 * conversion, but right now it is a state. So once it is set,
3626 * we don't need to call it again for subsequent calls.
3627 *
3628 * This is useful for vector/matrices conversions, as setting the
3629 * mode once is enough for the full vector/matrix
3630 */
3631bool
3632fs_visitor::remove_extra_rounding_modes()
3633{
3634   bool progress = false;
3635   unsigned execution_mode = this->nir->info.float_controls_execution_mode;
3636
3637   brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
3638   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
3639        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
3640        FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
3641       execution_mode)
3642      base_mode = BRW_RND_MODE_RTNE;
3643   if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
3644        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
3645        FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
3646       execution_mode)
3647      base_mode = BRW_RND_MODE_RTZ;
3648
3649   foreach_block (block, cfg) {
3650      brw_rnd_mode prev_mode = base_mode;
3651
3652      foreach_inst_in_block_safe (fs_inst, inst, block) {
3653         if (inst->opcode == SHADER_OPCODE_RND_MODE) {
3654            assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
3655            const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
3656            if (mode == prev_mode) {
3657               inst->remove(block);
3658               progress = true;
3659            } else {
3660               prev_mode = mode;
3661            }
3662         }
3663      }
3664   }
3665
3666   if (progress)
3667      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3668
3669   return progress;
3670}
3671
3672static void
3673clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
3674{
3675   /* Clear the flag for registers that actually got read (as expected). */
3676   for (int i = 0; i < inst->sources; i++) {
3677      int grf;
3678      if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
3679         grf = inst->src[i].nr;
3680      } else {
3681         continue;
3682      }
3683
3684      if (grf >= first_grf &&
3685          grf < first_grf + grf_len) {
3686         deps[grf - first_grf] = false;
3687         if (inst->exec_size == 16)
3688            deps[grf - first_grf + 1] = false;
3689      }
3690   }
3691}
3692
3693/**
3694 * Implements this workaround for the original 965:
3695 *
3696 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
3697 *      check for post destination dependencies on this instruction, software
3698 *      must ensure that there is no destination hazard for the case of ‘write
3699 *      followed by a posted write’ shown in the following example.
3700 *
3701 *      1. mov r3 0
3702 *      2. send r3.xy <rest of send instruction>
3703 *      3. mov r2 r3
3704 *
3705 *      Due to no post-destination dependency check on the ‘send’, the above
3706 *      code sequence could have two instructions (1 and 2) in flight at the
3707 *      same time that both consider ‘r3’ as the target of their final writes.
3708 */
3709void
3710fs_visitor::insert_gfx4_pre_send_dependency_workarounds(bblock_t *block,
3711                                                        fs_inst *inst)
3712{
3713   int write_len = regs_written(inst);
3714   int first_write_grf = inst->dst.nr;
3715   bool needs_dep[BRW_MAX_MRF(devinfo->ver)];
3716   assert(write_len < (int)sizeof(needs_dep) - 1);
3717
3718   memset(needs_dep, false, sizeof(needs_dep));
3719   memset(needs_dep, true, write_len);
3720
3721   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
3722
3723   /* Walk backwards looking for writes to registers we're writing which
3724    * aren't read since being written.  If we hit the start of the program,
3725    * we assume that there are no outstanding dependencies on entry to the
3726    * program.
3727    */
3728   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
3729      /* If we hit control flow, assume that there *are* outstanding
3730       * dependencies, and force their cleanup before our instruction.
3731       */
3732      if (block->start() == scan_inst && block->num != 0) {
3733         for (int i = 0; i < write_len; i++) {
3734            if (needs_dep[i])
3735               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
3736                               first_write_grf + i);
3737         }
3738         return;
3739      }
3740
3741      /* We insert our reads as late as possible on the assumption that any
3742       * instruction but a MOV that might have left us an outstanding
3743       * dependency has more latency than a MOV.
3744       */
3745      if (scan_inst->dst.file == VGRF) {
3746         for (unsigned i = 0; i < regs_written(scan_inst); i++) {
3747            int reg = scan_inst->dst.nr + i;
3748
3749            if (reg >= first_write_grf &&
3750                reg < first_write_grf + write_len &&
3751                needs_dep[reg - first_write_grf]) {
3752               DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
3753               needs_dep[reg - first_write_grf] = false;
3754               if (scan_inst->exec_size == 16)
3755                  needs_dep[reg - first_write_grf + 1] = false;
3756            }
3757         }
3758      }
3759
3760      /* Clear the flag for registers that actually got read (as expected). */
3761      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3762
3763      /* Continue the loop only if we haven't resolved all the dependencies */
3764      int i;
3765      for (i = 0; i < write_len; i++) {
3766         if (needs_dep[i])
3767            break;
3768      }
3769      if (i == write_len)
3770         return;
3771   }
3772}
3773
3774/**
3775 * Implements this workaround for the original 965:
3776 *
3777 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
3778 *      used as a destination register until after it has been sourced by an
3779 *      instruction with a different destination register.
3780 */
3781void
3782fs_visitor::insert_gfx4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
3783{
3784   int write_len = regs_written(inst);
3785   unsigned first_write_grf = inst->dst.nr;
3786   bool needs_dep[BRW_MAX_MRF(devinfo->ver)];
3787   assert(write_len < (int)sizeof(needs_dep) - 1);
3788
3789   memset(needs_dep, false, sizeof(needs_dep));
3790   memset(needs_dep, true, write_len);
3791   /* Walk forwards looking for writes to registers we're writing which aren't
3792    * read before being written.
3793    */
3794   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
3795      /* If we hit control flow, force resolve all remaining dependencies. */
3796      if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
3797         for (int i = 0; i < write_len; i++) {
3798            if (needs_dep[i])
3799               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3800                               first_write_grf + i);
3801         }
3802         return;
3803      }
3804
3805      /* Clear the flag for registers that actually got read (as expected). */
3806      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
3807
3808      /* We insert our reads as late as possible since they're reading the
3809       * result of a SEND, which has massive latency.
3810       */
3811      if (scan_inst->dst.file == VGRF &&
3812          scan_inst->dst.nr >= first_write_grf &&
3813          scan_inst->dst.nr < first_write_grf + write_len &&
3814          needs_dep[scan_inst->dst.nr - first_write_grf]) {
3815         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
3816                         scan_inst->dst.nr);
3817         needs_dep[scan_inst->dst.nr - first_write_grf] = false;
3818      }
3819
3820      /* Continue the loop only if we haven't resolved all the dependencies */
3821      int i;
3822      for (i = 0; i < write_len; i++) {
3823         if (needs_dep[i])
3824            break;
3825      }
3826      if (i == write_len)
3827         return;
3828   }
3829}
3830
3831void
3832fs_visitor::insert_gfx4_send_dependency_workarounds()
3833{
3834   if (devinfo->ver != 4 || devinfo->is_g4x)
3835      return;
3836
3837   bool progress = false;
3838
3839   foreach_block_and_inst(block, fs_inst, inst, cfg) {
3840      if (inst->mlen != 0 && inst->dst.file == VGRF) {
3841         insert_gfx4_pre_send_dependency_workarounds(block, inst);
3842         insert_gfx4_post_send_dependency_workarounds(block, inst);
3843         progress = true;
3844      }
3845   }
3846
3847   if (progress)
3848      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
3849}
3850
3851/**
3852 * Turns the generic expression-style uniform pull constant load instruction
3853 * into a hardware-specific series of instructions for loading a pull
3854 * constant.
3855 *
3856 * The expression style allows the CSE pass before this to optimize out
3857 * repeated loads from the same offset, and gives the pre-register-allocation
3858 * scheduling full flexibility, while the conversion to native instructions
3859 * allows the post-register-allocation scheduler the best information
3860 * possible.
3861 *
3862 * Note that execution masking for setting up pull constant loads is special:
3863 * the channels that need to be written are unrelated to the current execution
3864 * mask, since a later instruction will use one of the result channels as a
3865 * source operand for all 8 or 16 of its channels.
3866 */
3867void
3868fs_visitor::lower_uniform_pull_constant_loads()
3869{
3870   foreach_block_and_inst (block, fs_inst, inst, cfg) {
3871      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3872         continue;
3873
3874      const fs_reg& surface = inst->src[0];
3875      const fs_reg& offset_B = inst->src[1];
3876      assert(offset_B.file == IMM);
3877
3878      if (devinfo->has_lsc) {
3879         const fs_builder ubld =
3880            fs_builder(this, block, inst).group(8, 0).exec_all();
3881
3882         const fs_reg payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
3883         ubld.MOV(payload, offset_B);
3884
3885         inst->sfid = GFX12_SFID_UGM;
3886         inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
3887                                   1 /* simd_size */,
3888                                   LSC_ADDR_SURFTYPE_BTI,
3889                                   LSC_ADDR_SIZE_A32,
3890                                   1 /* num_coordinates */,
3891                                   LSC_DATA_SIZE_D32,
3892                                   inst->size_written / 4,
3893                                   true /* transpose */,
3894                                   LSC_CACHE_LOAD_L1STATE_L3MOCS,
3895                                   true /* has_dest */);
3896
3897         fs_reg ex_desc;
3898         if (surface.file == IMM) {
3899            ex_desc = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
3900         } else {
3901            /* We only need the first component for the payload so we can use
3902             * one of the other components for the extended descriptor
3903             */
3904            ex_desc = component(payload, 1);
3905            ubld.group(1, 0).SHL(ex_desc, surface, brw_imm_ud(24));
3906         }
3907
3908         /* Update the original instruction. */
3909         inst->opcode = SHADER_OPCODE_SEND;
3910         inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
3911         inst->ex_mlen = 0;
3912         inst->header_size = 0;
3913         inst->send_has_side_effects = false;
3914         inst->send_is_volatile = true;
3915         inst->exec_size = 1;
3916
3917         /* Finally, the payload */
3918         inst->resize_sources(3);
3919         inst->src[0] = brw_imm_ud(0); /* desc */
3920         inst->src[1] = ex_desc;
3921         inst->src[2] = payload;
3922
3923         invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
3924      } else if (devinfo->ver >= 7) {
3925         const fs_builder ubld = fs_builder(this, block, inst).exec_all();
3926         const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
3927
3928         ubld.group(8, 0).MOV(payload,
3929                              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
3930         ubld.group(1, 0).MOV(component(payload, 2),
3931                              brw_imm_ud(offset_B.ud / 16));
3932
3933         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7;
3934         inst->src[1] = payload;
3935         inst->header_size = 1;
3936         inst->mlen = 1;
3937
3938         invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
3939      } else {
3940         /* Before register allocation, we didn't tell the scheduler about the
3941          * MRF we use.  We know it's safe to use this MRF because nothing
3942          * else does except for register spill/unspill, which generates and
3943          * uses its MRF within a single IR instruction.
3944          */
3945         inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
3946         inst->mlen = 1;
3947      }
3948   }
3949}
3950
3951bool
3952fs_visitor::lower_load_payload()
3953{
3954   bool progress = false;
3955
3956   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3957      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
3958         continue;
3959
3960      assert(inst->dst.file == MRF || inst->dst.file == VGRF);
3961      assert(inst->saturate == false);
3962      fs_reg dst = inst->dst;
3963
3964      /* Get rid of COMPR4.  We'll add it back in if we need it */
3965      if (dst.file == MRF)
3966         dst.nr = dst.nr & ~BRW_MRF_COMPR4;
3967
3968      const fs_builder ibld(this, block, inst);
3969      const fs_builder ubld = ibld.exec_all();
3970
3971      for (uint8_t i = 0; i < inst->header_size;) {
3972         /* Number of header GRFs to initialize at once with a single MOV
3973          * instruction.
3974          */
3975         const unsigned n =
3976            (i + 1 < inst->header_size && inst->src[i].stride == 1 &&
3977             inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
3978            2 : 1;
3979
3980         if (inst->src[i].file != BAD_FILE)
3981            ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD),
3982                                     retype(inst->src[i], BRW_REGISTER_TYPE_UD));
3983
3984         dst = byte_offset(dst, n * REG_SIZE);
3985         i += n;
3986      }
3987
3988      if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
3989          inst->exec_size > 8) {
3990         /* In this case, the payload portion of the LOAD_PAYLOAD isn't
3991          * a straightforward copy.  Instead, the result of the
3992          * LOAD_PAYLOAD is treated as interleaved and the first four
3993          * non-header sources are unpacked as:
3994          *
3995          * m + 0: r0
3996          * m + 1: g0
3997          * m + 2: b0
3998          * m + 3: a0
3999          * m + 4: r1
4000          * m + 5: g1
4001          * m + 6: b1
4002          * m + 7: a1
4003          *
4004          * This is used for gen <= 5 fb writes.
4005          */
4006         assert(inst->exec_size == 16);
4007         assert(inst->header_size + 4 <= inst->sources);
4008         for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
4009            if (inst->src[i].file != BAD_FILE) {
4010               if (devinfo->has_compr4) {
4011                  fs_reg compr4_dst = retype(dst, inst->src[i].type);
4012                  compr4_dst.nr |= BRW_MRF_COMPR4;
4013                  ibld.MOV(compr4_dst, inst->src[i]);
4014               } else {
4015                  /* Platform doesn't have COMPR4.  We have to fake it */
4016                  fs_reg mov_dst = retype(dst, inst->src[i].type);
4017                  ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
4018                  mov_dst.nr += 4;
4019                  ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
4020               }
4021            }
4022
4023            dst.nr++;
4024         }
4025
4026         /* The loop above only ever incremented us through the first set
4027          * of 4 registers.  However, thanks to the magic of COMPR4, we
4028          * actually wrote to the first 8 registers, so we need to take
4029          * that into account now.
4030          */
4031         dst.nr += 4;
4032
4033         /* The COMPR4 code took care of the first 4 sources.  We'll let
4034          * the regular path handle any remaining sources.  Yes, we are
4035          * modifying the instruction but we're about to delete it so
4036          * this really doesn't hurt anything.
4037          */
4038         inst->header_size += 4;
4039      }
4040
4041      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
4042         if (inst->src[i].file != BAD_FILE) {
4043            dst.type = inst->src[i].type;
4044            ibld.MOV(dst, inst->src[i]);
4045         } else {
4046            dst.type = BRW_REGISTER_TYPE_UD;
4047         }
4048         dst = offset(dst, ibld, 1);
4049      }
4050
4051      inst->remove(block);
4052      progress = true;
4053   }
4054
4055   if (progress)
4056      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
4057
4058   return progress;
4059}
4060
4061void
4062fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
4063{
4064   const fs_builder ibld(this, block, inst);
4065
4066   const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD);
4067   if (inst->src[1].file == IMM &&
4068       (( ud && inst->src[1].ud <= UINT16_MAX) ||
4069        (!ud && inst->src[1].d <= INT16_MAX && inst->src[1].d >= INT16_MIN))) {
4070      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
4071       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
4072       * src1 are used.
4073       *
4074       * If multiplying by an immediate value that fits in 16-bits, do a
4075       * single MUL instruction with that value in the proper location.
4076       */
4077      if (devinfo->ver < 7) {
4078         fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
4079         ibld.MOV(imm, inst->src[1]);
4080         ibld.MUL(inst->dst, imm, inst->src[0]);
4081      } else {
4082         ibld.MUL(inst->dst, inst->src[0],
4083                  ud ? brw_imm_uw(inst->src[1].ud)
4084                     : brw_imm_w(inst->src[1].d));
4085      }
4086   } else {
4087      /* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
4088       * do 32-bit integer multiplication in one instruction, but instead
4089       * must do a sequence (which actually calculates a 64-bit result):
4090       *
4091       *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
4092       *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
4093       *    mov(8)  g2<1>D     acc0<8,8,1>D
4094       *
4095       * But on Gen > 6, the ability to use second accumulator register
4096       * (acc1) for non-float data types was removed, preventing a simple
4097       * implementation in SIMD16. A 16-channel result can be calculated by
4098       * executing the three instructions twice in SIMD8, once with quarter
4099       * control of 1Q for the first eight channels and again with 2Q for
4100       * the second eight channels.
4101       *
4102       * Which accumulator register is implicitly accessed (by AccWrEnable
4103       * for instance) is determined by the quarter control. Unfortunately
4104       * Ivybridge (and presumably Baytrail) has a hardware bug in which an
4105       * implicit accumulator access by an instruction with 2Q will access
4106       * acc1 regardless of whether the data type is usable in acc1.
4107       *
4108       * Specifically, the 2Q mach(8) writes acc1 which does not exist for
4109       * integer data types.
4110       *
4111       * Since we only want the low 32-bits of the result, we can do two
4112       * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
4113       * adjust the high result and add them (like the mach is doing):
4114       *
4115       *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
4116       *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
4117       *    shl(8)  g9<1>D     g8<8,8,1>D      16D
4118       *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
4119       *
4120       * We avoid the shl instruction by realizing that we only want to add
4121       * the low 16-bits of the "high" result to the high 16-bits of the
4122       * "low" result and using proper regioning on the add:
4123       *
4124       *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
4125       *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
4126       *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
4127       *
4128       * Since it does not use the (single) accumulator register, we can
4129       * schedule multi-component multiplications much better.
4130       */
4131
4132      bool needs_mov = false;
4133      fs_reg orig_dst = inst->dst;
4134
4135      /* Get a new VGRF for the "low" 32x16-bit multiplication result if
4136       * reusing the original destination is impossible due to hardware
4137       * restrictions, source/destination overlap, or it being the null
4138       * register.
4139       */
4140      fs_reg low = inst->dst;
4141      if (orig_dst.is_null() || orig_dst.file == MRF ||
4142          regions_overlap(inst->dst, inst->size_written,
4143                          inst->src[0], inst->size_read(0)) ||
4144          regions_overlap(inst->dst, inst->size_written,
4145                          inst->src[1], inst->size_read(1)) ||
4146          inst->dst.stride >= 4) {
4147         needs_mov = true;
4148         low = fs_reg(VGRF, alloc.allocate(regs_written(inst)),
4149                      inst->dst.type);
4150      }
4151
4152      /* Get a new VGRF but keep the same stride as inst->dst */
4153      fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);
4154      high.stride = inst->dst.stride;
4155      high.offset = inst->dst.offset % REG_SIZE;
4156
4157      if (devinfo->ver >= 7) {
4158         /* From Wa_1604601757:
4159          *
4160          * "When multiplying a DW and any lower precision integer, source modifier
4161          *  is not supported."
4162          *
4163          * An unsupported negate modifier on src[1] would ordinarily be
4164          * lowered by the subsequent lower_regioning pass.  In this case that
4165          * pass would spawn another dword multiply.  Instead, lower the
4166          * modifier first.
4167          */
4168         const bool source_mods_unsupported = (devinfo->ver >= 12);
4169
4170         if (inst->src[1].abs || (inst->src[1].negate &&
4171                                  source_mods_unsupported))
4172            lower_src_modifiers(this, block, inst, 1);
4173
4174         if (inst->src[1].file == IMM) {
4175            ibld.MUL(low, inst->src[0],
4176                     brw_imm_uw(inst->src[1].ud & 0xffff));
4177            ibld.MUL(high, inst->src[0],
4178                     brw_imm_uw(inst->src[1].ud >> 16));
4179         } else {
4180            ibld.MUL(low, inst->src[0],
4181                     subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
4182            ibld.MUL(high, inst->src[0],
4183                     subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
4184         }
4185      } else {
4186         if (inst->src[0].abs)
4187            lower_src_modifiers(this, block, inst, 0);
4188
4189         ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
4190                  inst->src[1]);
4191         ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
4192                  inst->src[1]);
4193      }
4194
4195      ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1),
4196               subscript(low, BRW_REGISTER_TYPE_UW, 1),
4197               subscript(high, BRW_REGISTER_TYPE_UW, 0));
4198
4199      if (needs_mov || inst->conditional_mod)
4200         set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
4201   }
4202}
4203
4204void
4205fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
4206{
4207   const fs_builder ibld(this, block, inst);
4208
4209   /* Considering two 64-bit integers ab and cd where each letter        ab
4210    * corresponds to 32 bits, we get a 128-bit result WXYZ. We         * cd
4211    * only need to provide the YZ part of the result.               -------
4212    *                                                                    BD
4213    *  Only BD needs to be 64 bits. For AD and BC we only care       +  AD
4214    *  about the lower 32 bits (since they are part of the upper     +  BC
4215    *  32 bits of our result). AC is not needed since it starts      + AC
4216    *  on the 65th bit of the result.                               -------
4217    *                                                                  WXYZ
4218    */
4219   unsigned int q_regs = regs_written(inst);
4220   unsigned int d_regs = (q_regs + 1) / 2;
4221
4222   fs_reg bd(VGRF, alloc.allocate(q_regs), BRW_REGISTER_TYPE_UQ);
4223   fs_reg ad(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4224   fs_reg bc(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4225
4226   /* Here we need the full 64 bit result for 32b * 32b. */
4227   if (devinfo->has_integer_dword_mul) {
4228      ibld.MUL(bd, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4229               subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
4230   } else {
4231      fs_reg bd_high(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4232      fs_reg bd_low(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
4233      fs_reg acc = retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD);
4234
4235      fs_inst *mul = ibld.MUL(acc,
4236                            subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4237                            subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
4238      mul->writes_accumulator = true;
4239
4240      ibld.MACH(bd_high, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4241                subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
4242      ibld.MOV(bd_low, acc);
4243
4244      ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);
4245      ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);
4246   }
4247
4248   ibld.MUL(ad, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
4249            subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
4250   ibld.MUL(bc, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
4251            subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1));
4252
4253   ibld.ADD(ad, ad, bc);
4254   ibld.ADD(subscript(bd, BRW_REGISTER_TYPE_UD, 1),
4255            subscript(bd, BRW_REGISTER_TYPE_UD, 1), ad);
4256
4257   if (devinfo->has_64bit_int) {
4258      ibld.MOV(inst->dst, bd);
4259   } else {
4260      ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
4261               subscript(bd, BRW_REGISTER_TYPE_UD, 0));
4262      ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
4263               subscript(bd, BRW_REGISTER_TYPE_UD, 1));
4264   }
4265}
4266
4267void
4268fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block)
4269{
4270   const fs_builder ibld(this, block, inst);
4271
4272   /* According to the BDW+ BSpec page for the "Multiply Accumulate
4273    * High" instruction:
4274    *
4275    *  "An added preliminary mov is required for source modification on
4276    *   src1:
4277    *      mov (8) r3.0<1>:d -r3<8;8,1>:d
4278    *      mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
4279    *      mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
4280    */
4281   if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
4282      lower_src_modifiers(this, block, inst, 1);
4283
4284   /* Should have been lowered to 8-wide. */
4285   assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst));
4286   const fs_reg acc = retype(brw_acc_reg(inst->exec_size), inst->dst.type);
4287   fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
4288   fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
4289
4290   if (devinfo->ver >= 8) {
4291      /* Until Gfx8, integer multiplies read 32-bits from one source,
4292       * and 16-bits from the other, and relying on the MACH instruction
4293       * to generate the high bits of the result.
4294       *
4295       * On Gfx8, the multiply instruction does a full 32x32-bit
4296       * multiply, but in order to do a 64-bit multiply we can simulate
4297       * the previous behavior and then use a MACH instruction.
4298       */
4299      assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
4300             mul->src[1].type == BRW_REGISTER_TYPE_UD);
4301      mul->src[1].type = BRW_REGISTER_TYPE_UW;
4302      mul->src[1].stride *= 2;
4303
4304      if (mul->src[1].file == IMM) {
4305         mul->src[1] = brw_imm_uw(mul->src[1].ud);
4306      }
4307   } else if (devinfo->verx10 == 70 &&
4308              inst->group > 0) {
4309      /* Among other things the quarter control bits influence which
4310       * accumulator register is used by the hardware for instructions
4311       * that access the accumulator implicitly (e.g. MACH).  A
4312       * second-half instruction would normally map to acc1, which
4313       * doesn't exist on Gfx7 and up (the hardware does emulate it for
4314       * floating-point instructions *only* by taking advantage of the
4315       * extra precision of acc0 not normally used for floating point
4316       * arithmetic).
4317       *
4318       * HSW and up are careful enough not to try to access an
4319       * accumulator register that doesn't exist, but on earlier Gfx7
4320       * hardware we need to make sure that the quarter control bits are
4321       * zero to avoid non-deterministic behaviour and emit an extra MOV
4322       * to get the result masked correctly according to the current
4323       * channel enables.
4324       */
4325      mach->group = 0;
4326      mach->force_writemask_all = true;
4327      mach->dst = ibld.vgrf(inst->dst.type);
4328      ibld.MOV(inst->dst, mach->dst);
4329   }
4330}
4331
4332bool
4333fs_visitor::lower_integer_multiplication()
4334{
4335   bool progress = false;
4336
4337   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4338      if (inst->opcode == BRW_OPCODE_MUL) {
4339         /* If the instruction is already in a form that does not need lowering,
4340          * return early.
4341          */
4342         if (devinfo->ver >= 7) {
4343            if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
4344               continue;
4345         } else {
4346            if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
4347               continue;
4348         }
4349
4350         if ((inst->dst.type == BRW_REGISTER_TYPE_Q ||
4351              inst->dst.type == BRW_REGISTER_TYPE_UQ) &&
4352             (inst->src[0].type == BRW_REGISTER_TYPE_Q ||
4353              inst->src[0].type == BRW_REGISTER_TYPE_UQ) &&
4354             (inst->src[1].type == BRW_REGISTER_TYPE_Q ||
4355              inst->src[1].type == BRW_REGISTER_TYPE_UQ)) {
4356            lower_mul_qword_inst(inst, block);
4357            inst->remove(block);
4358            progress = true;
4359         } else if (!inst->dst.is_accumulator() &&
4360                    (inst->dst.type == BRW_REGISTER_TYPE_D ||
4361                     inst->dst.type == BRW_REGISTER_TYPE_UD) &&
4362                    (!devinfo->has_integer_dword_mul ||
4363                     devinfo->verx10 >= 125)) {
4364            lower_mul_dword_inst(inst, block);
4365            inst->remove(block);
4366            progress = true;
4367         }
4368      } else if (inst->opcode == SHADER_OPCODE_MULH) {
4369         lower_mulh_inst(inst, block);
4370         inst->remove(block);
4371         progress = true;
4372      }
4373
4374   }
4375
4376   if (progress)
4377      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4378
4379   return progress;
4380}
4381
4382bool
4383fs_visitor::lower_minmax()
4384{
4385   assert(devinfo->ver < 6);
4386
4387   bool progress = false;
4388
4389   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4390      const fs_builder ibld(this, block, inst);
4391
4392      if (inst->opcode == BRW_OPCODE_SEL &&
4393          inst->predicate == BRW_PREDICATE_NONE) {
4394         /* If src1 is an immediate value that is not NaN, then it can't be
4395          * NaN.  In that case, emit CMP because it is much better for cmod
4396          * propagation.  Likewise if src1 is not float.  Gfx4 and Gfx5 don't
4397          * support HF or DF, so it is not necessary to check for those.
4398          */
4399         if (inst->src[1].type != BRW_REGISTER_TYPE_F ||
4400             (inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
4401            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4402                     inst->conditional_mod);
4403         } else {
4404            ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
4405                      inst->conditional_mod);
4406         }
4407         inst->predicate = BRW_PREDICATE_NORMAL;
4408         inst->conditional_mod = BRW_CONDITIONAL_NONE;
4409
4410         progress = true;
4411      }
4412   }
4413
4414   if (progress)
4415      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
4416
4417   return progress;
4418}
4419
4420bool
4421fs_visitor::lower_sub_sat()
4422{
4423   bool progress = false;
4424
4425   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
4426      const fs_builder ibld(this, block, inst);
4427
4428      if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
4429          inst->opcode == SHADER_OPCODE_ISUB_SAT) {
4430         /* The fundamental problem is the hardware performs source negation
4431          * at the bit width of the source.  If the source is 0x80000000D, the
4432          * negation is 0x80000000D.  As a result, subtractSaturate(0,
4433          * 0x80000000) will produce 0x80000000 instead of 0x7fffffff.  There
4434          * are at least three ways to resolve this:
4435          *
4436          * 1. Use the accumulator for the negated source.  The accumulator is
4437          *    33 bits, so our source 0x80000000 is sign-extended to
4438          *    0x1800000000.  The negation of which is 0x080000000.  This
4439          *    doesn't help for 64-bit integers (which are already bigger than
4440          *    33 bits).  There are also only 8 accumulators, so SIMD16 or
4441          *    SIMD32 instructions would have to be split into multiple SIMD8
4442          *    instructions.
4443          *
4444          * 2. Use slightly different math.  For any n-bit value x, we know (x
4445          *    >> 1) != -(x >> 1).  We can use this fact to only do
4446          *    subtractions involving (x >> 1).  subtractSaturate(a, b) ==
4447          *    subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
4448          *
4449          * 3. For unsigned sources, it is sufficient to replace the
4450          *    subtractSaturate with (a > b) ? a - b : 0.
4451          *
4452          * It may also be possible to use the SUBB instruction.  This
4453          * implicitly writes the accumulator, so it could only be used in the
4454          * same situations as #1 above.  It is further limited by only
4455          * allowing UD sources.
4456          */
4457         if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q &&
4458             inst->src[0].type != BRW_REGISTER_TYPE_UQ) {
4459            fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type);
4460
4461            ibld.MOV(acc, inst->src[1]);
4462            fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
4463            add->saturate = true;
4464            add->src[0].negate = true;
4465         } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
4466            /* tmp = src1 >> 1;
4467             * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
4468             */
4469            fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
4470            fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
4471            fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
4472            fs_inst *add;
4473
4474            ibld.SHR(tmp1, inst->src[1], brw_imm_d(1));
4475
4476            add = ibld.ADD(tmp2, inst->src[1], tmp1);
4477            add->src[1].negate = true;
4478
4479            add = ibld.ADD(tmp3, inst->src[0], tmp1);
4480            add->src[1].negate = true;
4481            add->saturate = true;
4482
4483            add = ibld.ADD(inst->dst, tmp3, tmp2);
4484            add->src[1].negate = true;
4485            add->saturate = true;
4486         } else {
4487            /* a > b ? a - b : 0 */
4488            ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
4489                     BRW_CONDITIONAL_G);
4490
4491            fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
4492            add->src[1].negate = !add->src[1].negate;
4493
4494            ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
4495               ->predicate = BRW_PREDICATE_NORMAL;
4496         }
4497
4498         inst->remove(block);
4499         progress = true;
4500      }
4501   }
4502
4503   if (progress)
4504      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
4505
4506   return progress;
4507}
4508
4509/**
4510 * Get the mask of SIMD channels enabled during dispatch and not yet disabled
4511 * by discard.  Due to the layout of the sample mask in the fragment shader
4512 * thread payload, \p bld is required to have a dispatch_width() not greater
4513 * than 16 for fragment shaders.
4514 */
4515static fs_reg
4516sample_mask_reg(const fs_builder &bld)
4517{
4518   const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
4519
4520   if (v->stage != MESA_SHADER_FRAGMENT) {
4521      return brw_imm_ud(0xffffffff);
4522   } else if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {
4523      assert(bld.dispatch_width() <= 16);
4524      return brw_flag_subreg(sample_mask_flag_subreg(v) + bld.group() / 16);
4525   } else {
4526      assert(v->devinfo->ver >= 6 && bld.dispatch_width() <= 16);
4527      return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
4528                    BRW_REGISTER_TYPE_UW);
4529   }
4530}
4531
4532static void
4533setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
4534                    fs_reg *dst, fs_reg color, unsigned components)
4535{
4536   if (key->clamp_fragment_color) {
4537      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
4538      assert(color.type == BRW_REGISTER_TYPE_F);
4539
4540      for (unsigned i = 0; i < components; i++)
4541         set_saturate(true,
4542                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));
4543
4544      color = tmp;
4545   }
4546
4547   for (unsigned i = 0; i < components; i++)
4548      dst[i] = offset(color, bld, i);
4549}
4550
4551uint32_t
4552brw_fb_write_msg_control(const fs_inst *inst,
4553                         const struct brw_wm_prog_data *prog_data)
4554{
4555   uint32_t mctl;
4556
4557   if (inst->opcode == FS_OPCODE_REP_FB_WRITE) {
4558      assert(inst->group == 0 && inst->exec_size == 16);
4559      mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
4560   } else if (prog_data->dual_src_blend) {
4561      assert(inst->exec_size == 8);
4562
4563      if (inst->group % 16 == 0)
4564         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
4565      else if (inst->group % 16 == 8)
4566         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
4567      else
4568         unreachable("Invalid dual-source FB write instruction group");
4569   } else {
4570      assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
4571
4572      if (inst->exec_size == 16)
4573         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
4574      else if (inst->exec_size == 8)
4575         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
4576      else
4577         unreachable("Invalid FB write execution size");
4578   }
4579
4580   return mctl;
4581}
4582
4583static void
4584lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
4585                            const struct brw_wm_prog_data *prog_data,
4586                            const brw_wm_prog_key *key,
4587                            const fs_visitor::thread_payload &payload)
4588{
4589   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
4590   const intel_device_info *devinfo = bld.shader->devinfo;
4591   const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
4592   const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
4593   const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
4594   const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
4595   const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
4596   const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
4597   fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
4598   const unsigned components =
4599      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
4600
4601   assert(inst->target != 0 || src0_alpha.file == BAD_FILE);
4602
4603   /* We can potentially have a message length of up to 15, so we have to set
4604    * base_mrf to either 0 or 1 in order to fit in m0..m15.
4605    */
4606   fs_reg sources[15];
4607   int header_size = 2, payload_header_size;
4608   unsigned length = 0;
4609
4610   if (devinfo->ver < 6) {
4611      /* TODO: Support SIMD32 on gfx4-5 */
4612      assert(bld.group() < 16);
4613
4614      /* For gfx4-5, we always have a header consisting of g0 and g1.  We have
4615       * an implied MOV from g0,g1 to the start of the message.  The MOV from
4616       * g0 is handled by the hardware and the MOV from g1 is provided by the
4617       * generator.  This is required because, on gfx4-5, the generator may
4618       * generate two write messages with different message lengths in order
4619       * to handle AA data properly.
4620       *
4621       * Also, since the pixel mask goes in the g0 portion of the message and
4622       * since render target writes are the last thing in the shader, we write
4623       * the pixel mask directly into g0 and it will get copied as part of the
4624       * implied write.
4625       */
4626      if (prog_data->uses_kill) {
4627         bld.exec_all().group(1, 0)
4628            .MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
4629                 sample_mask_reg(bld));
4630      }
4631
4632      assert(length == 0);
4633      length = 2;
4634   } else if ((devinfo->verx10 <= 70 &&
4635               prog_data->uses_kill) ||
4636              (devinfo->ver < 11 &&
4637               (color1.file != BAD_FILE || key->nr_color_regions > 1))) {
4638      /* From the Sandy Bridge PRM, volume 4, page 198:
4639       *
4640       *     "Dispatched Pixel Enables. One bit per pixel indicating
4641       *      which pixels were originally enabled when the thread was
4642       *      dispatched. This field is only required for the end-of-
4643       *      thread message and on all dual-source messages."
4644       */
4645      const fs_builder ubld = bld.exec_all().group(8, 0);
4646
4647      fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
4648      if (bld.group() < 16) {
4649         /* The header starts off as g0 and g1 for the first half */
4650         ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
4651                                              BRW_REGISTER_TYPE_UD));
4652      } else {
4653         /* The header starts off as g0 and g2 for the second half */
4654         assert(bld.group() < 32);
4655         const fs_reg header_sources[2] = {
4656            retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
4657            retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
4658         };
4659         ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
4660
4661         /* Gfx12 will require additional fix-ups if we ever hit this path. */
4662         assert(devinfo->ver < 12);
4663      }
4664
4665      uint32_t g00_bits = 0;
4666
4667      /* Set "Source0 Alpha Present to RenderTarget" bit in message
4668       * header.
4669       */
4670      if (src0_alpha.file != BAD_FILE)
4671         g00_bits |= 1 << 11;
4672
4673      /* Set computes stencil to render target */
4674      if (prog_data->computed_stencil)
4675         g00_bits |= 1 << 14;
4676
4677      if (g00_bits) {
4678         /* OR extra bits into g0.0 */
4679         ubld.group(1, 0).OR(component(header, 0),
4680                             retype(brw_vec1_grf(0, 0),
4681                                    BRW_REGISTER_TYPE_UD),
4682                             brw_imm_ud(g00_bits));
4683      }
4684
4685      /* Set the render target index for choosing BLEND_STATE. */
4686      if (inst->target > 0) {
4687         ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
4688      }
4689
4690      if (prog_data->uses_kill) {
4691         ubld.group(1, 0).MOV(retype(component(header, 15),
4692                                     BRW_REGISTER_TYPE_UW),
4693                              sample_mask_reg(bld));
4694      }
4695
4696      assert(length == 0);
4697      sources[0] = header;
4698      sources[1] = horiz_offset(header, 8);
4699      length = 2;
4700   }
4701   assert(length == 0 || length == 2);
4702   header_size = length;
4703
4704   if (payload.aa_dest_stencil_reg[0]) {
4705      assert(inst->group < 16);
4706      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
4707      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
4708         .MOV(sources[length],
4709              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
4710      length++;
4711   }
4712
4713   if (src0_alpha.file != BAD_FILE) {
4714      for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
4715         const fs_builder &ubld = bld.exec_all().group(8, i)
4716                                    .annotate("FB write src0 alpha");
4717         const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
4718         ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
4719         setup_color_payload(ubld, key, &sources[length], tmp, 1);
4720         length++;
4721      }
4722   }
4723
4724   if (sample_mask.file != BAD_FILE) {
4725      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
4726                               BRW_REGISTER_TYPE_UD);
4727
4728      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
4729       * relevant.  Since it's unsigned single words one vgrf is always
4730       * 16-wide, but only the lower or higher 8 channels will be used by the
4731       * hardware when doing a SIMD8 write depending on whether we have
4732       * selected the subspans for the first or second half respectively.
4733       */
4734      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
4735      sample_mask.type = BRW_REGISTER_TYPE_UW;
4736      sample_mask.stride *= 2;
4737
4738      bld.exec_all().annotate("FB write oMask")
4739         .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
4740                           inst->group % 16),
4741              sample_mask);
4742      length++;
4743   }
4744
4745   payload_header_size = length;
4746
4747   setup_color_payload(bld, key, &sources[length], color0, components);
4748   length += 4;
4749
4750   if (color1.file != BAD_FILE) {
4751      setup_color_payload(bld, key, &sources[length], color1, components);
4752      length += 4;
4753   }
4754
4755   if (src_depth.file != BAD_FILE) {
4756      sources[length] = src_depth;
4757      length++;
4758   }
4759
4760   if (dst_depth.file != BAD_FILE) {
4761      sources[length] = dst_depth;
4762      length++;
4763   }
4764
4765   if (src_stencil.file != BAD_FILE) {
4766      assert(devinfo->ver >= 9);
4767      assert(bld.dispatch_width() == 8);
4768
4769      /* XXX: src_stencil is only available on gfx9+. dst_depth is never
4770       * available on gfx9+. As such it's impossible to have both enabled at the
4771       * same time and therefore length cannot overrun the array.
4772       */
4773      assert(length < 15);
4774
4775      sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
4776      bld.exec_all().annotate("FB write OS")
4777         .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
4778              subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
4779      length++;
4780   }
4781
4782   fs_inst *load;
4783   if (devinfo->ver >= 7) {
4784      /* Send from the GRF */
4785      fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
4786      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
4787      payload.nr = bld.shader->alloc.allocate(regs_written(load));
4788      load->dst = payload;
4789
4790      uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);
4791
4792      inst->desc =
4793         (inst->group / 16) << 11 | /* rt slot group */
4794         brw_fb_write_desc(devinfo, inst->target, msg_ctl, inst->last_rt,
4795                           prog_data->per_coarse_pixel_dispatch);
4796
4797      uint32_t ex_desc = 0;
4798      if (devinfo->ver >= 11) {
4799         /* Set the "Render Target Index" and "Src0 Alpha Present" fields
4800          * in the extended message descriptor, in lieu of using a header.
4801          */
4802         ex_desc = inst->target << 12 | (src0_alpha.file != BAD_FILE) << 15;
4803
4804         if (key->nr_color_regions == 0)
4805            ex_desc |= 1 << 20; /* Null Render Target */
4806      }
4807      inst->ex_desc = ex_desc;
4808
4809      inst->opcode = SHADER_OPCODE_SEND;
4810      inst->resize_sources(3);
4811      inst->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
4812      inst->src[0] = brw_imm_ud(0);
4813      inst->src[1] = brw_imm_ud(0);
4814      inst->src[2] = payload;
4815      inst->mlen = regs_written(load);
4816      inst->ex_mlen = 0;
4817      inst->header_size = header_size;
4818      inst->check_tdr = true;
4819      inst->send_has_side_effects = true;
4820   } else {
4821      /* Send from the MRF */
4822      load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
4823                              sources, length, payload_header_size);
4824
4825      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
4826       * will do this for us if we just give it a COMPR4 destination.
4827       */
4828      if (devinfo->ver < 6 && bld.dispatch_width() == 16)
4829         load->dst.nr |= BRW_MRF_COMPR4;
4830
4831      if (devinfo->ver < 6) {
4832         /* Set up src[0] for the implied MOV from grf0-1 */
4833         inst->resize_sources(1);
4834         inst->src[0] = brw_vec8_grf(0, 0);
4835      } else {
4836         inst->resize_sources(0);
4837      }
4838      inst->base_mrf = 1;
4839      inst->opcode = FS_OPCODE_FB_WRITE;
4840      inst->mlen = regs_written(load);
4841      inst->header_size = header_size;
4842   }
4843}
4844
4845static void
4846lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
4847{
4848   const intel_device_info *devinfo = bld.shader->devinfo;
4849   const fs_builder &ubld = bld.exec_all().group(8, 0);
4850   const unsigned length = 2;
4851   const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);
4852
4853   if (bld.group() < 16) {
4854      ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
4855                                           BRW_REGISTER_TYPE_UD));
4856   } else {
4857      assert(bld.group() < 32);
4858      const fs_reg header_sources[] = {
4859         retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
4860         retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
4861      };
4862      ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
4863
4864      if (devinfo->ver >= 12) {
4865         /* On Gfx12 the Viewport and Render Target Array Index fields (AKA
4866          * Poly 0 Info) are provided in r1.1 instead of r0.0, and the render
4867          * target message header format was updated accordingly -- However
4868          * the updated format only works for the lower 16 channels in a
4869          * SIMD32 thread, since the higher 16 channels want the subspan data
4870          * from r2 instead of r1, so we need to copy over the contents of
4871          * r1.1 in order to fix things up.
4872          */
4873         ubld.group(1, 0).MOV(component(header, 9),
4874                              retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD));
4875      }
4876   }
4877
4878   /* BSpec 12470 (Gfx8-11), BSpec 47842 (Gfx12+) :
4879    *
4880    *   "Must be zero for Render Target Read message."
4881    *
4882    * For bits :
4883    *   - 14 : Stencil Present to Render Target
4884    *   - 13 : Source Depth Present to Render Target
4885    *   - 12 : oMask to Render Target
4886    *   - 11 : Source0 Alpha Present to Render Target
4887    */
4888   ubld.group(1, 0).AND(component(header, 0),
4889                        component(header, 0),
4890                        brw_imm_ud(~INTEL_MASK(14, 11)));
4891
4892   inst->resize_sources(1);
4893   inst->src[0] = header;
4894   inst->opcode = FS_OPCODE_FB_READ;
4895   inst->mlen = length;
4896   inst->header_size = length;
4897}
4898
4899static void
4900lower_sampler_logical_send_gfx4(const fs_builder &bld, fs_inst *inst, opcode op,
4901                                const fs_reg &coordinate,
4902                                const fs_reg &shadow_c,
4903                                const fs_reg &lod, const fs_reg &lod2,
4904                                const fs_reg &surface,
4905                                const fs_reg &sampler,
4906                                unsigned coord_components,
4907                                unsigned grad_components)
4908{
4909   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
4910                         op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
4911   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
4912   fs_reg msg_end = msg_begin;
4913
4914   /* g0 header. */
4915   msg_end = offset(msg_end, bld.group(8, 0), 1);
4916
4917   for (unsigned i = 0; i < coord_components; i++)
4918      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
4919              offset(coordinate, bld, i));
4920
4921   msg_end = offset(msg_end, bld, coord_components);
4922
4923   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
4924    * require all three components to be present and zero if they are unused.
4925    */
4926   if (coord_components > 0 &&
4927       (has_lod || shadow_c.file != BAD_FILE ||
4928        (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
4929      assert(coord_components <= 3);
4930      for (unsigned i = 0; i < 3 - coord_components; i++)
4931         bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
4932
4933      msg_end = offset(msg_end, bld, 3 - coord_components);
4934   }
4935
4936   if (op == SHADER_OPCODE_TXD) {
4937      /* TXD unsupported in SIMD16 mode. */
4938      assert(bld.dispatch_width() == 8);
4939
4940      /* the slots for u and v are always present, but r is optional */
4941      if (coord_components < 2)
4942         msg_end = offset(msg_end, bld, 2 - coord_components);
4943
4944      /*  P   = u, v, r
4945       * dPdx = dudx, dvdx, drdx
4946       * dPdy = dudy, dvdy, drdy
4947       *
4948       * 1-arg: Does not exist.
4949       *
4950       * 2-arg: dudx   dvdx   dudy   dvdy
4951       *        dPdx.x dPdx.y dPdy.x dPdy.y
4952       *        m4     m5     m6     m7
4953       *
4954       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
4955       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
4956       *        m5     m6     m7     m8     m9     m10
4957       */
4958      for (unsigned i = 0; i < grad_components; i++)
4959         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
4960
4961      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
4962
4963      for (unsigned i = 0; i < grad_components; i++)
4964         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
4965
4966      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
4967   }
4968
4969   if (has_lod) {
4970      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
4971       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
4972       */
4973      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
4974             bld.dispatch_width() == 16);
4975
4976      const brw_reg_type type =
4977         (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
4978          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
4979      bld.MOV(retype(msg_end, type), lod);
4980      msg_end = offset(msg_end, bld, 1);
4981   }
4982
4983   if (shadow_c.file != BAD_FILE) {
4984      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
4985         /* There's no plain shadow compare message, so we use shadow
4986          * compare with a bias of 0.0.
4987          */
4988         bld.MOV(msg_end, brw_imm_f(0.0f));
4989         msg_end = offset(msg_end, bld, 1);
4990      }
4991
4992      bld.MOV(msg_end, shadow_c);
4993      msg_end = offset(msg_end, bld, 1);
4994   }
4995
4996   inst->opcode = op;
4997   inst->src[0] = reg_undef;
4998   inst->src[1] = surface;
4999   inst->src[2] = sampler;
5000   inst->resize_sources(3);
5001   inst->base_mrf = msg_begin.nr;
5002   inst->mlen = msg_end.nr - msg_begin.nr;
5003   inst->header_size = 1;
5004}
5005
5006static void
5007lower_sampler_logical_send_gfx5(const fs_builder &bld, fs_inst *inst, opcode op,
5008                                const fs_reg &coordinate,
5009                                const fs_reg &shadow_c,
5010                                const fs_reg &lod, const fs_reg &lod2,
5011                                const fs_reg &sample_index,
5012                                const fs_reg &surface,
5013                                const fs_reg &sampler,
5014                                unsigned coord_components,
5015                                unsigned grad_components)
5016{
5017   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
5018   fs_reg msg_coords = message;
5019   unsigned header_size = 0;
5020
5021   if (inst->offset != 0) {
5022      /* The offsets set up by the visitor are in the m1 header, so we can't
5023       * go headerless.
5024       */
5025      header_size = 1;
5026      message.nr--;
5027   }
5028
5029   for (unsigned i = 0; i < coord_components; i++)
5030      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
5031              offset(coordinate, bld, i));
5032
5033   fs_reg msg_end = offset(msg_coords, bld, coord_components);
5034   fs_reg msg_lod = offset(msg_coords, bld, 4);
5035
5036   if (shadow_c.file != BAD_FILE) {
5037      fs_reg msg_shadow = msg_lod;
5038      bld.MOV(msg_shadow, shadow_c);
5039      msg_lod = offset(msg_shadow, bld, 1);
5040      msg_end = msg_lod;
5041   }
5042
5043   switch (op) {
5044   case SHADER_OPCODE_TXL:
5045   case FS_OPCODE_TXB:
5046      bld.MOV(msg_lod, lod);
5047      msg_end = offset(msg_lod, bld, 1);
5048      break;
5049   case SHADER_OPCODE_TXD:
5050      /**
5051       *  P   =  u,    v,    r
5052       * dPdx = dudx, dvdx, drdx
5053       * dPdy = dudy, dvdy, drdy
5054       *
5055       * Load up these values:
5056       * - dudx   dudy   dvdx   dvdy   drdx   drdy
5057       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
5058       */
5059      msg_end = msg_lod;
5060      for (unsigned i = 0; i < grad_components; i++) {
5061         bld.MOV(msg_end, offset(lod, bld, i));
5062         msg_end = offset(msg_end, bld, 1);
5063
5064         bld.MOV(msg_end, offset(lod2, bld, i));
5065         msg_end = offset(msg_end, bld, 1);
5066      }
5067      break;
5068   case SHADER_OPCODE_TXS:
5069      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
5070      bld.MOV(msg_lod, lod);
5071      msg_end = offset(msg_lod, bld, 1);
5072      break;
5073   case SHADER_OPCODE_TXF:
5074      msg_lod = offset(msg_coords, bld, 3);
5075      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
5076      msg_end = offset(msg_lod, bld, 1);
5077      break;
5078   case SHADER_OPCODE_TXF_CMS:
5079      msg_lod = offset(msg_coords, bld, 3);
5080      /* lod */
5081      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
5082      /* sample index */
5083      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
5084      msg_end = offset(msg_lod, bld, 2);
5085      break;
5086   default:
5087      break;
5088   }
5089
5090   inst->opcode = op;
5091   inst->src[0] = reg_undef;
5092   inst->src[1] = surface;
5093   inst->src[2] = sampler;
5094   inst->resize_sources(3);
5095   inst->base_mrf = message.nr;
5096   inst->mlen = msg_end.nr - message.nr;
5097   inst->header_size = header_size;
5098
5099   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
5100   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
5101}
5102
5103static bool
5104is_high_sampler(const struct intel_device_info *devinfo, const fs_reg &sampler)
5105{
5106   if (devinfo->verx10 <= 70)
5107      return false;
5108
5109   return sampler.file != IMM || sampler.ud >= 16;
5110}
5111
5112static unsigned
5113sampler_msg_type(const intel_device_info *devinfo,
5114                 opcode opcode, bool shadow_compare)
5115{
5116   assert(devinfo->ver >= 5);
5117   switch (opcode) {
5118   case SHADER_OPCODE_TEX:
5119      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
5120                              GFX5_SAMPLER_MESSAGE_SAMPLE;
5121   case FS_OPCODE_TXB:
5122      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
5123                              GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
5124   case SHADER_OPCODE_TXL:
5125      return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
5126                              GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
5127   case SHADER_OPCODE_TXL_LZ:
5128      return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
5129                              GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
5130   case SHADER_OPCODE_TXS:
5131   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
5132      return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
5133   case SHADER_OPCODE_TXD:
5134      assert(!shadow_compare || devinfo->verx10 >= 75);
5135      return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
5136                              GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
5137   case SHADER_OPCODE_TXF:
5138      return GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
5139   case SHADER_OPCODE_TXF_LZ:
5140      assert(devinfo->ver >= 9);
5141      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
5142   case SHADER_OPCODE_TXF_CMS_W:
5143      assert(devinfo->ver >= 9);
5144      return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
5145   case SHADER_OPCODE_TXF_CMS:
5146      return devinfo->ver >= 7 ? GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
5147                                 GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
5148   case SHADER_OPCODE_TXF_UMS:
5149      assert(devinfo->ver >= 7);
5150      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
5151   case SHADER_OPCODE_TXF_MCS:
5152      assert(devinfo->ver >= 7);
5153      return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
5154   case SHADER_OPCODE_LOD:
5155      return GFX5_SAMPLER_MESSAGE_LOD;
5156   case SHADER_OPCODE_TG4:
5157      assert(devinfo->ver >= 7);
5158      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
5159                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
5160      break;
5161   case SHADER_OPCODE_TG4_OFFSET:
5162      assert(devinfo->ver >= 7);
5163      return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
5164                              GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
5165   case SHADER_OPCODE_SAMPLEINFO:
5166      return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
5167   default:
5168      unreachable("not reached");
5169   }
5170}
5171
/**
 * Lower a logical sampler instruction into a SHADER_OPCODE_SEND aimed at the
 * sampler shared function (the dispatcher uses this path for
 * devinfo->ver >= 7).
 *
 * Steps, in order:
 *  1. Optionally build a one-register message header (TG4, texel offsets,
 *     EOT, SAMPLEINFO, bindless samplers and "high" samplers need one).
 *  2. Append the shadow comparator, the op-specific LOD / derivative / MCS
 *     arguments and the coordinates to sources[] in the order the message
 *     expects.
 *  3. Pack sources[] into a single VGRF payload with LOAD_PAYLOAD.
 *  4. Fill in the message descriptor (immediate part in inst->desc, the
 *     dynamically computed descriptor and extended descriptor in
 *     inst->src[0] / inst->src[1]) and leave the payload in inst->src[2].
 *
 * Exactly one of surface/surface_handle and exactly one of
 * sampler/sampler_handle must be provided (asserted below).  The local copy
 * of `op` may be switched to the _LZ variants on ver >= 9 when lod is zero.
 *
 * NOTE(review): the dispatcher passes the fs_reg reference parameters as
 * references into inst->src[], so inst->src[] must not be overwritten before
 * those arguments have been consumed; `lod` alone is taken by value.
 */
5172static void
5173lower_sampler_logical_send_gfx7(const fs_builder &bld, fs_inst *inst, opcode op,
5174                                const fs_reg &coordinate,
5175                                const fs_reg &shadow_c,
5176                                fs_reg lod, const fs_reg &lod2,
5177                                const fs_reg &min_lod,
5178                                const fs_reg &sample_index,
5179                                const fs_reg &mcs,
5180                                const fs_reg &surface,
5181                                const fs_reg &sampler,
5182                                const fs_reg &surface_handle,
5183                                const fs_reg &sampler_handle,
5184                                const fs_reg &tg4_offset,
5185                                unsigned coord_components,
5186                                unsigned grad_components)
5187{
5188   const intel_device_info *devinfo = bld.shader->devinfo;
5189   const brw_stage_prog_data *prog_data = bld.shader->stage_prog_data;
   /* Number of registers one payload slot occupies at this dispatch width. */
5190   unsigned reg_width = bld.dispatch_width() / 8;
   /* length counts the payload slots in use, including the header slot. */
5191   unsigned header_size = 0, length = 0;
   /* One entry per potential payload slot.  Every slot gets a fresh VGRF up
    * front so the code below can MOV into whichever ones it needs.
    */
5192   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
5193   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
5194      sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
5195
5196   /* We must have exactly one of surface/sampler and surface/sampler_handle */
5197   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
5198   assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));
5199
5200   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
5201       inst->offset != 0 || inst->eot ||
5202       op == SHADER_OPCODE_SAMPLEINFO ||
5203       sampler_handle.file != BAD_FILE ||
5204       is_high_sampler(devinfo, sampler)) {
5205      /* For general texture offsets (no txf workaround), we need a header to
5206       * put them in.
5207       *
5208       * TG4 needs to place its channel select in the header, for interaction
5209       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
5210       * larger sampler numbers we need to offset the Sampler State Pointer in
5211       * the header.
5212       */
      /* The header always occupies the first payload slot. */
5213      fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
5214      header_size = 1;
5215      length++;
5216
5217      /* If we're requesting fewer than four channels worth of response,
5218       * and we have an explicit header, we need to set up the sampler
5219       * writemask.  It's reversed from normal: 1 means "don't write".
5220       */
5221      if (!inst->eot && regs_written(inst) != 4 * reg_width) {
5222         assert(regs_written(inst) % reg_width == 0);
5223         unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
         /* The writemask is folded into inst->offset (bits 15:12), which is
          * then written to header dword 2 below.
          */
5224         inst->offset |= mask << 12;
5225      }
5226
5227      /* Build the actual header */
      /* ubld copies all 8 dwords of g0; ubld1 patches individual dwords. */
5228      const fs_builder ubld = bld.exec_all().group(8, 0);
5229      const fs_builder ubld1 = ubld.group(1, 0);
5230      ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
5231      if (inst->offset) {
5232         ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
5233      } else if (bld.shader->stage != MESA_SHADER_VERTEX &&
5234                 bld.shader->stage != MESA_SHADER_FRAGMENT) {
5235         /* The vertex and fragment stages have g0.2 set to 0, so
5236          * header0.2 is 0 when g0 is copied. Other stages may not, so we
5237          * must set it to 0 to avoid setting undesirable bits in the
5238          * message.
5239          */
5240         ubld1.MOV(component(header, 2), brw_imm_ud(0));
5241      }
5242
      /* Header dword 3 carries the sampler state pointer. */
5243      if (sampler_handle.file != BAD_FILE) {
5244         /* Bindless sampler handles aren't relative to the sampler state
5245          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
5246          * Instead, it's an absolute pointer relative to dynamic state base
5247          * address.
5248          *
5249          * Sampler states are 16 bytes each and the pointer we give here has
5250          * to be 32-byte aligned.  In order to avoid more indirect messages
5251          * than required, we assume that all bindless sampler states are
5252          * 32-byte aligned.  This sacrifices a bit of general state base
5253          * address space but means we can do something more efficient in the
5254          * shader.
5255          */
5256         ubld1.MOV(component(header, 3), sampler_handle);
5257      } else if (is_high_sampler(devinfo, sampler)) {
5258         fs_reg sampler_state_ptr =
5259            retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD);
5260
5261         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
5262          * with the ones included in g0.3 bits 4:0.  Mask them out.
5263          */
5264         if (devinfo->ver >= 11) {
5265            sampler_state_ptr = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
5266            ubld1.AND(sampler_state_ptr,
5267                      retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
5268                      brw_imm_ud(INTEL_MASK(31, 5)));
5269         }
5270
         /* Advance the pointer by whole groups of 16 sampler states; the
          * remainder (sampler % 16) goes in the descriptor further down.
          */
5271         if (sampler.file == BRW_IMMEDIATE_VALUE) {
5272            assert(sampler.ud >= 16);
5273            const int sampler_state_size = 16; /* 16 bytes */
5274
5275            ubld1.ADD(component(header, 3), sampler_state_ptr,
5276                      brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
5277         } else {
            /* Dynamic index: (sampler & 0xf0) << 4 equals
             * (sampler / 16) * 16 * 16 bytes for indices below 256.
             */
5278            fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
5279            ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
5280            ubld1.SHL(tmp, tmp, brw_imm_ud(4));
5281            ubld1.ADD(component(header, 3), sampler_state_ptr, tmp);
5282         }
5283      } else if (devinfo->ver >= 11) {
5284         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
5285          * with the ones included in g0.3 bits 4:0.  Mask them out.
5286          */
5287         ubld1.AND(component(header, 3),
5288                   retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
5289                   brw_imm_ud(INTEL_MASK(31, 5)));
5290      }
5291   }
5292
   /* The shadow comparator, when present, is the first non-header slot. */
5293   if (shadow_c.file != BAD_FILE) {
5294      bld.MOV(sources[length], shadow_c);
5295      length++;
5296   }
5297
   /* Set by the cases below that place the coordinate themselves because
    * the message interleaves it with other arguments.
    */
5298   bool coordinate_done = false;
5299
5300   /* Set up the LOD info */
5301   switch (op) {
5302   case FS_OPCODE_TXB:
5303   case SHADER_OPCODE_TXL:
      /* On ver >= 9 an explicit LOD of zero switches to the _LZ message,
       * which takes no LOD slot at all.
       */
5304      if (devinfo->ver >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
5305         op = SHADER_OPCODE_TXL_LZ;
5306         break;
5307      }
5308      bld.MOV(sources[length], lod);
5309      length++;
5310      break;
5311   case SHADER_OPCODE_TXD:
5312      /* TXD should have been lowered in SIMD16 mode. */
5313      assert(bld.dispatch_width() == 8);
5314
5315      /* Load dPdx and the coordinate together:
5316       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
5317       */
5318      for (unsigned i = 0; i < coord_components; i++) {
5319         bld.MOV(sources[length++], offset(coordinate, bld, i));
5320
5321         /* For cube map array, the coordinate is (u,v,r,ai) but there are
5322          * only derivatives for (u, v, r).
5323          */
5324         if (i < grad_components) {
5325            bld.MOV(sources[length++], offset(lod, bld, i));
5326            bld.MOV(sources[length++], offset(lod2, bld, i));
5327         }
5328      }
5329
5330      coordinate_done = true;
5331      break;
5332   case SHADER_OPCODE_TXS:
5333      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
5334      length++;
5335      break;
5336   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
5337      /* We need an LOD; just use 0 */
5338      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
5339      length++;
5340      break;
5341   case SHADER_OPCODE_TXF:
5342      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
5343       * On Gfx9 they are u, v, lod, r
5344       */
5345      bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);
5346
5347      if (devinfo->ver >= 9) {
         /* v always occupies a slot on ver >= 9; a 1-component coordinate
          * gets an immediate 0 in its place.
          */
5348         if (coord_components >= 2) {
5349            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
5350                    offset(coordinate, bld, 1));
5351         } else {
5352            sources[length] = brw_imm_d(0);
5353         }
5354         length++;
5355      }
5356
5357      if (devinfo->ver >= 9 && lod.is_zero()) {
5358         op = SHADER_OPCODE_TXF_LZ;
5359      } else {
5360         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
5361         length++;
5362      }
5363
      /* Remaining coordinate components after the ones emitted above. */
5364      for (unsigned i = devinfo->ver >= 9 ? 2 : 1; i < coord_components; i++)
5365         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
5366                 offset(coordinate, bld, i));
5367
5368      coordinate_done = true;
5369      break;
5370
5371   case SHADER_OPCODE_TXF_CMS:
5372   case SHADER_OPCODE_TXF_CMS_W:
5373   case SHADER_OPCODE_TXF_UMS:
5374   case SHADER_OPCODE_TXF_MCS:
      /* Slot order here: [sample index], [mcs], [mcs high], coordinates.
       * TXF_MCS has neither a sample index nor MCS data.
       */
5375      if (op == SHADER_OPCODE_TXF_UMS ||
5376          op == SHADER_OPCODE_TXF_CMS ||
5377          op == SHADER_OPCODE_TXF_CMS_W) {
5378         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
5379         length++;
5380      }
5381
5382      if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
5383         /* Data from the multisample control surface. */
5384         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
5385         length++;
5386
5387         /* On Gfx9+ we'll use ld2dms_w instead which has two registers for
5388          * the MCS data.
5389          */
5390         if (op == SHADER_OPCODE_TXF_CMS_W) {
            /* An immediate mcs is reused as-is for the second register. */
5391            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
5392                    mcs.file == IMM ?
5393                    mcs :
5394                    offset(mcs, bld, 1));
5395            length++;
5396         }
5397      }
5398
5399      /* There is no offsetting for this message; just copy in the integer
5400       * texture coordinates.
5401       */
5402      for (unsigned i = 0; i < coord_components; i++)
5403         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
5404                 offset(coordinate, bld, i));
5405
5406      coordinate_done = true;
5407      break;
5408   case SHADER_OPCODE_TG4_OFFSET:
5409      /* More crazy intermixing */
5410      for (unsigned i = 0; i < 2; i++) /* u, v */
5411         bld.MOV(sources[length++], offset(coordinate, bld, i));
5412
5413      for (unsigned i = 0; i < 2; i++) /* offu, offv */
5414         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
5415                 offset(tg4_offset, bld, i));
5416
5417      if (coord_components == 3) /* r if present */
5418         bld.MOV(sources[length++], offset(coordinate, bld, 2));
5419
5420      coordinate_done = true;
5421      break;
5422   default:
5423      break;
5424   }
5425
5426   /* Set up the coordinate (except for cases where it was done above) */
5427   if (!coordinate_done) {
5428      for (unsigned i = 0; i < coord_components; i++)
5429         bld.MOV(sources[length++], offset(coordinate, bld, i));
5430   }
5431
   /* min_lod comes after a full set of coordinate (and, for TXD, gradient)
    * slots, so the slots this operation did not write are skipped over;
    * they keep the VGRFs allocated at the top and are never written.
    */
5432   if (min_lod.file != BAD_FILE) {
5433      /* Account for all of the missing coordinate sources */
5434      length += 4 - coord_components;
5435      if (op == SHADER_OPCODE_TXD)
5436         length += (3 - grad_components) * 2;
5437
5438      bld.MOV(sources[length++], min_lod);
5439   }
5440
   /* length counts slots of reg_width registers each, but the header slot
    * is a single register.  When reg_width == 2 the header's extra register
    * is subtracted back out.
    */
5441   unsigned mlen;
5442   if (reg_width == 2)
5443      mlen = length * reg_width - header_size;
5444   else
5445      mlen = length * reg_width;
5446
5447   const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
5448                                     BRW_REGISTER_TYPE_F);
5449   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
5450
5451   /* Generate the SEND. */
5452   inst->opcode = SHADER_OPCODE_SEND;
5453   inst->mlen = mlen;
5454   inst->header_size = header_size;
5455
   /* `op` may have been changed to an _LZ variant above, so the message
    * type is looked up only now.
    */
5456   const unsigned msg_type =
5457      sampler_msg_type(devinfo, op, inst->shadow_compare);
5458   const unsigned simd_mode =
5459      inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
5460                             BRW_SAMPLER_SIMD_MODE_SIMD16;
5461
   /* Pick the binding-table section the surface index is relative to. */
5462   uint32_t base_binding_table_index;
5463   switch (op) {
5464   case SHADER_OPCODE_TG4:
5465   case SHADER_OPCODE_TG4_OFFSET:
5466      base_binding_table_index = prog_data->binding_table.gather_texture_start;
5467      break;
5468   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
5469      base_binding_table_index = prog_data->binding_table.image_start;
5470      break;
5471   default:
5472      base_binding_table_index = prog_data->binding_table.texture_start;
5473      break;
5474   }
5475
   /* Three ways to provide the descriptor:
    *  - surface is an immediate and the sampler is either an immediate or
    *    bindless: the descriptor is fully immediate;
    *  - bindless surface: the surface handle is the extended descriptor;
    *  - otherwise: surface/sampler indices are combined at run time.
    */
5476   inst->sfid = BRW_SFID_SAMPLER;
5477   if (surface.file == IMM &&
5478       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
5479      inst->desc = brw_sampler_desc(devinfo,
5480                                    surface.ud + base_binding_table_index,
5481                                    sampler.file == IMM ? sampler.ud % 16 : 0,
5482                                    msg_type,
5483                                    simd_mode,
5484                                    0 /* return_format unused on gfx7+ */);
5485      inst->src[0] = brw_imm_ud(0);
5486      inst->src[1] = brw_imm_ud(0);
5487   } else if (surface_handle.file != BAD_FILE) {
5488      /* Bindless surface */
5489      assert(devinfo->ver >= 9);
5490      inst->desc = brw_sampler_desc(devinfo,
5491                                    GFX9_BTI_BINDLESS,
5492                                    sampler.file == IMM ? sampler.ud % 16 : 0,
5493                                    msg_type,
5494                                    simd_mode,
5495                                    0 /* return_format unused on gfx7+ */);
5496
5497      /* For bindless samplers, the entire address is included in the message
5498       * header so we can leave the portion in the message descriptor 0.
5499       */
5500      if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
5501         inst->src[0] = brw_imm_ud(0);
5502      } else {
         /* Dynamic sampler index: shifted into place (bits 8 and up) at
          * run time.
          */
5503         const fs_builder ubld = bld.group(1, 0).exec_all();
5504         fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5505         ubld.SHL(desc, sampler, brw_imm_ud(8));
5506         inst->src[0] = desc;
5507      }
5508
5509      /* We assume that the driver provided the handle in the top 20 bits so
5510       * we can use the surface handle directly as the extended descriptor.
5511       */
5512      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
5513   } else {
5514      /* Immediate portion of the descriptor */
5515      inst->desc = brw_sampler_desc(devinfo,
5516                                    0, /* surface */
5517                                    0, /* sampler */
5518                                    msg_type,
5519                                    simd_mode,
5520                                    0 /* return_format unused on gfx7+ */);
      /* Build (sampler << 8) | surface in a scalar temporary. */
5521      const fs_builder ubld = bld.group(1, 0).exec_all();
5522      fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5523      if (surface.equals(sampler)) {
5524         /* This case is common in GL */
         /* x * 0x101 == (x << 8) + x, done in one instruction. */
5525         ubld.MUL(desc, surface, brw_imm_ud(0x101));
5526      } else {
5527         if (sampler_handle.file != BAD_FILE) {
5528            ubld.MOV(desc, surface);
5529         } else if (sampler.file == IMM) {
5530            ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
5531         } else {
5532            ubld.SHL(desc, sampler, brw_imm_ud(8));
5533            ubld.OR(desc, desc, surface);
5534         }
5535      }
5536      if (base_binding_table_index)
5537         ubld.ADD(desc, desc, brw_imm_ud(base_binding_table_index));
      /* Keep only the low 12 bits of the combined value. */
5538      ubld.AND(desc, desc, brw_imm_ud(0xfff));
5539
5540      inst->src[0] = component(desc, 0);
5541      inst->src[1] = brw_imm_ud(0); /* ex_desc */
5542   }
5543
5544   inst->ex_desc = 0;
5545
   /* Final source layout: src[0] = descriptor, src[1] = extended
    * descriptor, src[2] = payload.
    */
5546   inst->src[2] = src_payload;
5547   inst->resize_sources(3);
5548
5549   if (inst->eot) {
5550      /* EOT sampler messages don't make sense to split because it would
5551       * involve ending half of the thread early.
5552       */
5553      assert(inst->group == 0);
5554      /* We need to use SENDC for EOT sampler messages */
5555      inst->check_tdr = true;
5556      inst->send_has_side_effects = true;
5557   }
5558
5559   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
5560   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
5561}
5562
5563static void
5564lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
5565{
5566   const intel_device_info *devinfo = bld.shader->devinfo;
5567   const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
5568   const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
5569   const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
5570   const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
5571   const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
5572   const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
5573   const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
5574   const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
5575   const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
5576   const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
5577   const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
5578   const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
5579   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
5580   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
5581   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
5582   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
5583
5584   if (devinfo->ver >= 7) {
5585      lower_sampler_logical_send_gfx7(bld, inst, op, coordinate,
5586                                      shadow_c, lod, lod2, min_lod,
5587                                      sample_index,
5588                                      mcs, surface, sampler,
5589                                      surface_handle, sampler_handle,
5590                                      tg4_offset,
5591                                      coord_components, grad_components);
5592   } else if (devinfo->ver >= 5) {
5593      lower_sampler_logical_send_gfx5(bld, inst, op, coordinate,
5594                                      shadow_c, lod, lod2, sample_index,
5595                                      surface, sampler,
5596                                      coord_components, grad_components);
5597   } else {
5598      lower_sampler_logical_send_gfx4(bld, inst, op, coordinate,
5599                                      shadow_c, lod, lod2,
5600                                      surface, sampler,
5601                                      coord_components, grad_components);
5602   }
5603}
5604
5605/**
5606 * Predicate the specified instruction on the sample mask.
5607 */
5608static void
5609emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
5610{
5611   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
5612          bld.group() == inst->group &&
5613          bld.dispatch_width() == inst->exec_size);
5614
5615   const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
5616   const fs_reg sample_mask = sample_mask_reg(bld);
5617   const unsigned subreg = sample_mask_flag_subreg(v);
5618
5619   if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {
5620      assert(sample_mask.file == ARF &&
5621             sample_mask.nr == brw_flag_subreg(subreg).nr &&
5622             sample_mask.subnr == brw_flag_subreg(
5623                subreg + inst->group / 16).subnr);
5624   } else {
5625      bld.group(1, 0).exec_all()
5626         .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
5627   }
5628
5629   if (inst->predicate) {
5630      assert(inst->predicate == BRW_PREDICATE_NORMAL);
5631      assert(!inst->predicate_inverse);
5632      assert(inst->flag_subreg == 0);
5633      /* Combine the sample mask with the existing predicate by using a
5634       * vertical predication mode.
5635       */
5636      inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
5637   } else {
5638      inst->flag_subreg = subreg;
5639      inst->predicate = BRW_PREDICATE_NORMAL;
5640      inst->predicate_inverse = false;
5641   }
5642}
5643
5644static void
5645setup_surface_descriptors(const fs_builder &bld, fs_inst *inst, uint32_t desc,
5646                          const fs_reg &surface, const fs_reg &surface_handle)
5647{
5648   const ASSERTED intel_device_info *devinfo = bld.shader->devinfo;
5649
5650   /* We must have exactly one of surface and surface_handle */
5651   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
5652
5653   if (surface.file == IMM) {
5654      inst->desc = desc | (surface.ud & 0xff);
5655      inst->src[0] = brw_imm_ud(0);
5656      inst->src[1] = brw_imm_ud(0); /* ex_desc */
5657   } else if (surface_handle.file != BAD_FILE) {
5658      /* Bindless surface */
5659      assert(devinfo->ver >= 9);
5660      inst->desc = desc | GFX9_BTI_BINDLESS;
5661      inst->src[0] = brw_imm_ud(0);
5662
5663      /* We assume that the driver provided the handle in the top 20 bits so
5664       * we can use the surface handle directly as the extended descriptor.
5665       */
5666      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
5667   } else {
5668      inst->desc = desc;
5669      const fs_builder ubld = bld.exec_all().group(1, 0);
5670      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5671      ubld.AND(tmp, surface, brw_imm_ud(0xff));
5672      inst->src[0] = component(tmp, 0);
5673      inst->src[1] = brw_imm_ud(0); /* ex_desc */
5674   }
5675}
5676
5677static void
5678lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
5679{
5680   const intel_device_info *devinfo = bld.shader->devinfo;
5681
5682   /* Get the logical send arguments. */
5683   const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
5684   const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
5685   const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
5686   const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
5687   const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
5688   const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
5689   const fs_reg &allow_sample_mask =
5690      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
5691   assert(arg.file == IMM);
5692   assert(allow_sample_mask.file == IMM);
5693
5694   /* Calculate the total number of components of the payload. */
5695   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
5696   const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
5697
5698   const bool is_typed_access =
5699      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
5700      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
5701      inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;
5702
5703   const bool is_surface_access = is_typed_access ||
5704      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL ||
5705      inst->opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL ||
5706      inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL;
5707
5708   const bool is_stateless =
5709      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
5710                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
5711
5712   const bool has_side_effects = inst->has_side_effects();
5713
5714   fs_reg sample_mask = allow_sample_mask.ud ? sample_mask_reg(bld) :
5715                                               fs_reg(brw_imm_d(0xffff));
5716
5717   /* From the BDW PRM Volume 7, page 147:
5718    *
5719    *  "For the Data Cache Data Port*, the header must be present for the
5720    *   following message types: [...] Typed read/write/atomics"
5721    *
5722    * Earlier generations have a similar wording.  Because of this restriction
5723    * we don't attempt to implement sample masks via predication for such
5724    * messages prior to Gfx9, since we have to provide a header anyway.  On
5725    * Gfx11+ the header has been removed so we can only use predication.
5726    *
5727    * For all stateless A32 messages, we also need a header
5728    */
5729   fs_reg header;
5730   if ((devinfo->ver < 9 && is_typed_access) || is_stateless) {
5731      fs_builder ubld = bld.exec_all().group(8, 0);
5732      header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
5733      if (is_stateless) {
5734         assert(!is_surface_access);
5735         ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
5736      } else {
5737         ubld.MOV(header, brw_imm_d(0));
5738         if (is_surface_access)
5739            ubld.group(1, 0).MOV(component(header, 7), sample_mask);
5740      }
5741   }
5742   const unsigned header_sz = header.file != BAD_FILE ? 1 : 0;
5743
5744   fs_reg payload, payload2;
5745   unsigned mlen, ex_mlen = 0;
5746   if (devinfo->ver >= 9 &&
5747       (src.file == BAD_FILE || header.file == BAD_FILE)) {
5748      /* We have split sends on gfx9 and above */
5749      if (header.file == BAD_FILE) {
5750         payload = bld.move_to_vgrf(addr, addr_sz);
5751         payload2 = bld.move_to_vgrf(src, src_sz);
5752         mlen = addr_sz * (inst->exec_size / 8);
5753         ex_mlen = src_sz * (inst->exec_size / 8);
5754      } else {
5755         assert(src.file == BAD_FILE);
5756         payload = header;
5757         payload2 = bld.move_to_vgrf(addr, addr_sz);
5758         mlen = header_sz;
5759         ex_mlen = addr_sz * (inst->exec_size / 8);
5760      }
5761   } else {
5762      /* Allocate space for the payload. */
5763      const unsigned sz = header_sz + addr_sz + src_sz;
5764      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
5765      fs_reg *const components = new fs_reg[sz];
5766      unsigned n = 0;
5767
5768      /* Construct the payload. */
5769      if (header.file != BAD_FILE)
5770         components[n++] = header;
5771
5772      for (unsigned i = 0; i < addr_sz; i++)
5773         components[n++] = offset(addr, bld, i);
5774
5775      for (unsigned i = 0; i < src_sz; i++)
5776         components[n++] = offset(src, bld, i);
5777
5778      bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
5779      mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
5780
5781      delete[] components;
5782   }
5783
5784   /* Predicate the instruction on the sample mask if no header is
5785    * provided.
5786    */
5787   if ((header.file == BAD_FILE || !is_surface_access) &&
5788       sample_mask.file != BAD_FILE && sample_mask.file != IMM)
5789      emit_predicate_on_sample_mask(bld, inst);
5790
5791   uint32_t sfid;
5792   switch (inst->opcode) {
5793   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
5794   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
5795      /* Byte scattered opcodes go through the normal data cache */
5796      sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
5797      break;
5798
5799   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
5800   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
5801      sfid =  devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE :
5802              devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE :
5803                                  BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
5804      break;
5805
5806   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
5807   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
5808   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
5809   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
5810      /* Untyped Surface messages go through the data cache but the SFID value
5811       * changed on Haswell.
5812       */
5813      sfid = (devinfo->verx10 >= 75 ?
5814              HSW_SFID_DATAPORT_DATA_CACHE_1 :
5815              GFX7_SFID_DATAPORT_DATA_CACHE);
5816      break;
5817
5818   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
5819   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
5820   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
5821      /* Typed surface messages go through the render cache on IVB and the
5822       * data cache on HSW+.
5823       */
5824      sfid = (devinfo->verx10 >= 75 ?
5825              HSW_SFID_DATAPORT_DATA_CACHE_1 :
5826              GFX6_SFID_DATAPORT_RENDER_CACHE);
5827      break;
5828
5829   default:
5830      unreachable("Unsupported surface opcode");
5831   }
5832
5833   uint32_t desc;
5834   switch (inst->opcode) {
5835   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
5836      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
5837                                            arg.ud, /* num_channels */
5838                                            false   /* write */);
5839      break;
5840
5841   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
5842      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
5843                                            arg.ud, /* num_channels */
5844                                            true    /* write */);
5845      break;
5846
5847   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
5848      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
5849                                           arg.ud, /* bit_size */
5850                                           false   /* write */);
5851      break;
5852
5853   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
5854      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
5855                                           arg.ud, /* bit_size */
5856                                           true    /* write */);
5857      break;
5858
5859   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
5860      assert(arg.ud == 32); /* bit_size */
5861      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
5862                                            false  /* write */);
5863      break;
5864
5865   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
5866      assert(arg.ud == 32); /* bit_size */
5867      desc = brw_dp_dword_scattered_rw_desc(devinfo, inst->exec_size,
5868                                            true   /* write */);
5869      break;
5870
5871   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
5872      desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
5873                                        arg.ud, /* atomic_op */
5874                                        !inst->dst.is_null());
5875      break;
5876
5877   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
5878      desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
5879                                              arg.ud, /* atomic_op */
5880                                              !inst->dst.is_null());
5881      break;
5882
5883   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
5884      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
5885                                          arg.ud, /* num_channels */
5886                                          false   /* write */);
5887      break;
5888
5889   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
5890      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
5891                                          arg.ud, /* num_channels */
5892                                          true    /* write */);
5893      break;
5894
5895   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
5896      desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
5897                                      arg.ud, /* atomic_op */
5898                                      !inst->dst.is_null());
5899      break;
5900
5901   default:
5902      unreachable("Unknown surface logical instruction");
5903   }
5904
5905   /* Update the original instruction. */
5906   inst->opcode = SHADER_OPCODE_SEND;
5907   inst->mlen = mlen;
5908   inst->ex_mlen = ex_mlen;
5909   inst->header_size = header_sz;
5910   inst->send_has_side_effects = has_side_effects;
5911   inst->send_is_volatile = !has_side_effects;
5912
5913   /* Set up SFID and descriptors */
5914   inst->sfid = sfid;
5915   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
5916
5917   /* Finally, the payload */
5918   inst->src[2] = payload;
5919   inst->src[3] = payload2;
5920
5921   inst->resize_sources(4);
5922}
5923
5924static enum lsc_opcode
5925brw_atomic_op_to_lsc_atomic_op(unsigned op)
5926{
5927   switch(op) {
5928   case BRW_AOP_AND:
5929      return LSC_OP_ATOMIC_AND;
5930   case BRW_AOP_OR:
5931      return LSC_OP_ATOMIC_OR;
5932   case BRW_AOP_XOR:
5933      return LSC_OP_ATOMIC_XOR;
5934   case BRW_AOP_MOV:
5935      return LSC_OP_ATOMIC_STORE;
5936   case BRW_AOP_INC:
5937      return LSC_OP_ATOMIC_INC;
5938   case BRW_AOP_DEC:
5939      return LSC_OP_ATOMIC_DEC;
5940   case BRW_AOP_ADD:
5941      return LSC_OP_ATOMIC_ADD;
5942   case BRW_AOP_SUB:
5943      return LSC_OP_ATOMIC_SUB;
5944   case BRW_AOP_IMAX:
5945      return LSC_OP_ATOMIC_MAX;
5946   case BRW_AOP_IMIN:
5947      return LSC_OP_ATOMIC_MIN;
5948   case BRW_AOP_UMAX:
5949      return LSC_OP_ATOMIC_UMAX;
5950   case BRW_AOP_UMIN:
5951      return LSC_OP_ATOMIC_UMIN;
5952   case BRW_AOP_CMPWR:
5953      return LSC_OP_ATOMIC_CMPXCHG;
5954   default:
5955      assert(false);
5956      unreachable("invalid atomic opcode");
5957   }
5958}
5959
5960static enum lsc_opcode
5961brw_atomic_op_to_lsc_fatomic_op(uint32_t aop)
5962{
5963   switch(aop) {
5964   case BRW_AOP_FMAX:
5965      return LSC_OP_ATOMIC_FMAX;
5966   case BRW_AOP_FMIN:
5967      return LSC_OP_ATOMIC_FMIN;
5968   case BRW_AOP_FCMPWR:
5969      return LSC_OP_ATOMIC_FCMPXCHG;
5970   case BRW_AOP_FADD:
5971      return LSC_OP_ATOMIC_FADD;
5972   default:
5973      unreachable("Unsupported float atomic opcode");
5974   }
5975}
5976
5977static enum lsc_data_size
5978lsc_bits_to_data_size(unsigned bit_size)
5979{
5980   switch (bit_size / 8) {
5981   case 1:  return LSC_DATA_SIZE_D8U32;
5982   case 2:  return LSC_DATA_SIZE_D16U32;
5983   case 4:  return LSC_DATA_SIZE_D32;
5984   case 8:  return LSC_DATA_SIZE_D64;
5985   default:
5986      unreachable("Unsupported data size.");
5987   }
5988}
5989
5990static void
5991lower_lsc_surface_logical_send(const fs_builder &bld, fs_inst *inst)
5992{
5993   const intel_device_info *devinfo = bld.shader->devinfo;
5994   assert(devinfo->has_lsc);
5995
5996   /* Get the logical send arguments. */
5997   const fs_reg addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
5998   const fs_reg src = inst->src[SURFACE_LOGICAL_SRC_DATA];
5999   const fs_reg surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
6000   const fs_reg surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
6001   const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
6002   const fs_reg arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
6003   const fs_reg allow_sample_mask =
6004      inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK];
6005   assert(arg.file == IMM);
6006   assert(allow_sample_mask.file == IMM);
6007
6008   /* Calculate the total number of components of the payload. */
6009   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
6010   const unsigned src_comps = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
6011   const unsigned src_sz = type_sz(src.type);
6012
6013   const bool has_side_effects = inst->has_side_effects();
6014
6015   unsigned ex_mlen = 0;
6016   fs_reg payload, payload2;
6017   payload = bld.move_to_vgrf(addr, addr_sz);
6018   if (src.file != BAD_FILE) {
6019      payload2 = bld.move_to_vgrf(src, src_comps);
6020      ex_mlen = (src_comps * src_sz * inst->exec_size) / REG_SIZE;
6021   }
6022
6023   /* Predicate the instruction on the sample mask if needed */
6024   fs_reg sample_mask = allow_sample_mask.ud ? sample_mask_reg(bld) :
6025                                               fs_reg(brw_imm_d(0xffff));
6026   if (sample_mask.file != BAD_FILE && sample_mask.file != IMM)
6027      emit_predicate_on_sample_mask(bld, inst);
6028
6029   if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
6030      inst->sfid = GFX12_SFID_SLM;
6031   else
6032      inst->sfid = GFX12_SFID_UGM;
6033
6034   /* We must have exactly one of surface and surface_handle */
6035   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
6036
6037   enum lsc_addr_surface_type surf_type;
6038   if (surface_handle.file != BAD_FILE)
6039      surf_type = LSC_ADDR_SURFTYPE_BSS;
6040   else if (surface.file == IMM && surface.ud == GFX7_BTI_SLM)
6041      surf_type = LSC_ADDR_SURFTYPE_FLAT;
6042   else
6043      surf_type = LSC_ADDR_SURFTYPE_BTI;
6044
6045   switch (inst->opcode) {
6046   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
6047      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
6048                                surf_type, LSC_ADDR_SIZE_A32,
6049                                1 /* num_coordinates */,
6050                                LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
6051                                false /* transpose */,
6052                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
6053                                true /* has_dest */);
6054      break;
6055   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
6056      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
6057                                surf_type, LSC_ADDR_SIZE_A32,
6058                                1 /* num_coordinates */,
6059                                LSC_DATA_SIZE_D32, arg.ud /* num_channels */,
6060                                false /* transpose */,
6061                                LSC_CACHE_STORE_L1STATE_L3MOCS,
6062                                false /* has_dest */);
6063      break;
6064   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
6065   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
6066      /* Bspec: Atomic instruction -> Cache section:
6067       *
6068       *    Atomic messages are always forced to "un-cacheable" in the L1
6069       *    cache.
6070       */
6071      enum lsc_opcode opcode =
6072         inst->opcode == SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL ?
6073         brw_atomic_op_to_lsc_fatomic_op(arg.ud) :
6074         brw_atomic_op_to_lsc_atomic_op(arg.ud);
6075      inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
6076                                surf_type, LSC_ADDR_SIZE_A32,
6077                                1 /* num_coordinates */,
6078                                lsc_bits_to_data_size(src_sz * 8),
6079                                1 /* num_channels */,
6080                                false /* transpose */,
6081                                LSC_CACHE_STORE_L1UC_L3WB,
6082                                !inst->dst.is_null());
6083      break;
6084   }
6085   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
6086      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
6087                                surf_type, LSC_ADDR_SIZE_A32,
6088                                1 /* num_coordinates */,
6089                                lsc_bits_to_data_size(arg.ud),
6090                                1 /* num_channels */,
6091                                false /* transpose */,
6092                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
6093                                true /* has_dest */);
6094      break;
6095   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
6096      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
6097                                surf_type, LSC_ADDR_SIZE_A32,
6098                                1 /* num_coordinates */,
6099                                lsc_bits_to_data_size(arg.ud),
6100                                1 /* num_channels */,
6101                                false /* transpose */,
6102                                LSC_CACHE_STORE_L1STATE_L3MOCS,
6103                                false /* has_dest */);
6104      break;
6105   default:
6106      unreachable("Unknown surface logical instruction");
6107   }
6108
6109   inst->src[0] = brw_imm_ud(0);
6110
6111   /* Set up extended descriptors */
6112   switch (surf_type) {
6113   case LSC_ADDR_SURFTYPE_FLAT:
6114      inst->src[1] = brw_imm_ud(0);
6115      break;
6116   case LSC_ADDR_SURFTYPE_BSS:
6117      /* We assume that the driver provided the handle in the top 20 bits so
6118       * we can use the surface handle directly as the extended descriptor.
6119       */
6120      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
6121      break;
6122   case LSC_ADDR_SURFTYPE_BTI:
6123      if (surface.file == IMM) {
6124         inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, surface.ud));
6125      } else {
6126         const fs_builder ubld = bld.exec_all().group(1, 0);
6127         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6128         ubld.SHL(tmp, surface, brw_imm_ud(24));
6129         inst->src[1] = component(tmp, 0);
6130      }
6131      break;
6132   default:
6133      unreachable("Unknown surface type");
6134   }
6135
6136   /* Update the original instruction. */
6137   inst->opcode = SHADER_OPCODE_SEND;
6138   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6139   inst->ex_mlen = ex_mlen;
6140   inst->header_size = 0;
6141   inst->send_has_side_effects = has_side_effects;
6142   inst->send_is_volatile = !has_side_effects;
6143
6144   /* Finally, the payload */
6145   inst->src[2] = payload;
6146   inst->src[3] = payload2;
6147
6148   inst->resize_sources(4);
6149}
6150
6151static void
6152lower_surface_block_logical_send(const fs_builder &bld, fs_inst *inst)
6153{
6154   const intel_device_info *devinfo = bld.shader->devinfo;
6155   assert(devinfo->ver >= 9);
6156
6157   /* Get the logical send arguments. */
6158   const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
6159   const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
6160   const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
6161   const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
6162   const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
6163   assert(arg.file == IMM);
6164   assert(inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == BAD_FILE);
6165   assert(inst->src[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK].file == BAD_FILE);
6166
6167   const bool is_stateless =
6168      surface.file == IMM && (surface.ud == BRW_BTI_STATELESS ||
6169                              surface.ud == GFX8_BTI_STATELESS_NON_COHERENT);
6170
6171   const bool has_side_effects = inst->has_side_effects();
6172
6173   const bool align_16B =
6174      inst->opcode != SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL;
6175
6176   const bool write = inst->opcode == SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL;
6177
6178   /* The address is stored in the header.  See MH_A32_GO and MH_BTS_GO. */
6179   fs_builder ubld = bld.exec_all().group(8, 0);
6180   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6181
6182   if (is_stateless)
6183      ubld.emit(SHADER_OPCODE_SCRATCH_HEADER, header);
6184   else
6185      ubld.MOV(header, brw_imm_d(0));
6186
6187   /* Address in OWord units when aligned to OWords. */
6188   if (align_16B)
6189      ubld.group(1, 0).SHR(component(header, 2), addr, brw_imm_ud(4));
6190   else
6191      ubld.group(1, 0).MOV(component(header, 2), addr);
6192
6193   fs_reg data;
6194   unsigned ex_mlen = 0;
6195   if (write) {
6196      const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);
6197      data = retype(bld.move_to_vgrf(src, src_sz), BRW_REGISTER_TYPE_UD);
6198      ex_mlen = src_sz * type_sz(src.type) * inst->exec_size / REG_SIZE;
6199   }
6200
6201   inst->opcode = SHADER_OPCODE_SEND;
6202   inst->mlen = 1;
6203   inst->ex_mlen = ex_mlen;
6204   inst->header_size = 1;
6205   inst->send_has_side_effects = has_side_effects;
6206   inst->send_is_volatile = !has_side_effects;
6207
6208   inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
6209
6210   const uint32_t desc = brw_dp_oword_block_rw_desc(devinfo, align_16B,
6211                                                    arg.ud, write);
6212   setup_surface_descriptors(bld, inst, desc, surface, surface_handle);
6213
6214   inst->src[2] = header;
6215   inst->src[3] = data;
6216
6217   inst->resize_sources(4);
6218}
6219
6220static fs_reg
6221emit_a64_oword_block_header(const fs_builder &bld, const fs_reg &addr)
6222{
6223   const fs_builder ubld = bld.exec_all().group(8, 0);
6224   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6225   ubld.MOV(header, brw_imm_ud(0));
6226
6227   /* Use a 2-wide MOV to fill out the address */
6228   assert(type_sz(addr.type) == 8 && addr.stride == 0);
6229   fs_reg addr_vec2 = addr;
6230   addr_vec2.type = BRW_REGISTER_TYPE_UD;
6231   addr_vec2.stride = 1;
6232   ubld.group(2, 0).MOV(header, addr_vec2);
6233
6234   return header;
6235}
6236
6237static void
6238lower_lsc_a64_logical_send(const fs_builder &bld, fs_inst *inst)
6239{
6240   const intel_device_info *devinfo = bld.shader->devinfo;
6241
6242   /* Get the logical send arguments. */
6243   const fs_reg &addr = inst->src[0];
6244   const fs_reg &src = inst->src[1];
6245   const unsigned src_sz = type_sz(src.type);
6246
6247   const unsigned src_comps = inst->components_read(1);
6248   assert(inst->src[2].file == IMM);
6249   const unsigned arg = inst->src[2].ud;
6250   const bool has_side_effects = inst->has_side_effects();
6251
6252   /* If the surface message has side effects and we're a fragment shader, we
6253    * have to predicate with the sample mask to avoid helper invocations.
6254    */
6255   if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)
6256      emit_predicate_on_sample_mask(bld, inst);
6257
6258   fs_reg payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
6259   fs_reg payload2 = retype(bld.move_to_vgrf(src, src_comps),
6260                            BRW_REGISTER_TYPE_UD);
6261   unsigned ex_mlen = src_comps * src_sz * inst->exec_size / REG_SIZE;
6262
6263   switch (inst->opcode) {
6264   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
6265      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
6266                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6267                                1 /* num_coordinates */,
6268                                LSC_DATA_SIZE_D32, arg /* num_channels */,
6269                                false /* transpose */,
6270                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
6271                                true /* has_dest */);
6272      break;
6273   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
6274      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE_CMASK, inst->exec_size,
6275                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6276                                1 /* num_coordinates */,
6277                                LSC_DATA_SIZE_D32, arg /* num_channels */,
6278                                false /* transpose */,
6279                                LSC_CACHE_STORE_L1STATE_L3MOCS,
6280                                false /* has_dest */);
6281      break;
6282   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
6283      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
6284                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6285                                1 /* num_coordinates */,
6286                                lsc_bits_to_data_size(arg),
6287                                1 /* num_channels */,
6288                                false /* transpose */,
6289                                LSC_CACHE_STORE_L1STATE_L3MOCS,
6290                                true /* has_dest */);
6291      break;
6292   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
6293      inst->desc = lsc_msg_desc(devinfo, LSC_OP_STORE, inst->exec_size,
6294                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6295                                1 /* num_coordinates */,
6296                                lsc_bits_to_data_size(arg),
6297                                1 /* num_channels */,
6298                                false /* transpose */,
6299                                LSC_CACHE_STORE_L1STATE_L3MOCS,
6300                                false /* has_dest */);
6301      break;
6302   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
6303   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
6304   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL: {
6305   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
6306   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
6307   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL:
6308      /* Bspec: Atomic instruction -> Cache section:
6309       *
6310       *    Atomic messages are always forced to "un-cacheable" in the L1
6311       *    cache.
6312       */
6313      enum lsc_opcode opcode =
6314         (inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL ||
6315          inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL ||
6316          inst->opcode == SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL) ?
6317         brw_atomic_op_to_lsc_atomic_op(arg) :
6318         brw_atomic_op_to_lsc_fatomic_op(arg);
6319      inst->desc = lsc_msg_desc(devinfo, opcode, inst->exec_size,
6320                                LSC_ADDR_SURFTYPE_FLAT, LSC_ADDR_SIZE_A64,
6321                                1 /* num_coordinates */,
6322                                lsc_bits_to_data_size(src_sz * 8),
6323                                1 /* num_channels */,
6324                                false /* transpose */,
6325                                LSC_CACHE_STORE_L1UC_L3WB,
6326                                !inst->dst.is_null());
6327      break;
6328   }
6329   default:
6330      unreachable("Unknown A64 logical instruction");
6331   }
6332
6333   /* Update the original instruction. */
6334   inst->opcode = SHADER_OPCODE_SEND;
6335   inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6336   inst->ex_mlen = ex_mlen;
6337   inst->header_size = 0;
6338   inst->send_has_side_effects = has_side_effects;
6339   inst->send_is_volatile = !has_side_effects;
6340
6341   /* Set up SFID and descriptors */
6342   inst->sfid = GFX12_SFID_UGM;
6343   inst->resize_sources(4);
6344   inst->src[0] = brw_imm_ud(0); /* desc */
6345   inst->src[1] = brw_imm_ud(0); /* ex_desc */
6346   inst->src[2] = payload;
6347   inst->src[3] = payload2;
6348}
6349
6350static void
6351lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
6352{
6353   const intel_device_info *devinfo = bld.shader->devinfo;
6354
6355   const fs_reg &addr = inst->src[0];
6356   const fs_reg &src = inst->src[1];
6357   const unsigned src_comps = inst->components_read(1);
6358   assert(inst->src[2].file == IMM);
6359   const unsigned arg = inst->src[2].ud;
6360   const bool has_side_effects = inst->has_side_effects();
6361
6362   /* If the surface message has side effects and we're a fragment shader, we
6363    * have to predicate with the sample mask to avoid helper invocations.
6364    */
6365   if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT)
6366      emit_predicate_on_sample_mask(bld, inst);
6367
6368   fs_reg payload, payload2;
6369   unsigned mlen, ex_mlen = 0, header_size = 0;
6370   if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL ||
6371       inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL ||
6372       inst->opcode == SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL) {
6373      assert(devinfo->ver >= 9);
6374
6375      /* OWORD messages only take a scalar address in a header */
6376      mlen = 1;
6377      header_size = 1;
6378      payload = emit_a64_oword_block_header(bld, addr);
6379
6380      if (inst->opcode == SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL) {
6381         ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
6382         payload2 = retype(bld.move_to_vgrf(src, src_comps),
6383                           BRW_REGISTER_TYPE_UD);
6384      }
6385   } else if (devinfo->ver >= 9) {
6386      /* On Skylake and above, we have SENDS */
6387      mlen = 2 * (inst->exec_size / 8);
6388      ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
6389      payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
6390      payload2 = retype(bld.move_to_vgrf(src, src_comps),
6391                        BRW_REGISTER_TYPE_UD);
6392   } else {
6393      /* Add two because the address is 64-bit */
6394      const unsigned dwords = 2 + src_comps;
6395      mlen = dwords * (inst->exec_size / 8);
6396
6397      fs_reg sources[5];
6398
6399      sources[0] = addr;
6400
6401      for (unsigned i = 0; i < src_comps; i++)
6402         sources[1 + i] = offset(src, bld, i);
6403
6404      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);
6405      bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
6406   }
6407
6408   uint32_t desc;
6409   switch (inst->opcode) {
6410   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
6411      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
6412                                                arg,   /* num_channels */
6413                                                false  /* write */);
6414      break;
6415
6416   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
6417      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
6418                                                arg,   /* num_channels */
6419                                                true   /* write */);
6420      break;
6421
6422   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
6423      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
6424                                            true,    /* align_16B */
6425                                            arg,     /* num_dwords */
6426                                            false    /* write */);
6427      break;
6428
6429   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
6430      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
6431                                            false,   /* align_16B */
6432                                            arg,     /* num_dwords */
6433                                            false    /* write */);
6434      break;
6435
6436   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
6437      desc = brw_dp_a64_oword_block_rw_desc(devinfo,
6438                                            true,    /* align_16B */
6439                                            arg,     /* num_dwords */
6440                                            true     /* write */);
6441      break;
6442
6443   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
6444      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
6445                                               arg,   /* bit_size */
6446                                               false  /* write */);
6447      break;
6448
6449   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
6450      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
6451                                               arg,   /* bit_size */
6452                                               true   /* write */);
6453      break;
6454
6455   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
6456      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32,
6457                                            arg,   /* atomic_op */
6458                                            !inst->dst.is_null());
6459      break;
6460
6461   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
6462      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 16,
6463                                            arg,   /* atomic_op */
6464                                            !inst->dst.is_null());
6465      break;
6466
6467   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
6468      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64,
6469                                            arg,   /* atomic_op */
6470                                            !inst->dst.is_null());
6471      break;
6472
6473   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
6474      desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
6475                                                  16, /* bit_size */
6476                                                  arg,   /* atomic_op */
6477                                                  !inst->dst.is_null());
6478      break;
6479
6480   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
6481      desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
6482                                                  32, /* bit_size */
6483                                                  arg,   /* atomic_op */
6484                                                  !inst->dst.is_null());
6485      break;
6486
6487   default:
6488      unreachable("Unknown A64 logical instruction");
6489   }
6490
6491   /* Update the original instruction. */
6492   inst->opcode = SHADER_OPCODE_SEND;
6493   inst->mlen = mlen;
6494   inst->ex_mlen = ex_mlen;
6495   inst->header_size = header_size;
6496   inst->send_has_side_effects = has_side_effects;
6497   inst->send_is_volatile = !has_side_effects;
6498
6499   /* Set up SFID and descriptors */
6500   inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
6501   inst->desc = desc;
6502   inst->resize_sources(4);
6503   inst->src[0] = brw_imm_ud(0); /* desc */
6504   inst->src[1] = brw_imm_ud(0); /* ex_desc */
6505   inst->src[2] = payload;
6506   inst->src[3] = payload2;
6507}
6508
6509static void
6510lower_lsc_varying_pull_constant_logical_send(const fs_builder &bld,
6511                                             fs_inst *inst)
6512{
6513   const intel_device_info *devinfo = bld.shader->devinfo;
6514   ASSERTED const brw_compiler *compiler = bld.shader->compiler;
6515
6516   fs_reg index = inst->src[0];
6517
6518   /* We are switching the instruction from an ALU-like instruction to a
6519    * send-from-grf instruction.  Since sends can't handle strides or
6520    * source modifiers, we have to make a copy of the offset source.
6521    */
6522   fs_reg ubo_offset = bld.move_to_vgrf(inst->src[1], 1);
6523
6524   assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
6525   unsigned alignment = inst->src[2].ud;
6526
6527   inst->opcode = SHADER_OPCODE_SEND;
6528   inst->sfid = GFX12_SFID_UGM;
6529   inst->resize_sources(3);
6530   inst->src[0] = brw_imm_ud(0);
6531
6532   if (index.file == IMM) {
6533      inst->src[1] = brw_imm_ud(lsc_bti_ex_desc(devinfo, index.ud));
6534   } else {
6535      const fs_builder ubld = bld.exec_all().group(1, 0);
6536      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6537      ubld.SHL(tmp, index, brw_imm_ud(24));
6538      inst->src[1] = component(tmp, 0);
6539   }
6540
6541   assert(!compiler->indirect_ubos_use_sampler);
6542
6543   inst->src[2] = ubo_offset; /* payload */
6544   if (alignment >= 4) {
6545      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD_CMASK, inst->exec_size,
6546                                LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
6547                                1 /* num_coordinates */,
6548                                LSC_DATA_SIZE_D32,
6549                                4 /* num_channels */,
6550                                false /* transpose */,
6551                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
6552                                true /* has_dest */);
6553      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6554   } else {
6555      inst->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD, inst->exec_size,
6556                                LSC_ADDR_SURFTYPE_BTI, LSC_ADDR_SIZE_A32,
6557                                1 /* num_coordinates */,
6558                                LSC_DATA_SIZE_D32,
6559                                1 /* num_channels */,
6560                                false /* transpose */,
6561                                LSC_CACHE_LOAD_L1STATE_L3MOCS,
6562                                true /* has_dest */);
6563      inst->mlen = lsc_msg_desc_src0_len(devinfo, inst->desc);
6564      /* The byte scattered messages can only read one dword at a time so
6565       * we have to duplicate the message 4 times to read the full vec4.
6566       * Hopefully, dead code will clean up the mess if some of them aren't
6567       * needed.
6568       */
6569      assert(inst->size_written == 16 * inst->exec_size);
6570      inst->size_written /= 4;
6571      for (unsigned c = 1; c < 4; c++) {
6572         /* Emit a copy of the instruction because we're about to modify
6573          * it.  Because this loop starts at 1, we will emit copies for the
6574          * first 3 and the final one will be the modified instruction.
6575          */
6576         bld.emit(*inst);
6577
6578         /* Offset the source */
6579         inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
6580         bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
6581
6582         /* Offset the destination */
6583         inst->dst = offset(inst->dst, bld, 1);
6584      }
6585   }
6586}
6587
6588static void
6589lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
6590{
6591   const intel_device_info *devinfo = bld.shader->devinfo;
6592   const brw_compiler *compiler = bld.shader->compiler;
6593
6594   if (devinfo->ver >= 7) {
6595      fs_reg index = inst->src[0];
6596      /* We are switching the instruction from an ALU-like instruction to a
6597       * send-from-grf instruction.  Since sends can't handle strides or
6598       * source modifiers, we have to make a copy of the offset source.
6599       */
6600      fs_reg ubo_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
6601      bld.MOV(ubo_offset, inst->src[1]);
6602
6603      assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
6604      unsigned alignment = inst->src[2].ud;
6605
6606      inst->opcode = SHADER_OPCODE_SEND;
6607      inst->mlen = inst->exec_size / 8;
6608      inst->resize_sources(3);
6609
6610      if (index.file == IMM) {
6611         inst->desc = index.ud & 0xff;
6612         inst->src[0] = brw_imm_ud(0);
6613      } else {
6614         inst->desc = 0;
6615         const fs_builder ubld = bld.exec_all().group(1, 0);
6616         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6617         ubld.AND(tmp, index, brw_imm_ud(0xff));
6618         inst->src[0] = component(tmp, 0);
6619      }
6620      inst->src[1] = brw_imm_ud(0); /* ex_desc */
6621      inst->src[2] = ubo_offset; /* payload */
6622
6623      if (compiler->indirect_ubos_use_sampler) {
6624         const unsigned simd_mode =
6625            inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
6626                                   BRW_SAMPLER_SIMD_MODE_SIMD16;
6627
6628         inst->sfid = BRW_SFID_SAMPLER;
6629         inst->desc |= brw_sampler_desc(devinfo, 0, 0,
6630                                        GFX5_SAMPLER_MESSAGE_SAMPLE_LD,
6631                                        simd_mode, 0);
6632      } else if (alignment >= 4) {
6633         inst->sfid = (devinfo->verx10 >= 75 ?
6634                       HSW_SFID_DATAPORT_DATA_CACHE_1 :
6635                       GFX7_SFID_DATAPORT_DATA_CACHE);
6636         inst->desc |= brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
6637                                                      4, /* num_channels */
6638                                                      false   /* write */);
6639      } else {
6640         inst->sfid = GFX7_SFID_DATAPORT_DATA_CACHE;
6641         inst->desc |= brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
6642                                                     32,     /* bit_size */
6643                                                     false   /* write */);
6644         /* The byte scattered messages can only read one dword at a time so
6645          * we have to duplicate the message 4 times to read the full vec4.
6646          * Hopefully, dead code will clean up the mess if some of them aren't
6647          * needed.
6648          */
6649         assert(inst->size_written == 16 * inst->exec_size);
6650         inst->size_written /= 4;
6651         for (unsigned c = 1; c < 4; c++) {
6652            /* Emit a copy of the instruction because we're about to modify
6653             * it.  Because this loop starts at 1, we will emit copies for the
6654             * first 3 and the final one will be the modified instruction.
6655             */
6656            bld.emit(*inst);
6657
6658            /* Offset the source */
6659            inst->src[2] = bld.vgrf(BRW_REGISTER_TYPE_UD);
6660            bld.ADD(inst->src[2], ubo_offset, brw_imm_ud(c * 4));
6661
6662            /* Offset the destination */
6663            inst->dst = offset(inst->dst, bld, 1);
6664         }
6665      }
6666   } else {
6667      const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->ver),
6668                           BRW_REGISTER_TYPE_UD);
6669
6670      bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);
6671
6672      inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4;
6673      inst->resize_sources(1);
6674      inst->base_mrf = payload.nr;
6675      inst->header_size = 1;
6676      inst->mlen = 1 + inst->exec_size / 8;
6677   }
6678}
6679
6680static void
6681lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
6682{
6683   assert(bld.shader->devinfo->ver < 6);
6684
6685   inst->base_mrf = 2;
6686   inst->mlen = inst->sources * inst->exec_size / 8;
6687
6688   if (inst->sources > 1) {
6689      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
6690       * "Message Payload":
6691       *
6692       * "Operand0[7].  For the INT DIV functions, this operand is the
6693       *  denominator."
6694       *  ...
6695       * "Operand1[7].  For the INT DIV functions, this operand is the
6696       *  numerator."
6697       */
6698      const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
6699      const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
6700      const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
6701
6702      inst->resize_sources(1);
6703      inst->src[0] = src0;
6704
6705      assert(inst->exec_size == 8);
6706      bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
6707   }
6708}
6709
6710static void
6711lower_btd_logical_send(const fs_builder &bld, fs_inst *inst)
6712{
6713   const intel_device_info *devinfo = bld.shader->devinfo;
6714   fs_reg global_addr = inst->src[0];
6715   const fs_reg &btd_record = inst->src[1];
6716
6717   const unsigned mlen = 2;
6718   const fs_builder ubld = bld.exec_all().group(8, 0);
6719   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
6720
6721   ubld.MOV(header, brw_imm_ud(0));
6722   switch (inst->opcode) {
6723   case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
6724      assert(type_sz(global_addr.type) == 8 && global_addr.stride == 0);
6725      global_addr.type = BRW_REGISTER_TYPE_UD;
6726      global_addr.stride = 1;
6727      ubld.group(2, 0).MOV(header, global_addr);
6728      break;
6729
6730   case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
6731      /* The bottom bit is the Stack ID release bit */
6732      ubld.group(1, 0).MOV(header, brw_imm_ud(1));
6733      break;
6734
6735   default:
6736      unreachable("Invalid BTD message");
6737   }
6738
6739   /* Stack IDs are always in R1 regardless of whether we're coming from a
6740    * bindless shader or a regular compute shader.
6741    */
6742   fs_reg stack_ids =
6743      retype(byte_offset(header, REG_SIZE), BRW_REGISTER_TYPE_UW);
6744   bld.MOV(stack_ids, retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW));
6745
6746   unsigned ex_mlen = 0;
6747   fs_reg payload;
6748   if (inst->opcode == SHADER_OPCODE_BTD_SPAWN_LOGICAL) {
6749      ex_mlen = 2 * (inst->exec_size / 8);
6750      payload = bld.move_to_vgrf(btd_record, 1);
6751   } else {
6752      assert(inst->opcode == SHADER_OPCODE_BTD_RETIRE_LOGICAL);
6753      /* All these messages take a BTD and things complain if we don't provide
6754       * one for RETIRE.  However, it shouldn't ever actually get used so fill
6755       * it with zero.
6756       */
6757      ex_mlen = 2 * (inst->exec_size / 8);
6758      payload = bld.move_to_vgrf(brw_imm_uq(0), 1);
6759   }
6760
6761   /* Update the original instruction. */
6762   inst->opcode = SHADER_OPCODE_SEND;
6763   inst->mlen = mlen;
6764   inst->ex_mlen = ex_mlen;
6765   inst->header_size = 0; /* HW docs require has_header = false */
6766   inst->send_has_side_effects = true;
6767   inst->send_is_volatile = false;
6768
6769   /* Set up SFID and descriptors */
6770   inst->sfid = GEN_RT_SFID_BINDLESS_THREAD_DISPATCH;
6771   inst->desc = brw_btd_spawn_desc(devinfo, inst->exec_size,
6772                                   GEN_RT_BTD_MESSAGE_SPAWN);
6773   inst->resize_sources(4);
6774   inst->src[0] = brw_imm_ud(0); /* desc */
6775   inst->src[1] = brw_imm_ud(0); /* ex_desc */
6776   inst->src[2] = header;
6777   inst->src[3] = payload;
6778}
6779
6780static void
6781lower_trace_ray_logical_send(const fs_builder &bld, fs_inst *inst)
6782{
6783   const intel_device_info *devinfo = bld.shader->devinfo;
6784   const fs_reg &bvh_level = inst->src[0];
6785   assert(inst->src[1].file == BRW_IMMEDIATE_VALUE);
6786   const uint32_t trace_ray_control = inst->src[1].ud;
6787
6788   const unsigned mlen = 1;
6789   const fs_builder ubld = bld.exec_all().group(8, 0);
6790   fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD);
6791   ubld.MOV(header, brw_imm_ud(0));
6792   ubld.group(2, 0).MOV(header,
6793      retype(brw_vec2_grf(2, 0), BRW_REGISTER_TYPE_UD));
6794   /* TODO: Bit 128 is ray_query */
6795
6796   const unsigned ex_mlen = inst->exec_size / 8;
6797   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD);
6798   const uint32_t trc_bits = SET_BITS(trace_ray_control, 9, 8);
6799   if (bvh_level.file == BRW_IMMEDIATE_VALUE) {
6800      bld.MOV(payload, brw_imm_ud(trc_bits | (bvh_level.ud & 0x7)));
6801   } else {
6802      bld.AND(payload, bvh_level, brw_imm_ud(0x7));
6803      if (trc_bits != 0)
6804         bld.OR(payload, payload, brw_imm_ud(trc_bits));
6805   }
6806   bld.AND(subscript(payload, BRW_REGISTER_TYPE_UW, 1),
6807           retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW),
6808           brw_imm_uw(0x7ff));
6809
6810   /* Update the original instruction. */
6811   inst->opcode = SHADER_OPCODE_SEND;
6812   inst->mlen = mlen;
6813   inst->ex_mlen = ex_mlen;
6814   inst->header_size = 0; /* HW docs require has_header = false */
6815   inst->send_has_side_effects = true;
6816   inst->send_is_volatile = false;
6817
6818   /* Set up SFID and descriptors */
6819   inst->sfid = GEN_RT_SFID_RAY_TRACE_ACCELERATOR;
6820   inst->desc = brw_rt_trace_ray_desc(devinfo, inst->exec_size);
6821   inst->resize_sources(4);
6822   inst->src[0] = brw_imm_ud(0); /* desc */
6823   inst->src[1] = brw_imm_ud(0); /* ex_desc */
6824   inst->src[2] = header;
6825   inst->src[3] = payload;
6826}
6827
/**
 * Walk every instruction in the CFG and dispatch each logical send-like
 * opcode to the matching lower_*_logical_send() helper, which rewrites the
 * instruction in place.
 *
 * Opcodes that are not handled here (and math opcodes that need no
 * lowering) are skipped with `continue` so they do not count as progress.
 *
 * \return true if at least one instruction was lowered, in which case the
 *         instruction and variable analyses are invalidated.
 */
bool
fs_visitor::lower_logical_sends()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      /* Builder positioned at the instruction being lowered. */
      const fs_builder ibld(this, block, inst);

      switch (inst->opcode) {
      case FS_OPCODE_FB_WRITE_LOGICAL:
         assert(stage == MESA_SHADER_FRAGMENT);
         lower_fb_write_logical_send(ibld, inst,
                                     brw_wm_prog_data(prog_data),
                                     (const brw_wm_prog_key *)key,
                                     payload);
         break;

      case FS_OPCODE_FB_READ_LOGICAL:
         lower_fb_read_logical_send(ibld, inst);
         break;

      /* Sampler messages: each logical opcode maps to the opcode passed as
       * the third argument of lower_sampler_logical_send().
       */
      case SHADER_OPCODE_TEX_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
         break;

      case SHADER_OPCODE_TXD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
         break;

      case SHADER_OPCODE_TXF_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
         break;

      case SHADER_OPCODE_TXL_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
         break;

      case SHADER_OPCODE_TXS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
         break;

      case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
         /* Note: unlike the other sampler cases, the logical opcode itself
          * is passed through here.
          */
         lower_sampler_logical_send(ibld, inst,
                                    SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
         break;

      case FS_OPCODE_TXB_LOGICAL:
         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
         break;

      case SHADER_OPCODE_TXF_CMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
         break;

      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
         break;

      case SHADER_OPCODE_TXF_UMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
         break;

      case SHADER_OPCODE_TXF_MCS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
         break;

      case SHADER_OPCODE_LOD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
         break;

      case SHADER_OPCODE_TG4_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
         break;

      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
         break;

      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
         break;

      /* These surface opcodes use the LSC lowering when the device has LSC
       * and otherwise share the generic surface lowering below.
       */
      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_surface_logical_send(ibld, inst);
            break;
         }
         /* fallthrough - without LSC these use the generic surface path */
      case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
         lower_surface_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
         lower_surface_block_logical_send(ibld, inst);
         break;

      /* Same pattern for A64 messages: LSC lowering when available,
       * otherwise the generic A64 lowering shared with the block messages.
       */
      case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL:
         if (devinfo->has_lsc) {
            lower_lsc_a64_logical_send(ibld, inst);
            break;
         }
         /* fallthrough - without LSC these use the generic A64 path */
      case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
         lower_a64_logical_send(ibld, inst);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
         /* The LSC variant is only used when indirect UBO loads are not
          * routed through the sampler.
          */
         if (devinfo->has_lsc && !compiler->indirect_ubos_use_sampler)
            lower_lsc_varying_pull_constant_logical_send(ibld, inst);
         else
            lower_varying_pull_constant_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
      case SHADER_OPCODE_POW:
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         /* The math opcodes are overloaded for the send-like and
          * expression-like instructions which seems kind of icky.  Gfx6+ has
          * a native (but rather quirky) MATH instruction so we don't need to
          * do anything here.  On Gfx4-5 we'll have to lower the Gfx6-like
          * logical instructions (which we can easily recognize because they
          * have mlen = 0) into send-like virtual instructions.
          */
         if (devinfo->ver < 6 && inst->mlen == 0) {
            lower_math_logical_send(ibld, inst);
            break;

         } else {
            /* Nothing was lowered, so don't report progress. */
            continue;
         }

      case SHADER_OPCODE_BTD_SPAWN_LOGICAL:
      case SHADER_OPCODE_BTD_RETIRE_LOGICAL:
         lower_btd_logical_send(ibld, inst);
         break;

      case RT_OPCODE_TRACE_RAY_LOGICAL:
         lower_trace_ray_logical_send(ibld, inst);
         break;

      default:
         /* Not a logical send: leave untouched and don't report progress. */
         continue;
      }

      /* Only reached via `break`, i.e. when a helper rewrote the
       * instruction.
       */
      progress = true;
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
7007
7008static bool
7009is_mixed_float_with_fp32_dst(const fs_inst *inst)
7010{
7011   /* This opcode sometimes uses :W type on the source even if the operand is
7012    * a :HF, because in gfx7 there is no support for :HF, and thus it uses :W.
7013    */
7014   if (inst->opcode == BRW_OPCODE_F16TO32)
7015      return true;
7016
7017   if (inst->dst.type != BRW_REGISTER_TYPE_F)
7018      return false;
7019
7020   for (int i = 0; i < inst->sources; i++) {
7021      if (inst->src[i].type == BRW_REGISTER_TYPE_HF)
7022         return true;
7023   }
7024
7025   return false;
7026}
7027
7028static bool
7029is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
7030{
7031   /* This opcode sometimes uses :W type on the destination even if the
7032    * destination is a :HF, because in gfx7 there is no support for :HF, and
7033    * thus it uses :W.
7034    */
7035   if (inst->opcode == BRW_OPCODE_F32TO16 &&
7036       inst->dst.stride == 1)
7037      return true;
7038
7039   if (inst->dst.type != BRW_REGISTER_TYPE_HF ||
7040       inst->dst.stride != 1)
7041      return false;
7042
7043   for (int i = 0; i < inst->sources; i++) {
7044      if (inst->src[i].type == BRW_REGISTER_TYPE_F)
7045         return true;
7046   }
7047
7048   return false;
7049}
7050
/**
 * Get the closest allowed SIMD width for instruction \p inst accounting for
 * some common regioning and execution control restrictions that apply to FPU
 * instructions.  These restrictions don't necessarily have any relevance to
 * instructions not executed by the FPU pipeline like extended math, control
 * flow or send message instructions.
 *
 * For virtual opcodes it's really up to the instruction -- In some cases
 * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
 * instructions) it may simplify virtual instruction lowering if we can
 * enforce FPU-like regioning restrictions already on the virtual instruction,
 * in other cases (e.g. virtual send-like instructions) this may be
 * excessively restrictive.
 *
 * Each restriction below can only lower \c max_width, which starts at
 * MIN2(32, exec_size).
 *
 * \return the resulting width rounded down to a power of two.
 */
static unsigned
get_fpu_lowered_simd_width(const struct intel_device_info *devinfo,
                           const fs_inst *inst)
{
   /* Maximum execution size representable in the instruction controls. */
   unsigned max_width = MIN2(32, inst->exec_size);

   /* According to the PRMs:
    *  "A. In Direct Addressing mode, a source cannot span more than 2
    *      adjacent GRF registers.
    *   B. A destination cannot span more than 2 adjacent GRF registers."
    *
    * Look for the source or destination with the largest register region
    * which is the one that is going to limit the overall execution size of
    * the instruction due to this rule.
    */
   unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);

   for (unsigned i = 0; i < inst->sources; i++)
      reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));

   /* Calculate the maximum execution size of the instruction based on the
    * factor by which it goes over the hardware limit of 2 GRFs.
    */
   if (reg_count > 2)
      max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));

   /* According to the IVB PRMs:
    *  "When destination spans two registers, the source MUST span two
    *   registers. The exception to the above rule:
    *
    *    - When source is scalar, the source registers are not incremented.
    *    - When source is packed integer Word and destination is packed
    *      integer DWord, the source register is not incremented but the
    *      source sub register is incremented."
    *
    * The hardware specs from Gfx4 to Gfx7.5 mention similar regioning
    * restrictions.  The code below intentionally doesn't check whether the
    * destination type is integer because empirically the hardware doesn't
    * seem to care what the actual type is as long as it's dword-aligned.
    */
   if (devinfo->ver < 8) {
      for (unsigned i = 0; i < inst->sources; i++) {
         /* IVB implements DF scalars as <0;2,1> regions. */
         const bool is_scalar_exception = is_uniform(inst->src[i]) &&
            (devinfo->is_haswell || type_sz(inst->src[i].type) != 8);
         const bool is_packed_word_exception =
            type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
            type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;

         /* We check size_read(i) against size_written instead of REG_SIZE
          * because we want to properly handle SIMD32.  In SIMD32, you can end
          * up with writes to 4 registers and a source that reads 2 registers
          * and we may still need to lower all the way to SIMD8 in that case.
          */
         if (inst->size_written > REG_SIZE &&
             inst->size_read(i) != 0 &&
             inst->size_read(i) < inst->size_written &&
             !is_scalar_exception && !is_packed_word_exception) {
            /* Note: this local shadows the outer reg_count and counts the
             * destination registers only, not the maximum over sources.
             */
            const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
            max_width = MIN2(max_width, inst->exec_size / reg_count);
         }
      }
   }

   if (devinfo->ver < 6) {
      /* From the G45 PRM, Volume 4 Page 361:
       *
       *    "Operand Alignment Rule: With the exceptions listed below, a
       *     source/destination operand in general should be aligned to even
       *     256-bit physical register with a region size equal to two 256-bit
       *     physical registers."
       *
       * Normally we enforce this by allocating virtual registers to the
       * even-aligned class.  But we need to handle payload registers.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
             inst->size_read(i) > REG_SIZE) {
            max_width = MIN2(max_width, 8);
         }
      }
   }

   /* From the IVB PRMs:
    *  "When an instruction is SIMD32, the low 16 bits of the execution mask
    *   are applied for both halves of the SIMD32 instruction. If different
    *   execution mask channels are required, split the instruction into two
    *   SIMD16 instructions."
    *
    * There is similar text in the HSW PRMs.  Gfx4-6 don't even implement
    * 32-wide control flow support in hardware and will behave similarly.
    */
   if (devinfo->ver < 8 && !inst->force_writemask_all)
      max_width = MIN2(max_width, 16);

   /* From the IVB PRMs (applies to HSW too):
    *  "Instructions with condition modifiers must not use SIMD32."
    *
    * From the BDW PRMs (applies to later hardware too):
    *  "Ternary instruction with condition modifiers must not use SIMD32."
    */
   if (inst->conditional_mod && (devinfo->ver < 8 || inst->is_3src(devinfo)))
      max_width = MIN2(max_width, 16);

   /* From the IVB PRMs (applies to other devices that don't have the
    * intel_device_info::supports_simd16_3src flag set):
    *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
    *   SIMD8 is not allowed for DF operations."
    *
    * Here reg_count is the outer one, i.e. the largest register count over
    * the destination and all sources.
    */
   if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
      max_width = MIN2(max_width, inst->exec_size / reg_count);

   /* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
    * the 8-bit quarter of the execution mask signals specified in the
    * instruction control fields) for the second compressed half of any
    * single-precision instruction (for double-precision instructions
    * it's hardwired to use NibCtrl+1, at least on HSW), which means that
    * the EU will apply the wrong execution controls for the second
    * sequential GRF write if the number of channels per GRF is not exactly
    * eight in single-precision mode (or four in double-float mode).
    *
    * In this situation we calculate the maximum size of the split
    * instructions so they only ever write to a single register.
    */
   if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&
       !inst->force_writemask_all) {
      const unsigned channels_per_grf = inst->exec_size /
         DIV_ROUND_UP(inst->size_written, REG_SIZE);
      const unsigned exec_type_size = get_exec_type_size(inst);
      assert(exec_type_size);

      /* The hardware shifts exactly 8 channels per compressed half of the
       * instruction in single-precision mode and exactly 4 in double-precision.
       */
      if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
         max_width = MIN2(max_width, channels_per_grf);

      /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
       * because HW applies the same channel enable signals to both halves of
       * the compressed instruction which will be just wrong under
       * non-uniform control flow.
       */
      if (devinfo->verx10 == 70 &&
          (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
         max_width = MIN2(max_width, 4);
   }

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is f32. Instruction
    *     execution size must be no more than 8."
    *
    * FIXME: the simulator doesn't seem to complain if we don't do this and
    * empirical testing with existing CTS tests show that they pass just fine
    * without implementing this, however, since our interpretation of the PRM
    * is that conversion MOVs between HF and F are still mixed-float
    * instructions (and therefore subject to this restriction) we decided to
    * split them to be safe. Might be useful to do additional investigation to
    * lift the restriction if we can ensure that it is safe though, since these
    * conversions are common when half-float types are involved since many
    * instructions do not support HF types and conversions from/to F are
    * required.
    */
   if (is_mixed_float_with_fp32_dst(inst))
      max_width = MIN2(max_width, 8);

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is packed f16 for both
    *     Align1 and Align16."
    */
   if (is_mixed_float_with_packed_fp16_dst(inst))
      max_width = MIN2(max_width, 8);

   /* Only power-of-two execution sizes are representable in the instruction
    * control fields.
    */
   return 1 << util_logbase2(max_width);
}
7247
7248/**
7249 * Get the maximum allowed SIMD width for instruction \p inst accounting for
7250 * various payload size restrictions that apply to sampler message
7251 * instructions.
7252 *
7253 * This is only intended to provide a maximum theoretical bound for the
7254 * execution size of the message based on the number of argument components
7255 * alone, which in most cases will determine whether the SIMD8 or SIMD16
7256 * variant of the message can be used, though some messages may have
7257 * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
7258 * the message length to determine the exact SIMD width and argument count,
7259 * which makes a number of sampler message combinations impossible to
7260 * represent).
7261 */
7262static unsigned
7263get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
7264                               const fs_inst *inst)
7265{
7266   /* If we have a min_lod parameter on anything other than a simple sample
7267    * message, it will push it over 5 arguments and we have to fall back to
7268    * SIMD8.
7269    */
7270   if (inst->opcode != SHADER_OPCODE_TEX &&
7271       inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
7272      return 8;
7273
7274   /* Calculate the number of coordinate components that have to be present
7275    * assuming that additional arguments follow the texel coordinates in the
7276    * message payload.  On IVB+ there is no need for padding, on ILK-SNB we
7277    * need to pad to four or three components depending on the message,
7278    * pre-ILK we need to pad to at most three components.
7279    */
7280   const unsigned req_coord_components =
7281      (devinfo->ver >= 7 ||
7282       !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
7283      (devinfo->ver >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
7284                            inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
7285      3;
7286
7287   /* On Gfx9+ the LOD argument is for free if we're able to use the LZ
7288    * variant of the TXL or TXF message.
7289    */
7290   const bool implicit_lod = devinfo->ver >= 9 &&
7291                             (inst->opcode == SHADER_OPCODE_TXL ||
7292                              inst->opcode == SHADER_OPCODE_TXF) &&
7293                             inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
7294
7295   /* Calculate the total number of argument components that need to be passed
7296    * to the sampler unit.
7297    */
7298   const unsigned num_payload_components =
7299      MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
7300           req_coord_components) +
7301      inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
7302      (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
7303      inst->components_read(TEX_LOGICAL_SRC_LOD2) +
7304      inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
7305      (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
7306       inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
7307      inst->components_read(TEX_LOGICAL_SRC_MCS);
7308
7309   /* SIMD16 messages with more than five arguments exceed the maximum message
7310    * size supported by the sampler, regardless of whether a header is
7311    * provided or not.
7312    */
7313   return MIN2(inst->exec_size,
7314               num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
7315}
7316
7317/**
7318 * Get the closest native SIMD width supported by the hardware for instruction
7319 * \p inst.  The instruction will be left untouched by
7320 * fs_visitor::lower_simd_width() if the returned value is equal to the
7321 * original execution size.
7322 */
7323static unsigned
7324get_lowered_simd_width(const struct intel_device_info *devinfo,
7325                       const fs_inst *inst)
7326{
7327   switch (inst->opcode) {
7328   case BRW_OPCODE_MOV:
7329   case BRW_OPCODE_SEL:
7330   case BRW_OPCODE_NOT:
7331   case BRW_OPCODE_AND:
7332   case BRW_OPCODE_OR:
7333   case BRW_OPCODE_XOR:
7334   case BRW_OPCODE_SHR:
7335   case BRW_OPCODE_SHL:
7336   case BRW_OPCODE_ASR:
7337   case BRW_OPCODE_ROR:
7338   case BRW_OPCODE_ROL:
7339   case BRW_OPCODE_CMPN:
7340   case BRW_OPCODE_CSEL:
7341   case BRW_OPCODE_F32TO16:
7342   case BRW_OPCODE_F16TO32:
7343   case BRW_OPCODE_BFREV:
7344   case BRW_OPCODE_BFE:
7345   case BRW_OPCODE_ADD:
7346   case BRW_OPCODE_MUL:
7347   case BRW_OPCODE_AVG:
7348   case BRW_OPCODE_FRC:
7349   case BRW_OPCODE_RNDU:
7350   case BRW_OPCODE_RNDD:
7351   case BRW_OPCODE_RNDE:
7352   case BRW_OPCODE_RNDZ:
7353   case BRW_OPCODE_LZD:
7354   case BRW_OPCODE_FBH:
7355   case BRW_OPCODE_FBL:
7356   case BRW_OPCODE_CBIT:
7357   case BRW_OPCODE_SAD2:
7358   case BRW_OPCODE_MAD:
7359   case BRW_OPCODE_LRP:
7360   case BRW_OPCODE_ADD3:
7361   case FS_OPCODE_PACK:
7362   case SHADER_OPCODE_SEL_EXEC:
7363   case SHADER_OPCODE_CLUSTER_BROADCAST:
7364   case SHADER_OPCODE_MOV_RELOC_IMM:
7365      return get_fpu_lowered_simd_width(devinfo, inst);
7366
7367   case BRW_OPCODE_CMP: {
7368      /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
7369       * when the destination is a GRF the dependency-clear bit on the flag
7370       * register is cleared early.
7371       *
7372       * Suggested workarounds are to disable coissuing CMP instructions
7373       * or to split CMP(16) instructions into two CMP(8) instructions.
7374       *
7375       * We choose to split into CMP(8) instructions since disabling
7376       * coissuing would affect CMP instructions not otherwise affected by
7377       * the errata.
7378       */
7379      const unsigned max_width = (devinfo->verx10 == 70 &&
7380                                  !inst->dst.is_null() ? 8 : ~0);
7381      return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));
7382   }
7383   case BRW_OPCODE_BFI1:
7384   case BRW_OPCODE_BFI2:
7385      /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
7386       * should
7387       *  "Force BFI instructions to be executed always in SIMD8."
7388       */
7389      return MIN2(devinfo->is_haswell ? 8 : ~0u,
7390                  get_fpu_lowered_simd_width(devinfo, inst));
7391
7392   case BRW_OPCODE_IF:
7393      assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
7394      return inst->exec_size;
7395
7396   case SHADER_OPCODE_RCP:
7397   case SHADER_OPCODE_RSQ:
7398   case SHADER_OPCODE_SQRT:
7399   case SHADER_OPCODE_EXP2:
7400   case SHADER_OPCODE_LOG2:
7401   case SHADER_OPCODE_SIN:
7402   case SHADER_OPCODE_COS: {
7403      /* Unary extended math instructions are limited to SIMD8 on Gfx4 and
7404       * Gfx6. Extended Math Function is limited to SIMD8 with half-float.
7405       */
7406      if (devinfo->ver == 6 || (devinfo->ver == 4 && !devinfo->is_g4x))
7407         return MIN2(8, inst->exec_size);
7408      if (inst->dst.type == BRW_REGISTER_TYPE_HF)
7409         return MIN2(8, inst->exec_size);
7410      return MIN2(16, inst->exec_size);
7411   }
7412
7413   case SHADER_OPCODE_POW: {
7414      /* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
7415       * to SIMD8 with half-float
7416       */
7417      if (devinfo->ver < 7)
7418         return MIN2(8, inst->exec_size);
7419      if (inst->dst.type == BRW_REGISTER_TYPE_HF)
7420         return MIN2(8, inst->exec_size);
7421      return MIN2(16, inst->exec_size);
7422   }
7423
7424   case SHADER_OPCODE_USUB_SAT:
7425   case SHADER_OPCODE_ISUB_SAT:
7426      return get_fpu_lowered_simd_width(devinfo, inst);
7427
7428   case SHADER_OPCODE_INT_QUOTIENT:
7429   case SHADER_OPCODE_INT_REMAINDER:
7430      /* Integer division is limited to SIMD8 on all generations. */
7431      return MIN2(8, inst->exec_size);
7432
7433   case FS_OPCODE_LINTERP:
7434   case SHADER_OPCODE_GET_BUFFER_SIZE:
7435   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
7436   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
7437   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
7438   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
7439   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
7440      return MIN2(16, inst->exec_size);
7441
7442   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
7443      /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
7444       * message used to implement varying pull constant loads, so expand it
7445       * to SIMD16.  An alternative with longer message payload length but
7446       * shorter return payload would be to use the SIMD8 sampler message that
7447       * takes (header, u, v, r) as parameters instead of (header, u).
7448       */
7449      return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));
7450
7451   case FS_OPCODE_DDX_COARSE:
7452   case FS_OPCODE_DDX_FINE:
7453   case FS_OPCODE_DDY_COARSE:
7454   case FS_OPCODE_DDY_FINE:
7455      /* The implementation of this virtual opcode may require emitting
7456       * compressed Align16 instructions, which are severely limited on some
7457       * generations.
7458       *
7459       * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
7460       * Region Restrictions):
7461       *
7462       *  "In Align16 access mode, SIMD16 is not allowed for DW operations
7463       *   and SIMD8 is not allowed for DF operations."
7464       *
7465       * In this context, "DW operations" means "operations acting on 32-bit
7466       * values", so it includes operations on floats.
7467       *
7468       * Gfx4 has a similar restriction.  From the i965 PRM, section 11.5.3
7469       * (Instruction Compression -> Rules and Restrictions):
7470       *
7471       *  "A compressed instruction must be in Align1 access mode. Align16
7472       *   mode instructions cannot be compressed."
7473       *
7474       * Similar text exists in the g45 PRM.
7475       *
7476       * Empirically, compressed align16 instructions using odd register
7477       * numbers don't appear to work on Sandybridge either.
7478       */
7479      return (devinfo->ver == 4 || devinfo->ver == 6 ||
7480              (devinfo->verx10 == 70) ?
7481              MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
7482
7483   case SHADER_OPCODE_MULH:
7484      /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
7485       * is 8-wide on Gfx7+.
7486       */
7487      return (devinfo->ver >= 7 ? 8 :
7488              get_fpu_lowered_simd_width(devinfo, inst));
7489
7490   case FS_OPCODE_FB_WRITE_LOGICAL:
7491      /* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them
7492       * here.
7493       */
7494      assert(devinfo->ver != 6 ||
7495             inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
7496             inst->exec_size == 8);
7497      /* Dual-source FB writes are unsupported in SIMD16 mode. */
7498      return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
7499              8 : MIN2(16, inst->exec_size));
7500
7501   case FS_OPCODE_FB_READ_LOGICAL:
7502      return MIN2(16, inst->exec_size);
7503
7504   case SHADER_OPCODE_TEX_LOGICAL:
7505   case SHADER_OPCODE_TXF_CMS_LOGICAL:
7506   case SHADER_OPCODE_TXF_UMS_LOGICAL:
7507   case SHADER_OPCODE_TXF_MCS_LOGICAL:
7508   case SHADER_OPCODE_LOD_LOGICAL:
7509   case SHADER_OPCODE_TG4_LOGICAL:
7510   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
7511   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
7512   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
7513      return get_sampler_lowered_simd_width(devinfo, inst);
7514
7515   case SHADER_OPCODE_TXD_LOGICAL:
7516      /* TXD is unsupported in SIMD16 mode. */
7517      return 8;
7518
7519   case SHADER_OPCODE_TXL_LOGICAL:
7520   case FS_OPCODE_TXB_LOGICAL:
7521      /* Only one execution size is representable pre-ILK depending on whether
7522       * the shadow reference argument is present.
7523       */
7524      if (devinfo->ver == 4)
7525         return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
7526      else
7527         return get_sampler_lowered_simd_width(devinfo, inst);
7528
7529   case SHADER_OPCODE_TXF_LOGICAL:
7530   case SHADER_OPCODE_TXS_LOGICAL:
7531      /* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
7532       * messages.  Use SIMD16 instead.
7533       */
7534      if (devinfo->ver == 4)
7535         return 16;
7536      else
7537         return get_sampler_lowered_simd_width(devinfo, inst);
7538
7539   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
7540   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
7541   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
7542      return 8;
7543
7544   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
7545   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
7546   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
7547   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
7548   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
7549   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
7550   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
7551   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
7552      return MIN2(16, inst->exec_size);
7553
7554   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
7555   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
7556   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
7557   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
7558      return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size);
7559
7560   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
7561   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
7562   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
7563      assert(inst->exec_size <= 16);
7564      return inst->exec_size;
7565
7566   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
7567   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL:
7568   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
7569   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL:
7570   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL:
7571   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL:
7572      return 8;
7573
7574   case SHADER_OPCODE_URB_READ_SIMD8:
7575   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
7576   case SHADER_OPCODE_URB_WRITE_SIMD8:
7577   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
7578   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
7579   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
7580      return MIN2(8, inst->exec_size);
7581
7582   case SHADER_OPCODE_QUAD_SWIZZLE: {
7583      const unsigned swiz = inst->src[1].ud;
7584      return (is_uniform(inst->src[0]) ?
7585                 get_fpu_lowered_simd_width(devinfo, inst) :
7586              devinfo->ver < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
7587              swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
7588              get_fpu_lowered_simd_width(devinfo, inst));
7589   }
7590   case SHADER_OPCODE_MOV_INDIRECT: {
7591      /* From IVB and HSW PRMs:
7592       *
7593       * "2.When the destination requires two registers and the sources are
7594       *  indirect, the sources must use 1x1 regioning mode.
7595       *
7596       * In case of DF instructions in HSW/IVB, the exec_size is limited by
7597       * the EU decompression logic not handling VxH indirect addressing
7598       * correctly.
7599       */
7600      const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;
7601      /* Prior to Broadwell, we only have 8 address subregisters. */
7602      return MIN3(devinfo->ver >= 8 ? 16 : 8,
7603                  max_size / (inst->dst.stride * type_sz(inst->dst.type)),
7604                  inst->exec_size);
7605   }
7606
7607   case SHADER_OPCODE_LOAD_PAYLOAD: {
7608      const unsigned reg_count =
7609         DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
7610
7611      if (reg_count > 2) {
7612         /* Only LOAD_PAYLOAD instructions with per-channel destination region
7613          * can be easily lowered (which excludes headers and heterogeneous
7614          * types).
7615          */
7616         assert(!inst->header_size);
7617         for (unsigned i = 0; i < inst->sources; i++)
7618            assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
7619                   inst->src[i].file == BAD_FILE);
7620
7621         return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
7622      } else {
7623         return inst->exec_size;
7624      }
7625   }
7626   default:
7627      return inst->exec_size;
7628   }
7629}
7630
7631/**
7632 * Return true if splitting out the group of channels of instruction \p inst
7633 * given by lbld.group() requires allocating a temporary for the i-th source
7634 * of the lowered instruction.
7635 */
7636static inline bool
7637needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
7638{
7639   return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
7640            (inst->components_read(i) == 1 &&
7641             lbld.dispatch_width() <= inst->exec_size)) ||
7642          (inst->flags_written(lbld.shader->devinfo) &
7643           flag_mask(inst->src[i], type_sz(inst->src[i].type)));
7644}
7645
7646/**
7647 * Extract the data that would be consumed by the channel group given by
7648 * lbld.group() from the i-th source region of instruction \p inst and return
7649 * it as result in packed form.
7650 */
7651static fs_reg
7652emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
7653{
7654   assert(lbld.group() >= inst->group);
7655
7656   /* Specified channel group from the source region. */
7657   const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
7658
7659   if (needs_src_copy(lbld, inst, i)) {
7660      /* Builder of the right width to perform the copy avoiding uninitialized
7661       * data if the lowered execution size is greater than the original
7662       * execution size of the instruction.
7663       */
7664      const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
7665                                              inst->exec_size), 0);
7666      const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
7667
7668      for (unsigned k = 0; k < inst->components_read(i); ++k)
7669         cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
7670
7671      return tmp;
7672
7673   } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
7674      /* The source is invariant for all dispatch_width-wide groups of the
7675       * original region.
7676       */
7677      return inst->src[i];
7678
7679   } else {
7680      /* We can just point the lowered instruction at the right channel group
7681       * from the original region.
7682       */
7683      return src;
7684   }
7685}
7686
7687/**
7688 * Return true if splitting out the group of channels of instruction \p inst
7689 * given by lbld.group() requires allocating a temporary for the destination
7690 * of the lowered instruction and copying the data back to the original
7691 * destination region.
7692 */
7693static inline bool
7694needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
7695{
7696   /* If the instruction writes more than one component we'll have to shuffle
7697    * the results of multiple lowered instructions in order to make sure that
7698    * they end up arranged correctly in the original destination region.
7699    */
7700   if (inst->size_written > inst->dst.component_size(inst->exec_size))
7701      return true;
7702
7703   /* If the lowered execution size is larger than the original the result of
7704    * the instruction won't fit in the original destination, so we'll have to
7705    * allocate a temporary in any case.
7706    */
7707   if (lbld.dispatch_width() > inst->exec_size)
7708      return true;
7709
7710   for (unsigned i = 0; i < inst->sources; i++) {
7711      /* If we already made a copy of the source for other reasons there won't
7712       * be any overlap with the destination.
7713       */
7714      if (needs_src_copy(lbld, inst, i))
7715         continue;
7716
7717      /* In order to keep the logic simple we emit a copy whenever the
7718       * destination region doesn't exactly match an overlapping source, which
7719       * may point at the source and destination not being aligned group by
7720       * group which could cause one of the lowered instructions to overwrite
7721       * the data read from the same source by other lowered instructions.
7722       */
7723      if (regions_overlap(inst->dst, inst->size_written,
7724                          inst->src[i], inst->size_read(i)) &&
7725          !inst->dst.equals(inst->src[i]))
7726        return true;
7727   }
7728
7729   return false;
7730}
7731
7732/**
7733 * Insert data from a packed temporary into the channel group given by
7734 * lbld.group() of the destination region of instruction \p inst and return
7735 * the temporary as result.  Any copy instructions that are required for
7736 * unzipping the previous value (in the case of partial writes) will be
7737 * inserted using \p lbld_before and any copy instructions required for
7738 * zipping up the destination of \p inst will be inserted using \p lbld_after.
7739 */
7740static fs_reg
7741emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
7742         fs_inst *inst)
7743{
7744   assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
7745   assert(lbld_before.group() == lbld_after.group());
7746   assert(lbld_after.group() >= inst->group);
7747
7748   /* Specified channel group from the destination region. */
7749   const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
7750   const unsigned dst_size = inst->size_written /
7751      inst->dst.component_size(inst->exec_size);
7752
7753   if (needs_dst_copy(lbld_after, inst)) {
7754      const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size);
7755
7756      if (inst->predicate) {
7757         /* Handle predication by copying the original contents of
7758          * the destination into the temporary before emitting the
7759          * lowered instruction.
7760          */
7761         const fs_builder gbld_before =
7762            lbld_before.group(MIN2(lbld_before.dispatch_width(),
7763                                   inst->exec_size), 0);
7764         for (unsigned k = 0; k < dst_size; ++k) {
7765            gbld_before.MOV(offset(tmp, lbld_before, k),
7766                            offset(dst, inst->exec_size, k));
7767         }
7768      }
7769
7770      const fs_builder gbld_after =
7771         lbld_after.group(MIN2(lbld_after.dispatch_width(),
7772                               inst->exec_size), 0);
7773      for (unsigned k = 0; k < dst_size; ++k) {
7774         /* Use a builder of the right width to perform the copy avoiding
7775          * uninitialized data if the lowered execution size is greater than
7776          * the original execution size of the instruction.
7777          */
7778         gbld_after.MOV(offset(dst, inst->exec_size, k),
7779                        offset(tmp, lbld_after, k));
7780      }
7781
7782      return tmp;
7783
7784   } else {
7785      /* No need to allocate a temporary for the lowered instruction, just
7786       * take the right group of channels from the original region.
7787       */
7788      return dst;
7789   }
7790}
7791
7792bool
7793fs_visitor::lower_simd_width()
7794{
7795   bool progress = false;
7796
7797   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
7798      const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
7799
7800      if (lower_width != inst->exec_size) {
7801         /* Builder matching the original instruction.  We may also need to
7802          * emit an instruction of width larger than the original, set the
7803          * execution size of the builder to the highest of both for now so
7804          * we're sure that both cases can be handled.
7805          */
7806         const unsigned max_width = MAX2(inst->exec_size, lower_width);
7807         const fs_builder ibld = bld.at(block, inst)
7808                                    .exec_all(inst->force_writemask_all)
7809                                    .group(max_width, inst->group / max_width);
7810
7811         /* Split the copies in chunks of the execution width of either the
7812          * original or the lowered instruction, whichever is lower.
7813          */
7814         const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
7815         const unsigned dst_size = inst->size_written /
7816            inst->dst.component_size(inst->exec_size);
7817
7818         assert(!inst->writes_accumulator && !inst->mlen);
7819
7820         /* Inserting the zip, unzip, and duplicated instructions in all of
7821          * the right spots is somewhat tricky.  All of the unzip and any
7822          * instructions from the zip which unzip the destination prior to
7823          * writing need to happen before all of the per-group instructions
7824          * and the zip instructions need to happen after.  In order to sort
7825          * this all out, we insert the unzip instructions before \p inst,
7826          * insert the per-group instructions after \p inst (i.e. before
7827          * inst->next), and insert the zip instructions before the
7828          * instruction after \p inst.  Since we are inserting instructions
7829          * after \p inst, inst->next is a moving target and we need to save
7830          * it off here so that we insert the zip instructions in the right
7831          * place.
7832          *
7833          * Since we're inserting split instructions after after_inst, the
7834          * instructions will end up in the reverse order that we insert them.
7835          * However, certain render target writes require that the low group
7836          * instructions come before the high group.  From the Ivy Bridge PRM
7837          * Vol. 4, Pt. 1, Section 3.9.11:
7838          *
7839          *    "If multiple SIMD8 Dual Source messages are delivered by the
7840          *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
7841          *    issued before the SIMD8_DUALSRC_HI message with the same Slot
7842          *    Group Select setting."
7843          *
7844          * And, from Section 3.9.11.1 of the same PRM:
7845          *
7846          *    "When SIMD32 or SIMD16 PS threads send render target writes
7847          *    with multiple SIMD8 and SIMD16 messages, the following must
7848          *    hold:
7849          *
7850          *    All the slots (as described above) must have a corresponding
7851          *    render target write irrespective of the slot's validity. A slot
7852          *    is considered valid when at least one sample is enabled. For
7853          *    example, a SIMD16 PS thread must send two SIMD8 render target
7854          *    writes to cover all the slots.
7855          *
7856          *    PS thread must send SIMD render target write messages with
7857          *    increasing slot numbers. For example, SIMD16 thread has
7858          *    Slot[15:0] and if two SIMD8 render target writes are used, the
7859          *    first SIMD8 render target write must send Slot[7:0] and the
7860          *    next one must send Slot[15:8]."
7861          *
7862          * In order to make low group instructions come before high group
7863          * instructions (this is required for some render target writes), we
7864          * split from the highest group to lowest.
7865          */
7866         exec_node *const after_inst = inst->next;
7867         for (int i = n - 1; i >= 0; i--) {
7868            /* Emit a copy of the original instruction with the lowered width.
7869             * If the EOT flag was set throw it away except for the last
7870             * instruction to avoid killing the thread prematurely.
7871             */
7872            fs_inst split_inst = *inst;
7873            split_inst.exec_size = lower_width;
7874            split_inst.eot = inst->eot && i == int(n - 1);
7875
7876            /* Select the correct channel enables for the i-th group, then
7877             * transform the sources and destination and emit the lowered
7878             * instruction.
7879             */
7880            const fs_builder lbld = ibld.group(lower_width, i);
7881
7882            for (unsigned j = 0; j < inst->sources; j++)
7883               split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
7884
7885            split_inst.dst = emit_zip(lbld.at(block, inst),
7886                                      lbld.at(block, after_inst), inst);
7887            split_inst.size_written =
7888               split_inst.dst.component_size(lower_width) * dst_size;
7889
7890            lbld.at(block, inst->next).emit(split_inst);
7891         }
7892
7893         inst->remove(block);
7894         progress = true;
7895      }
7896   }
7897
7898   if (progress)
7899      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
7900
7901   return progress;
7902}
7903
7904/**
7905 * Transform barycentric vectors into the interleaved form expected by the PLN
7906 * instruction and returned by the Gfx7+ PI shared function.
7907 *
7908 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
7909 * follows in the register file:
7910 *
7911 *    rN+0: X[0-7]
7912 *    rN+1: Y[0-7]
7913 *    rN+2: X[8-15]
7914 *    rN+3: Y[8-15]
7915 *
7916 * There is no need to handle SIMD32 here -- This is expected to be run after
7917 * SIMD lowering, since SIMD lowering relies on vectors having the standard
7918 * component layout.
7919 */
7920bool
7921fs_visitor::lower_barycentrics()
7922{
7923   const bool has_interleaved_layout = devinfo->has_pln || devinfo->ver >= 7;
7924   bool progress = false;
7925
7926   if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
7927      return false;
7928
7929   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
7930      if (inst->exec_size < 16)
7931         continue;
7932
7933      const fs_builder ibld(this, block, inst);
7934      const fs_builder ubld = ibld.exec_all().group(8, 0);
7935
7936      switch (inst->opcode) {
7937      case FS_OPCODE_LINTERP : {
7938         assert(inst->exec_size == 16);
7939         const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
7940         fs_reg srcs[4];
7941
7942         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
7943            srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
7944                                   8 * (i / 2));
7945
7946         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));
7947
7948         inst->src[0] = tmp;
7949         progress = true;
7950         break;
7951      }
7952      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
7953      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
7954      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
7955         assert(inst->exec_size == 16);
7956         const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);
7957
7958         for (unsigned i = 0; i < 2; i++) {
7959            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
7960               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
7961                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
7962                                                    8 * g),
7963                                       offset(tmp, ubld, 2 * g + i));
7964               mov->predicate = inst->predicate;
7965               mov->predicate_inverse = inst->predicate_inverse;
7966               mov->flag_subreg = inst->flag_subreg;
7967            }
7968         }
7969
7970         inst->dst = tmp;
7971         progress = true;
7972         break;
7973      }
7974      default:
7975         break;
7976      }
7977   }
7978
7979   if (progress)
7980      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
7981
7982   return progress;
7983}
7984
7985/**
7986 * Lower a derivative instruction as the floating-point difference of two
7987 * swizzles of the source, specified as \p swz0 and \p swz1.
7988 */
7989static bool
7990lower_derivative(fs_visitor *v, bblock_t *block, fs_inst *inst,
7991                 unsigned swz0, unsigned swz1)
7992{
7993   const fs_builder ibld(v, block, inst);
7994   const fs_reg tmp0 = ibld.vgrf(inst->src[0].type);
7995   const fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
7996
7997   ibld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
7998   ibld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));
7999
8000   inst->resize_sources(2);
8001   inst->src[0] = negate(tmp0);
8002   inst->src[1] = tmp1;
8003   inst->opcode = BRW_OPCODE_ADD;
8004
8005   return true;
8006}
8007
8008/**
8009 * Lower derivative instructions on platforms where codegen cannot implement
8010 * them efficiently (i.e. XeHP).
8011 */
8012bool
8013fs_visitor::lower_derivatives()
8014{
8015   bool progress = false;
8016
8017   if (devinfo->verx10 < 125)
8018      return false;
8019
8020   foreach_block_and_inst(block, fs_inst, inst, cfg) {
8021      if (inst->opcode == FS_OPCODE_DDX_COARSE)
8022         progress |= lower_derivative(this, block, inst,
8023                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);
8024
8025      else if (inst->opcode == FS_OPCODE_DDX_FINE)
8026         progress |= lower_derivative(this, block, inst,
8027                                      BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);
8028
8029      else if (inst->opcode == FS_OPCODE_DDY_COARSE)
8030         progress |= lower_derivative(this, block, inst,
8031                                      BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);
8032
8033      else if (inst->opcode == FS_OPCODE_DDY_FINE)
8034         progress |= lower_derivative(this, block, inst,
8035                                      BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
8036   }
8037
8038   if (progress)
8039      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
8040
8041   return progress;
8042}
8043
8044void
8045fs_visitor::dump_instructions() const
8046{
8047   dump_instructions(NULL);
8048}
8049
8050void
8051fs_visitor::dump_instructions(const char *name) const
8052{
8053   FILE *file = stderr;
8054   if (name && geteuid() != 0) {
8055      file = fopen(name, "w");
8056      if (!file)
8057         file = stderr;
8058   }
8059
8060   if (cfg) {
8061      const register_pressure &rp = regpressure_analysis.require();
8062      unsigned ip = 0, max_pressure = 0;
8063      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
8064         max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
8065         fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
8066         dump_instruction(inst, file);
8067         ip++;
8068      }
8069      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
8070   } else {
8071      int ip = 0;
8072      foreach_in_list(backend_instruction, inst, &instructions) {
8073         fprintf(file, "%4d: ", ip++);
8074         dump_instruction(inst, file);
8075      }
8076   }
8077
8078   if (file != stderr) {
8079      fclose(file);
8080   }
8081}
8082
8083void
8084fs_visitor::dump_instruction(const backend_instruction *be_inst) const
8085{
8086   dump_instruction(be_inst, stderr);
8087}
8088
8089void
8090fs_visitor::dump_instruction(const backend_instruction *be_inst, FILE *file) const
8091{
8092   const fs_inst *inst = (const fs_inst *)be_inst;
8093
8094   if (inst->predicate) {
8095      fprintf(file, "(%cf%d.%d) ",
8096              inst->predicate_inverse ? '-' : '+',
8097              inst->flag_subreg / 2,
8098              inst->flag_subreg % 2);
8099   }
8100
8101   fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
8102   if (inst->saturate)
8103      fprintf(file, ".sat");
8104   if (inst->conditional_mod) {
8105      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
8106      if (!inst->predicate &&
8107          (devinfo->ver < 5 || (inst->opcode != BRW_OPCODE_SEL &&
8108                                inst->opcode != BRW_OPCODE_CSEL &&
8109                                inst->opcode != BRW_OPCODE_IF &&
8110                                inst->opcode != BRW_OPCODE_WHILE))) {
8111         fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
8112                 inst->flag_subreg % 2);
8113      }
8114   }
8115   fprintf(file, "(%d) ", inst->exec_size);
8116
8117   if (inst->mlen) {
8118      fprintf(file, "(mlen: %d) ", inst->mlen);
8119   }
8120
8121   if (inst->ex_mlen) {
8122      fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
8123   }
8124
8125   if (inst->eot) {
8126      fprintf(file, "(EOT) ");
8127   }
8128
8129   switch (inst->dst.file) {
8130   case VGRF:
8131      fprintf(file, "vgrf%d", inst->dst.nr);
8132      break;
8133   case FIXED_GRF:
8134      fprintf(file, "g%d", inst->dst.nr);
8135      break;
8136   case MRF:
8137      fprintf(file, "m%d", inst->dst.nr);
8138      break;
8139   case BAD_FILE:
8140      fprintf(file, "(null)");
8141      break;
8142   case UNIFORM:
8143      fprintf(file, "***u%d***", inst->dst.nr);
8144      break;
8145   case ATTR:
8146      fprintf(file, "***attr%d***", inst->dst.nr);
8147      break;
8148   case ARF:
8149      switch (inst->dst.nr) {
8150      case BRW_ARF_NULL:
8151         fprintf(file, "null");
8152         break;
8153      case BRW_ARF_ADDRESS:
8154         fprintf(file, "a0.%d", inst->dst.subnr);
8155         break;
8156      case BRW_ARF_ACCUMULATOR:
8157         fprintf(file, "acc%d", inst->dst.subnr);
8158         break;
8159      case BRW_ARF_FLAG:
8160         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
8161         break;
8162      default:
8163         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
8164         break;
8165      }
8166      break;
8167   case IMM:
8168      unreachable("not reached");
8169   }
8170
8171   if (inst->dst.offset ||
8172       (inst->dst.file == VGRF &&
8173        alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
8174      const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
8175      fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
8176              inst->dst.offset % reg_size);
8177   }
8178
8179   if (inst->dst.stride != 1)
8180      fprintf(file, "<%u>", inst->dst.stride);
8181   fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));
8182
8183   for (int i = 0; i < inst->sources; i++) {
8184      if (inst->src[i].negate)
8185         fprintf(file, "-");
8186      if (inst->src[i].abs)
8187         fprintf(file, "|");
8188      switch (inst->src[i].file) {
8189      case VGRF:
8190         fprintf(file, "vgrf%d", inst->src[i].nr);
8191         break;
8192      case FIXED_GRF:
8193         fprintf(file, "g%d", inst->src[i].nr);
8194         break;
8195      case MRF:
8196         fprintf(file, "***m%d***", inst->src[i].nr);
8197         break;
8198      case ATTR:
8199         fprintf(file, "attr%d", inst->src[i].nr);
8200         break;
8201      case UNIFORM:
8202         fprintf(file, "u%d", inst->src[i].nr);
8203         break;
8204      case BAD_FILE:
8205         fprintf(file, "(null)");
8206         break;
8207      case IMM:
8208         switch (inst->src[i].type) {
8209         case BRW_REGISTER_TYPE_HF:
8210            fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
8211            break;
8212         case BRW_REGISTER_TYPE_F:
8213            fprintf(file, "%-gf", inst->src[i].f);
8214            break;
8215         case BRW_REGISTER_TYPE_DF:
8216            fprintf(file, "%fdf", inst->src[i].df);
8217            break;
8218         case BRW_REGISTER_TYPE_W:
8219         case BRW_REGISTER_TYPE_D:
8220            fprintf(file, "%dd", inst->src[i].d);
8221            break;
8222         case BRW_REGISTER_TYPE_UW:
8223         case BRW_REGISTER_TYPE_UD:
8224            fprintf(file, "%uu", inst->src[i].ud);
8225            break;
8226         case BRW_REGISTER_TYPE_Q:
8227            fprintf(file, "%" PRId64 "q", inst->src[i].d64);
8228            break;
8229         case BRW_REGISTER_TYPE_UQ:
8230            fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
8231            break;
8232         case BRW_REGISTER_TYPE_VF:
8233            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
8234                    brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
8235                    brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
8236                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
8237                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
8238            break;
8239         case BRW_REGISTER_TYPE_V:
8240         case BRW_REGISTER_TYPE_UV:
8241            fprintf(file, "%08x%s", inst->src[i].ud,
8242                    inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");
8243            break;
8244         default:
8245            fprintf(file, "???");
8246            break;
8247         }
8248         break;
8249      case ARF:
8250         switch (inst->src[i].nr) {
8251         case BRW_ARF_NULL:
8252            fprintf(file, "null");
8253            break;
8254         case BRW_ARF_ADDRESS:
8255            fprintf(file, "a0.%d", inst->src[i].subnr);
8256            break;
8257         case BRW_ARF_ACCUMULATOR:
8258            fprintf(file, "acc%d", inst->src[i].subnr);
8259            break;
8260         case BRW_ARF_FLAG:
8261            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
8262            break;
8263         default:
8264            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
8265            break;
8266         }
8267         break;
8268      }
8269
8270      if (inst->src[i].offset ||
8271          (inst->src[i].file == VGRF &&
8272           alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
8273         const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
8274         fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
8275                 inst->src[i].offset % reg_size);
8276      }
8277
8278      if (inst->src[i].abs)
8279         fprintf(file, "|");
8280
8281      if (inst->src[i].file != IMM) {
8282         unsigned stride;
8283         if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
8284            unsigned hstride = inst->src[i].hstride;
8285            stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
8286         } else {
8287            stride = inst->src[i].stride;
8288         }
8289         if (stride != 1)
8290            fprintf(file, "<%u>", stride);
8291
8292         fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
8293      }
8294
8295      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
8296         fprintf(file, ", ");
8297   }
8298
8299   fprintf(file, " ");
8300
8301   if (inst->force_writemask_all)
8302      fprintf(file, "NoMask ");
8303
8304   if (inst->exec_size != dispatch_width)
8305      fprintf(file, "group%d ", inst->group);
8306
8307   fprintf(file, "\n");
8308}
8309
8310void
8311fs_visitor::setup_fs_payload_gfx6()
8312{
8313   assert(stage == MESA_SHADER_FRAGMENT);
8314   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
8315   const unsigned payload_width = MIN2(16, dispatch_width);
8316   assert(dispatch_width % payload_width == 0);
8317   assert(devinfo->ver >= 6);
8318
8319   /* R0: PS thread payload header. */
8320   payload.num_regs++;
8321
8322   for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
8323      /* R1: masks, pixel X/Y coordinates. */
8324      payload.subspan_coord_reg[j] = payload.num_regs++;
8325   }
8326
8327   for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
8328      /* R3-26: barycentric interpolation coordinates.  These appear in the
8329       * same order that they appear in the brw_barycentric_mode enum.  Each
8330       * set of coordinates occupies 2 registers if dispatch width == 8 and 4
8331       * registers if dispatch width == 16.  Coordinates only appear if they
8332       * were enabled using the "Barycentric Interpolation Mode" bits in
8333       * WM_STATE.
8334       */
8335      for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
8336         if (prog_data->barycentric_interp_modes & (1 << i)) {
8337            payload.barycentric_coord_reg[i][j] = payload.num_regs;
8338            payload.num_regs += payload_width / 4;
8339         }
8340      }
8341
8342      /* R27-28: interpolated depth if uses source depth */
8343      if (prog_data->uses_src_depth) {
8344         payload.source_depth_reg[j] = payload.num_regs;
8345         payload.num_regs += payload_width / 8;
8346      }
8347
8348      /* R29-30: interpolated W set if GFX6_WM_USES_SOURCE_W. */
8349      if (prog_data->uses_src_w) {
8350         payload.source_w_reg[j] = payload.num_regs;
8351         payload.num_regs += payload_width / 8;
8352      }
8353
8354      /* R31: MSAA position offsets. */
8355      if (prog_data->uses_pos_offset) {
8356         payload.sample_pos_reg[j] = payload.num_regs;
8357         payload.num_regs++;
8358      }
8359
8360      /* R32-33: MSAA input coverage mask */
8361      if (prog_data->uses_sample_mask) {
8362         assert(devinfo->ver >= 7);
8363         payload.sample_mask_in_reg[j] = payload.num_regs;
8364         payload.num_regs += payload_width / 8;
8365      }
8366
8367      /* R66: Source Depth and/or W Attribute Vertex Deltas */
8368      if (prog_data->uses_depth_w_coefficients) {
8369         payload.depth_w_coef_reg[j] = payload.num_regs;
8370         payload.num_regs++;
8371      }
8372   }
8373
8374   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
8375      source_depth_to_render_target = true;
8376   }
8377}
8378
8379void
8380fs_visitor::setup_vs_payload()
8381{
8382   /* R0: thread header, R1: urb handles */
8383   payload.num_regs = 2;
8384}
8385
8386void
8387fs_visitor::setup_gs_payload()
8388{
8389   assert(stage == MESA_SHADER_GEOMETRY);
8390
8391   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
8392   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
8393
8394   /* R0: thread header, R1: output URB handles */
8395   payload.num_regs = 2;
8396
8397   if (gs_prog_data->include_primitive_id) {
8398      /* R2: Primitive ID 0..7 */
8399      payload.num_regs++;
8400   }
8401
8402   /* Always enable VUE handles so we can safely use pull model if needed.
8403    *
8404    * The push model for a GS uses a ton of register space even for trivial
8405    * scenarios with just a few inputs, so just make things easier and a bit
8406    * safer by always having pull model available.
8407    */
8408   gs_prog_data->base.include_vue_handles = true;
8409
8410   /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
8411   payload.num_regs += nir->info.gs.vertices_in;
8412
8413   /* Use a maximum of 24 registers for push-model inputs. */
8414   const unsigned max_push_components = 24;
8415
8416   /* If pushing our inputs would take too many registers, reduce the URB read
8417    * length (which is in HWords, or 8 registers), and resort to pulling.
8418    *
8419    * Note that the GS reads <URB Read Length> HWords for every vertex - so we
8420    * have to multiply by VerticesIn to obtain the total storage requirement.
8421    */
8422   if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
8423       max_push_components) {
8424      vue_prog_data->urb_read_length =
8425         ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
8426   }
8427}
8428
8429void
8430fs_visitor::setup_cs_payload()
8431{
8432   assert(devinfo->ver >= 7);
8433   /* TODO: Fill out uses_btd_stack_ids automatically */
8434   payload.num_regs = 1 + brw_cs_prog_data(prog_data)->uses_btd_stack_ids;
8435}
8436
8437brw::register_pressure::register_pressure(const fs_visitor *v)
8438{
8439   const fs_live_variables &live = v->live_analysis.require();
8440   const unsigned num_instructions = v->cfg->num_blocks ?
8441      v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
8442
8443   regs_live_at_ip = new unsigned[num_instructions]();
8444
8445   for (unsigned reg = 0; reg < v->alloc.count; reg++) {
8446      for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
8447         regs_live_at_ip[ip] += v->alloc.sizes[reg];
8448   }
8449}
8450
8451brw::register_pressure::~register_pressure()
8452{
8453   delete[] regs_live_at_ip;
8454}
8455
8456void
8457fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
8458{
8459   backend_shader::invalidate_analysis(c);
8460   live_analysis.invalidate(c);
8461   regpressure_analysis.invalidate(c);
8462}
8463
/**
 * Run the optimization and lowering passes over the shader IR.
 *
 * Most passes are invoked through the local OPT() macro, which validates the
 * IR after every pass and, when INTEL_DEBUG(DEBUG_OPTIMIZER) is set, dumps
 * the instructions to a file whenever a pass reports progress.
 */
8464void
8465fs_visitor::optimize()
8466{
8467   /* Start by validating the shader we currently have. */
8468   validate();
8469
8470   /* bld is the common builder object pointing at the end of the program we
8471    * used to translate it into i965 IR.  For the optimization and lowering
8472    * passes coming next, any code added after the end of the program without
8473    * having explicitly called fs_builder::at() clearly points at a mistake.
8474    * Ideally optimization passes wouldn't be part of the visitor so they
8475    * wouldn't have access to bld at all, but they do, so just in case some
8476    * pass forgets to ask for a location explicitly set it to NULL here to
8477    * make it trip.  The dispatch width is initialized to a bogus value to
8478    * make sure that optimizations set the execution controls explicitly to
8479    * match the code they are manipulating instead of relying on the defaults.
8480    */
8481   bld = fs_builder(this, 64);
8482
8483   assign_constant_locations();
8484   lower_constant_loads();
8485
8486   validate();
8487
8488   split_virtual_grfs();
8489   validate();
8490
   /* OPT(pass, args...) runs a pass and evaluates to whether that pass made
    * progress.  It bumps pass_num, validates the IR afterwards, ORs the
    * result into `progress`, and uses `iteration` and `pass_num` to name the
    * debug dump file, so all three must be declared in the scope where the
    * macro is expanded.  The macro is not #undef'd at the end of this
    * function.
    */
8491#define OPT(pass, args...) ({                                           \
8492      pass_num++;                                                       \
8493      bool this_progress = pass(args);                                  \
8494                                                                        \
8495      if (INTEL_DEBUG(DEBUG_OPTIMIZER) && this_progress) {              \
8496         char filename[64];                                             \
8497         snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass,              \
8498                  stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
8499                                                                        \
8500         backend_shader::dump_instructions(filename);                   \
8501      }                                                                 \
8502                                                                        \
8503      validate();                                                       \
8504                                                                        \
8505      progress = progress || this_progress;                             \
8506      this_progress;                                                    \
8507   })
8508
   /* Dump the IR as it looks before any OPT() pass has run. */
8509   if (INTEL_DEBUG(DEBUG_OPTIMIZER)) {
8510      char filename[64];
8511      snprintf(filename, 64, "%s%d-%s-00-00-start",
8512               stage_abbrev, dispatch_width, nir->info.name);
8513
8514      backend_shader::dump_instructions(filename);
8515   }
8516
8517   bool progress = false;
8518   int iteration = 0;
8519   int pass_num = 0;
8520
8521   /* Before anything else, eliminate dead code.  The results of some NIR
8522    * instructions may effectively be calculated twice.  Once when the
8523    * instruction is encountered, and again when the user of that result is
8524    * encountered.  Wipe those away before algebraic optimizations and
8525    * especially copy propagation can mix things up.
8526    */
8527   OPT(dead_code_eliminate);
8528
8529   OPT(remove_extra_rounding_modes);
8530
   /* Main optimization loop: repeat the whole list of passes until a full
    * iteration goes by without any of them reporting progress.
    */
8531   do {
8532      progress = false;
8533      pass_num = 0;
8534      iteration++;
8535
8536      OPT(remove_duplicate_mrf_writes);
8537
8538      OPT(opt_algebraic);
8539      OPT(opt_cse);
8540      OPT(opt_copy_propagation);
8541      OPT(opt_predicated_break, this);
8542      OPT(opt_cmod_propagation);
8543      OPT(dead_code_eliminate);
8544      OPT(opt_peephole_sel);
8545      OPT(dead_control_flow_eliminate, this);
8546      OPT(opt_register_renaming);
8547      OPT(opt_saturate_propagation);
8548      OPT(register_coalesce);
8549      OPT(compute_to_mrf);
8550      OPT(eliminate_find_live_channel);
8551
8552      OPT(compact_virtual_grfs);
8553   } while (progress);
8554
   /* Start over so that the `if (progress)` check further down only reflects
    * the lowering passes run from here on.
    */
8555   progress = false;
8556   pass_num = 0;
8557
8558   if (OPT(lower_pack)) {
8559      OPT(register_coalesce);
8560      OPT(dead_code_eliminate);
8561   }
8562
8563   OPT(lower_simd_width);
8564   OPT(lower_barycentrics);
8565   OPT(lower_logical_sends);
8566
8567   /* After logical SEND lowering. */
8568   OPT(fixup_nomask_control_flow);
8569
8570   if (progress) {
8571      OPT(opt_copy_propagation);
8572      /* Only run after logical send lowering because it's easier to implement
8573       * in terms of physical sends.
8574       */
8575      if (OPT(opt_zero_samples))
8576         OPT(opt_copy_propagation);
8577      /* Run after logical send lowering to give it a chance to CSE the
8578       * LOAD_PAYLOAD instructions created to construct the payloads of
8579       * e.g. texturing messages in cases where it wasn't possible to CSE the
8580       * whole logical instruction.
8581       */
8582      OPT(opt_cse);
8583      OPT(register_coalesce);
8584      OPT(compute_to_mrf);
8585      OPT(dead_code_eliminate);
8586      OPT(remove_duplicate_mrf_writes);
8587      OPT(opt_peephole_sel);
8588   }
8589
8590   OPT(opt_redundant_halt);
8591
8592   if (OPT(lower_load_payload)) {
8593      split_virtual_grfs();
8594
8595      /* Lower 64 bit MOVs generated by payload lowering. */
8596      if (!devinfo->has_64bit_float && !devinfo->has_64bit_int)
8597         OPT(opt_algebraic);
8598
8599      OPT(register_coalesce);
8600      OPT(lower_simd_width);
8601      OPT(compute_to_mrf);
8602      OPT(dead_code_eliminate);
8603   }
8604
8605   OPT(opt_combine_constants);
8606   if (OPT(lower_integer_multiplication)) {
8607      /* If lower_integer_multiplication made progress, it may have produced
8608       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
8609       * one more time to clean those up if they exist.
8610       */
8611      OPT(lower_integer_multiplication);
8612   }
8613   OPT(lower_sub_sat);
8614
   /* lower_minmax is only attempted on ver <= 5; the clean-up passes run
    * only if it made progress.
    */
8615   if (devinfo->ver <= 5 && OPT(lower_minmax)) {
8616      OPT(opt_cmod_propagation);
8617      OPT(opt_cse);
8618      OPT(opt_copy_propagation);
8619      OPT(dead_code_eliminate);
8620   }
8621
   /* Track only the progress of derivative and regioning lowering for the
    * clean-up below.
    */
8622   progress = false;
8623   OPT(lower_derivatives);
8624   OPT(lower_regioning);
8625   if (progress) {
8626      OPT(opt_copy_propagation);
8627      OPT(dead_code_eliminate);
8628      OPT(lower_simd_width);
8629   }
8630
8631   OPT(fixup_sends_duplicate_payload);
8632
8633   lower_uniform_pull_constant_loads();
8634
8635   validate();
8636}
8638/**
8639 * From the Skylake PRM Vol. 2a docs for sends:
8640 *
8641 *    "It is required that the second block of GRFs does not overlap with the
8642 *    first block."
8643 *
8644 * There are plenty of cases where we may accidentally violate this due to
8645 * having, for instance, both sources be the constant 0.  This little pass
8646 * just adds a new vgrf for the second payload and copies it over.
8647 */
8648bool
8649fs_visitor::fixup_sends_duplicate_payload()
8650{
8651   bool progress = false;
8652
8653   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
8654      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
8655          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
8656                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
8657         fs_reg tmp = fs_reg(VGRF, alloc.allocate(inst->ex_mlen),
8658                             BRW_REGISTER_TYPE_UD);
8659         /* Sadly, we've lost all notion of channels and bit sizes at this
8660          * point.  Just WE_all it.
8661          */
8662         const fs_builder ibld = bld.at(block, inst).exec_all().group(16, 0);
8663         fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
8664         fs_reg copy_dst = tmp;
8665         for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
8666            if (inst->ex_mlen == i + 1) {
8667               /* Only one register left; do SIMD8 */
8668               ibld.group(8, 0).MOV(copy_dst, copy_src);
8669            } else {
8670               ibld.MOV(copy_dst, copy_src);
8671            }
8672            copy_src = offset(copy_src, ibld, 1);
8673            copy_dst = offset(copy_dst, ibld, 1);
8674         }
8675         inst->src[3] = tmp;
8676         progress = true;
8677      }
8678   }
8679
8680   if (progress)
8681      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
8682
8683   return progress;
8684}
8685
8686/**
8687 * Three source instruction must have a GRF/MRF destination register.
8688 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
8689 */
8690void
8691fs_visitor::fixup_3src_null_dest()
8692{
8693   bool progress = false;
8694
8695   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
8696      if (inst->is_3src(devinfo) && inst->dst.is_null()) {
8697         inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
8698                            inst->dst.type);
8699         progress = true;
8700      }
8701   }
8702
8703   if (progress)
8704      invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
8705                          DEPENDENCY_VARIABLES);
8706}
8707
8708/**
8709 * Find the first instruction in the program that might start a region of
8710 * divergent control flow due to a HALT jump.  There is no
8711 * find_halt_control_flow_region_end(), the region of divergence extends until
8712 * the only SHADER_OPCODE_HALT_TARGET in the program.
8713 */
8714static const fs_inst *
8715find_halt_control_flow_region_start(const fs_visitor *v)
8716{
8717   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
8718      if (inst->opcode == BRW_OPCODE_HALT ||
8719          inst->opcode == SHADER_OPCODE_HALT_TARGET)
8720         return inst;
8721   }
8722
8723   return NULL;
8724}
8725
8726/**
8727 * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
8728 * can cause a BB to be executed with all channels disabled, which will lead
8729 * to the execution of any NoMask instructions in it, even though any
8730 * execution-masked instructions will be correctly shot down.  This may break
8731 * assumptions of some NoMask SEND messages whose descriptor depends on data
8732 * generated by live invocations of the shader.
8733 *
8734 * This avoids the problem by predicating certain instructions on an ANY
8735 * horizontal predicate that makes sure that their execution is omitted when
8736 * all channels of the program are disabled.
8737 */
8738bool
8739fs_visitor::fixup_nomask_control_flow()
8740{
8741   if (devinfo->ver != 12)
8742      return false;
8743
8744   const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
8745                              dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
8746                              BRW_PREDICATE_ALIGN1_ANY8H;
8747   const fs_inst *halt_start = find_halt_control_flow_region_start(this);
8748   unsigned depth = 0;
8749   bool progress = false;
8750
8751   const fs_live_variables &live_vars = live_analysis.require();
8752
8753   /* Scan the program backwards in order to be able to easily determine
8754    * whether the flag register is live at any point.
8755    */
8756   foreach_block_reverse_safe(block, cfg) {
8757      BITSET_WORD flag_liveout = live_vars.block_data[block->num]
8758                                               .flag_liveout[0];
8759      STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);
8760
8761      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
8762         if (!inst->predicate && inst->exec_size >= 8)
8763            flag_liveout &= ~inst->flags_written(devinfo);
8764
8765         switch (inst->opcode) {
8766         case BRW_OPCODE_DO:
8767         case BRW_OPCODE_IF:
8768            /* Note that this doesn't handle BRW_OPCODE_HALT since only
8769             * the first one in the program closes the region of divergent
8770             * control flow due to any HALT instructions -- Instead this is
8771             * handled with the halt_start check below.
8772             */
8773            depth--;
8774            break;
8775
8776         case BRW_OPCODE_WHILE:
8777         case BRW_OPCODE_ENDIF:
8778         case SHADER_OPCODE_HALT_TARGET:
8779            depth++;
8780            break;
8781
8782         default:
8783            /* Note that the vast majority of NoMask SEND instructions in the
8784             * program are harmless while executed in a block with all
8785             * channels disabled, since any instructions with side effects we
8786             * could hit here should be execution-masked.
8787             *
8788             * The main concern is NoMask SEND instructions where the message
8789             * descriptor or header depends on data generated by live
8790             * invocations of the shader (RESINFO and
8791             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
8792             * computed surface index seem to be the only examples right now
8793             * where this could easily lead to GPU hangs).  Unfortunately we
8794             * have no straightforward way to detect that currently, so just
8795             * predicate any NoMask SEND instructions we find under control
8796             * flow.
8797             *
8798             * If this proves to have a measurable performance impact it can
8799             * be easily extended with a whitelist of messages we know we can
8800             * safely omit the predication for.
8801             */
8802            if (depth && inst->force_writemask_all &&
8803                is_send(inst) && !inst->predicate) {
8804               /* We need to load the execution mask into the flag register by
8805                * using a builder with channel group matching the whole shader
8806                * (rather than the default which is derived from the original
8807                * instruction), in order to avoid getting a right-shifted
8808                * value.
8809                */
8810               const fs_builder ubld = fs_builder(this, block, inst)
8811                                       .exec_all().group(dispatch_width, 0);
8812               const fs_reg flag = retype(brw_flag_reg(0, 0),
8813                                          BRW_REGISTER_TYPE_UD);
8814
8815               /* Due to the lack of flag register allocation we need to save
8816                * and restore the flag register if it's live.
8817                */
8818               const bool save_flag = flag_liveout &
8819                                      flag_mask(flag, dispatch_width / 8);
8820               const fs_reg tmp = ubld.group(1, 0).vgrf(flag.type);
8821
8822               if (save_flag)
8823                  ubld.group(1, 0).MOV(tmp, flag);
8824
8825               ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);
8826
8827               set_predicate(pred, inst);
8828               inst->flag_subreg = 0;
8829
8830               if (save_flag)
8831                  ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);
8832
8833               progress = true;
8834            }
8835            break;
8836         }
8837
8838         if (inst == halt_start)
8839            depth--;
8840
8841         flag_liveout |= inst->flags_read(devinfo);
8842      }
8843   }
8844
8845   if (progress)
8846      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
8847
8848   return progress;
8849}
8850
8851void
8852fs_visitor::allocate_registers(bool allow_spilling)
8853{
8854   bool allocated;
8855
8856   static const enum instruction_scheduler_mode pre_modes[] = {
8857      SCHEDULE_PRE,
8858      SCHEDULE_PRE_NON_LIFO,
8859      SCHEDULE_PRE_LIFO,
8860   };
8861
8862   static const char *scheduler_mode_name[] = {
8863      "top-down",
8864      "non-lifo",
8865      "lifo"
8866   };
8867
8868   bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
8869
8870   /* Try each scheduling heuristic to see if it can successfully register
8871    * allocate without spilling.  They should be ordered by decreasing
8872    * performance but increasing likelihood of allocating.
8873    */
8874   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
8875      schedule_instructions(pre_modes[i]);
8876      this->shader_stats.scheduler_mode = scheduler_mode_name[i];
8877
8878      if (0) {
8879         assign_regs_trivial();
8880         allocated = true;
8881         break;
8882      }
8883
8884      /* Scheduling may create additional opportunities for CMOD propagation,
8885       * so let's do it again.  If CMOD propagation made any progress,
8886       * eliminate dead code one more time.
8887       */
8888      bool progress = false;
8889      const int iteration = 99;
8890      int pass_num = 0;
8891
8892      if (OPT(opt_cmod_propagation)) {
8893         /* dead_code_eliminate "undoes" the fixing done by
8894          * fixup_3src_null_dest, so we have to do it again if
8895          * dead_code_eliminiate makes any progress.
8896          */
8897         if (OPT(dead_code_eliminate))
8898            fixup_3src_null_dest();
8899      }
8900
8901      bool can_spill = allow_spilling &&
8902                       (i == ARRAY_SIZE(pre_modes) - 1);
8903
8904      /* We should only spill registers on the last scheduling. */
8905      assert(!spilled_any_registers);
8906
8907      allocated = assign_regs(can_spill, spill_all);
8908      if (allocated)
8909         break;
8910   }
8911
8912   if (!allocated) {
8913      fail("Failure to register allocate.  Reduce number of "
8914           "live scalar values to avoid this.");
8915   } else if (spilled_any_registers) {
8916      brw_shader_perf_log(compiler, log_data,
8917                          "%s shader triggered register spilling.  "
8918                          "Try reducing the number of live scalar "
8919                          "values to improve performance.\n",
8920                          stage_name);
8921   }
8922
8923   /* This must come after all optimization and register allocation, since
8924    * it inserts dead code that happens to have side effects, and it does
8925    * so based on the actual physical registers in use.
8926    */
8927   insert_gfx4_send_dependency_workarounds();
8928
8929   if (failed)
8930      return;
8931
8932   opt_bank_conflicts();
8933
8934   schedule_instructions(SCHEDULE_POST);
8935
8936   if (last_scratch > 0) {
8937      ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
8938
8939      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
8940
8941      if (stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) {
8942         if (devinfo->is_haswell) {
8943            /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
8944             * field documentation, Haswell supports a minimum of 2kB of
8945             * scratch space for compute shaders, unlike every other stage
8946             * and platform.
8947             */
8948            prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
8949         } else if (devinfo->ver <= 7) {
8950            /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
8951             * field documentation, platforms prior to Haswell measure scratch
8952             * size linearly with a range of [1kB, 12kB] and 1kB granularity.
8953             */
8954            prog_data->total_scratch = ALIGN(last_scratch, 1024);
8955            max_scratch_size = 12 * 1024;
8956         }
8957      }
8958
8959      /* We currently only support up to 2MB of scratch space.  If we
8960       * need to support more eventually, the documentation suggests
8961       * that we could allocate a larger buffer, and partition it out
8962       * ourselves.  We'd just have to undo the hardware's address
8963       * calculation by subtracting (FFTID * Per Thread Scratch Space)
8964       * and then add FFTID * (Larger Per Thread Scratch Space).
8965       *
8966       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
8967       * Thread Group Tracking > Local Memory/Scratch Space.
8968       */
8969      assert(prog_data->total_scratch < max_scratch_size);
8970   }
8971
8972   lower_scoreboard();
8973}
8974
8975bool
8976fs_visitor::run_vs()
8977{
8978   assert(stage == MESA_SHADER_VERTEX);
8979
8980   setup_vs_payload();
8981
8982   if (shader_time_index >= 0)
8983      emit_shader_time_begin();
8984
8985   emit_nir_code();
8986
8987   if (failed)
8988      return false;
8989
8990   emit_urb_writes();
8991
8992   if (shader_time_index >= 0)
8993      emit_shader_time_end();
8994
8995   calculate_cfg();
8996
8997   optimize();
8998
8999   assign_curb_setup();
9000   assign_vs_urb_setup();
9001
9002   fixup_3src_null_dest();
9003   allocate_registers(true /* allow_spilling */);
9004
9005   return !failed;
9006}
9007
9008void
9009fs_visitor::set_tcs_invocation_id()
9010{
9011   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
9012   struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
9013
9014   const unsigned instance_id_mask =
9015      devinfo->ver >= 11 ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
9016   const unsigned instance_id_shift =
9017      devinfo->ver >= 11 ? 16 : 17;
9018
9019   /* Get instance number from g0.2 bits 22:16 or 23:17 */
9020   fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
9021   bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
9022           brw_imm_ud(instance_id_mask));
9023
9024   invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
9025
9026   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH) {
9027      /* gl_InvocationID is just the thread number */
9028      bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift));
9029      return;
9030   }
9031
9032   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH);
9033
9034   fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
9035   fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
9036   bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
9037   bld.MOV(channels_ud, channels_uw);
9038
9039   if (tcs_prog_data->instances == 1) {
9040      invocation_id = channels_ud;
9041   } else {
9042      fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
9043      bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3));
9044      bld.ADD(invocation_id, instance_times_8, channels_ud);
9045   }
9046}
9047
9048bool
9049fs_visitor::run_tcs()
9050{
9051   assert(stage == MESA_SHADER_TESS_CTRL);
9052
9053   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
9054   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
9055   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
9056
9057   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH ||
9058          vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
9059
9060   if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
9061      /* r1-r4 contain the ICP handles. */
9062      payload.num_regs = 5;
9063   } else {
9064      assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
9065      assert(tcs_key->input_vertices > 0);
9066      /* r1 contains output handles, r2 may contain primitive ID, then the
9067       * ICP handles occupy the next 1-32 registers.
9068       */
9069      payload.num_regs = 2 + tcs_prog_data->include_primitive_id +
9070                         tcs_key->input_vertices;
9071   }
9072
9073   if (shader_time_index >= 0)
9074      emit_shader_time_begin();
9075
9076   /* Initialize gl_InvocationID */
9077   set_tcs_invocation_id();
9078
9079   const bool fix_dispatch_mask =
9080      vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH &&
9081      (nir->info.tess.tcs_vertices_out % 8) != 0;
9082
9083   /* Fix the disptach mask */
9084   if (fix_dispatch_mask) {
9085      bld.CMP(bld.null_reg_ud(), invocation_id,
9086              brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
9087      bld.IF(BRW_PREDICATE_NORMAL);
9088   }
9089
9090   emit_nir_code();
9091
9092   if (fix_dispatch_mask) {
9093      bld.emit(BRW_OPCODE_ENDIF);
9094   }
9095
9096   /* Emit EOT write; set TR DS Cache bit */
9097   fs_reg srcs[3] = {
9098      fs_reg(get_tcs_output_urb_handle()),
9099      fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
9100      fs_reg(brw_imm_ud(0)),
9101   };
9102   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
9103   bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
9104
9105   fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
9106                            bld.null_reg_ud(), payload);
9107   inst->mlen = 3;
9108   inst->eot = true;
9109
9110   if (shader_time_index >= 0)
9111      emit_shader_time_end();
9112
9113   if (failed)
9114      return false;
9115
9116   calculate_cfg();
9117
9118   optimize();
9119
9120   assign_curb_setup();
9121   assign_tcs_urb_setup();
9122
9123   fixup_3src_null_dest();
9124   allocate_registers(true /* allow_spilling */);
9125
9126   return !failed;
9127}
9128
9129bool
9130fs_visitor::run_tes()
9131{
9132   assert(stage == MESA_SHADER_TESS_EVAL);
9133
9134   /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
9135   payload.num_regs = 5;
9136
9137   if (shader_time_index >= 0)
9138      emit_shader_time_begin();
9139
9140   emit_nir_code();
9141
9142   if (failed)
9143      return false;
9144
9145   emit_urb_writes();
9146
9147   if (shader_time_index >= 0)
9148      emit_shader_time_end();
9149
9150   calculate_cfg();
9151
9152   optimize();
9153
9154   assign_curb_setup();
9155   assign_tes_urb_setup();
9156
9157   fixup_3src_null_dest();
9158   allocate_registers(true /* allow_spilling */);
9159
9160   return !failed;
9161}
9162
9163bool
9164fs_visitor::run_gs()
9165{
9166   assert(stage == MESA_SHADER_GEOMETRY);
9167
9168   setup_gs_payload();
9169
9170   this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
9171
9172   if (gs_compile->control_data_header_size_bits > 0) {
9173      /* Create a VGRF to store accumulated control data bits. */
9174      this->control_data_bits = vgrf(glsl_type::uint_type);
9175
9176      /* If we're outputting more than 32 control data bits, then EmitVertex()
9177       * will set control_data_bits to 0 after emitting the first vertex.
9178       * Otherwise, we need to initialize it to 0 here.
9179       */
9180      if (gs_compile->control_data_header_size_bits <= 32) {
9181         const fs_builder abld = bld.annotate("initialize control data bits");
9182         abld.MOV(this->control_data_bits, brw_imm_ud(0u));
9183      }
9184   }
9185
9186   if (shader_time_index >= 0)
9187      emit_shader_time_begin();
9188
9189   emit_nir_code();
9190
9191   emit_gs_thread_end();
9192
9193   if (shader_time_index >= 0)
9194      emit_shader_time_end();
9195
9196   if (failed)
9197      return false;
9198
9199   calculate_cfg();
9200
9201   optimize();
9202
9203   assign_curb_setup();
9204   assign_gs_urb_setup();
9205
9206   fixup_3src_null_dest();
9207   allocate_registers(true /* allow_spilling */);
9208
9209   return !failed;
9210}
9211
9212/* From the SKL PRM, Volume 16, Workarounds:
9213 *
9214 *   0877  3D   Pixel Shader Hang possible when pixel shader dispatched with
9215 *              only header phases (R0-R2)
9216 *
9217 *   WA: Enable a non-header phase (e.g. push constant) when dispatch would
9218 *       have been header only.
9219 *
9220 * Instead of enabling push constants one can alternatively enable one of the
9221 * inputs. Here one simply chooses "layer" which shouldn't impose much
9222 * overhead.
9223 */
9224static void
9225gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
9226{
9227   if (wm_prog_data->num_varying_inputs)
9228      return;
9229
9230   if (wm_prog_data->base.curb_read_length)
9231      return;
9232
9233   wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
9234   wm_prog_data->num_varying_inputs = 1;
9235
9236   brw_compute_urb_setup_index(wm_prog_data);
9237}
9238
9239bool
9240fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
9241{
9242   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
9243   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
9244
9245   assert(stage == MESA_SHADER_FRAGMENT);
9246
9247   if (devinfo->ver >= 6)
9248      setup_fs_payload_gfx6();
9249   else
9250      setup_fs_payload_gfx4();
9251
9252   if (0) {
9253      emit_dummy_fs();
9254   } else if (do_rep_send) {
9255      assert(dispatch_width == 16);
9256      emit_repclear_shader();
9257   } else {
9258      if (shader_time_index >= 0)
9259         emit_shader_time_begin();
9260
9261      if (nir->info.inputs_read > 0 ||
9262          BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
9263          (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
9264         if (devinfo->ver < 6)
9265            emit_interpolation_setup_gfx4();
9266         else
9267            emit_interpolation_setup_gfx6();
9268      }
9269
9270      /* We handle discards by keeping track of the still-live pixels in f0.1.
9271       * Initialize it with the dispatched pixels.
9272       */
9273      if (wm_prog_data->uses_kill) {
9274         const unsigned lower_width = MIN2(dispatch_width, 16);
9275         for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
9276            const fs_reg dispatch_mask =
9277               devinfo->ver >= 6 ? brw_vec1_grf((i ? 2 : 1), 7) :
9278               brw_vec1_grf(0, 0);
9279            bld.exec_all().group(1, 0)
9280               .MOV(sample_mask_reg(bld.group(lower_width, i)),
9281                    retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
9282         }
9283      }
9284
9285      if (nir->info.writes_memory)
9286         wm_prog_data->has_side_effects = true;
9287
9288      emit_nir_code();
9289
9290      if (failed)
9291	 return false;
9292
9293      if (wm_key->alpha_test_func)
9294         emit_alpha_test();
9295
9296      emit_fb_writes();
9297
9298      if (shader_time_index >= 0)
9299         emit_shader_time_end();
9300
9301      calculate_cfg();
9302
9303      optimize();
9304
9305      assign_curb_setup();
9306
9307      if (devinfo->ver >= 9)
9308         gfx9_ps_header_only_workaround(wm_prog_data);
9309
9310      assign_urb_setup();
9311
9312      fixup_3src_null_dest();
9313
9314      allocate_registers(allow_spilling);
9315
9316      if (failed)
9317         return false;
9318   }
9319
9320   return !failed;
9321}
9322
9323bool
9324fs_visitor::run_cs(bool allow_spilling)
9325{
9326   assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
9327
9328   setup_cs_payload();
9329
9330   if (shader_time_index >= 0)
9331      emit_shader_time_begin();
9332
9333   if (devinfo->is_haswell && prog_data->total_shared > 0) {
9334      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
9335      const fs_builder abld = bld.exec_all().group(1, 0);
9336      abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
9337               suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
9338   }
9339
9340   emit_nir_code();
9341
9342   if (failed)
9343      return false;
9344
9345   emit_cs_terminate();
9346
9347   if (shader_time_index >= 0)
9348      emit_shader_time_end();
9349
9350   calculate_cfg();
9351
9352   optimize();
9353
9354   assign_curb_setup();
9355
9356   fixup_3src_null_dest();
9357   allocate_registers(allow_spilling);
9358
9359   if (failed)
9360      return false;
9361
9362   return !failed;
9363}
9364
9365bool
9366fs_visitor::run_bs(bool allow_spilling)
9367{
9368   assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);
9369
9370   /* R0: thread header, R1: stack IDs, R2: argument addresses */
9371   payload.num_regs = 3;
9372
9373   if (shader_time_index >= 0)
9374      emit_shader_time_begin();
9375
9376   emit_nir_code();
9377
9378   if (failed)
9379      return false;
9380
9381   /* TODO(RT): Perhaps rename this? */
9382   emit_cs_terminate();
9383
9384   if (shader_time_index >= 0)
9385      emit_shader_time_end();
9386
9387   calculate_cfg();
9388
9389   optimize();
9390
9391   assign_curb_setup();
9392
9393   fixup_3src_null_dest();
9394   allocate_registers(allow_spilling);
9395
9396   if (failed)
9397      return false;
9398
9399   return !failed;
9400}
9401
9402static bool
9403is_used_in_not_interp_frag_coord(nir_ssa_def *def)
9404{
9405   nir_foreach_use(src, def) {
9406      if (src->parent_instr->type != nir_instr_type_intrinsic)
9407         return true;
9408
9409      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src->parent_instr);
9410      if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
9411         return true;
9412   }
9413
9414   nir_foreach_if_use(src, def)
9415      return true;
9416
9417   return false;
9418}
9419
9420/**
9421 * Return a bitfield where bit n is set if barycentric interpolation mode n
9422 * (see enum brw_barycentric_mode) is needed by the fragment shader.
9423 *
9424 * We examine the load_barycentric intrinsics rather than looking at input
9425 * variables so that we catch interpolateAtCentroid() messages too, which
9426 * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
9427 */
9428static unsigned
9429brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
9430                                     const nir_shader *shader)
9431{
9432   unsigned barycentric_interp_modes = 0;
9433
9434   nir_foreach_function(f, shader) {
9435      if (!f->impl)
9436         continue;
9437
9438      nir_foreach_block(block, f->impl) {
9439         nir_foreach_instr(instr, block) {
9440            if (instr->type != nir_instr_type_intrinsic)
9441               continue;
9442
9443            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
9444            switch (intrin->intrinsic) {
9445            case nir_intrinsic_load_barycentric_pixel:
9446            case nir_intrinsic_load_barycentric_centroid:
9447            case nir_intrinsic_load_barycentric_sample:
9448               break;
9449            default:
9450               continue;
9451            }
9452
9453            /* Ignore WPOS; it doesn't require interpolation. */
9454            assert(intrin->dest.is_ssa);
9455            if (!is_used_in_not_interp_frag_coord(&intrin->dest.ssa))
9456               continue;
9457
9458            enum glsl_interp_mode interp = (enum glsl_interp_mode)
9459               nir_intrinsic_interp_mode(intrin);
9460            nir_intrinsic_op bary_op = intrin->intrinsic;
9461            enum brw_barycentric_mode bary =
9462               brw_barycentric_mode(interp, bary_op);
9463
9464            barycentric_interp_modes |= 1 << bary;
9465
9466            if (devinfo->needs_unlit_centroid_workaround &&
9467                bary_op == nir_intrinsic_load_barycentric_centroid)
9468               barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
9469         }
9470      }
9471   }
9472
9473   return barycentric_interp_modes;
9474}
9475
9476static void
9477brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
9478                        const nir_shader *shader)
9479{
9480   prog_data->flat_inputs = 0;
9481
9482   nir_foreach_shader_in_variable(var, shader) {
9483      unsigned slots = glsl_count_attribute_slots(var->type, false);
9484      for (unsigned s = 0; s < slots; s++) {
9485         int input_index = prog_data->urb_setup[var->data.location + s];
9486
9487         if (input_index < 0)
9488            continue;
9489
9490         /* flat shading */
9491         if (var->data.interpolation == INTERP_MODE_FLAT)
9492            prog_data->flat_inputs |= 1 << input_index;
9493      }
9494   }
9495}
9496
9497static uint8_t
9498computed_depth_mode(const nir_shader *shader)
9499{
9500   if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
9501      switch (shader->info.fs.depth_layout) {
9502      case FRAG_DEPTH_LAYOUT_NONE:
9503      case FRAG_DEPTH_LAYOUT_ANY:
9504         return BRW_PSCDEPTH_ON;
9505      case FRAG_DEPTH_LAYOUT_GREATER:
9506         return BRW_PSCDEPTH_ON_GE;
9507      case FRAG_DEPTH_LAYOUT_LESS:
9508         return BRW_PSCDEPTH_ON_LE;
9509      case FRAG_DEPTH_LAYOUT_UNCHANGED:
9510         return BRW_PSCDEPTH_OFF;
9511      }
9512   }
9513   return BRW_PSCDEPTH_OFF;
9514}
9515
9516/**
9517 * Move load_interpolated_input with simple (payload-based) barycentric modes
9518 * to the top of the program so we don't emit multiple PLNs for the same input.
9519 *
9520 * This works around CSE not being able to handle non-dominating cases
9521 * such as:
9522 *
9523 *    if (...) {
9524 *       interpolate input
9525 *    } else {
9526 *       interpolate the same exact input
9527 *    }
9528 *
9529 * This should be replaced by global value numbering someday.
9530 */
9531bool
9532brw_nir_move_interpolation_to_top(nir_shader *nir)
9533{
9534   bool progress = false;
9535
9536   nir_foreach_function(f, nir) {
9537      if (!f->impl)
9538         continue;
9539
9540      nir_block *top = nir_start_block(f->impl);
9541      exec_node *cursor_node = NULL;
9542
9543      nir_foreach_block(block, f->impl) {
9544         if (block == top)
9545            continue;
9546
9547         nir_foreach_instr_safe(instr, block) {
9548            if (instr->type != nir_instr_type_intrinsic)
9549               continue;
9550
9551            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
9552            if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
9553               continue;
9554            nir_intrinsic_instr *bary_intrinsic =
9555               nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
9556            nir_intrinsic_op op = bary_intrinsic->intrinsic;
9557
9558            /* Leave interpolateAtSample/Offset() where they are. */
9559            if (op == nir_intrinsic_load_barycentric_at_sample ||
9560                op == nir_intrinsic_load_barycentric_at_offset)
9561               continue;
9562
9563            nir_instr *move[3] = {
9564               &bary_intrinsic->instr,
9565               intrin->src[1].ssa->parent_instr,
9566               instr
9567            };
9568
9569            for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
9570               if (move[i]->block != top) {
9571                  move[i]->block = top;
9572                  exec_node_remove(&move[i]->node);
9573                  if (cursor_node) {
9574                     exec_node_insert_after(cursor_node, &move[i]->node);
9575                  } else {
9576                     exec_list_push_head(&top->instr_list, &move[i]->node);
9577                  }
9578                  cursor_node = &move[i]->node;
9579                  progress = true;
9580               }
9581            }
9582         }
9583      }
9584      nir_metadata_preserve(f->impl, nir_metadata_block_index |
9585                                     nir_metadata_dominance);
9586   }
9587
9588   return progress;
9589}
9590
9591static bool
9592brw_nir_demote_sample_qualifiers_instr(nir_builder *b,
9593                                       nir_instr *instr,
9594                                       UNUSED void *cb_data)
9595{
9596   if (instr->type != nir_instr_type_intrinsic)
9597      return false;
9598
9599   nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
9600   if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample &&
9601       intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)
9602      return false;
9603
9604   b->cursor = nir_before_instr(instr);
9605   nir_ssa_def *centroid =
9606      nir_load_barycentric(b, nir_intrinsic_load_barycentric_centroid,
9607                           nir_intrinsic_interp_mode(intrin));
9608   nir_ssa_def_rewrite_uses(&intrin->dest.ssa, centroid);
9609   nir_instr_remove(instr);
9610   return true;
9611}
9612
9613/**
9614 * Demote per-sample barycentric intrinsics to centroid.
9615 *
9616 * Useful when rendering to a non-multisampled buffer.
9617 */
9618bool
9619brw_nir_demote_sample_qualifiers(nir_shader *nir)
9620{
9621   return nir_shader_instructions_pass(nir,
9622                                       brw_nir_demote_sample_qualifiers_instr,
9623                                       nir_metadata_block_index |
9624                                       nir_metadata_dominance,
9625                                       NULL);
9626}
9627
9628void
9629brw_nir_populate_wm_prog_data(const nir_shader *shader,
9630                              const struct intel_device_info *devinfo,
9631                              const struct brw_wm_prog_key *key,
9632                              struct brw_wm_prog_data *prog_data)
9633{
9634   /* key->alpha_test_func means simulating alpha testing via discards,
9635    * so the shader definitely kills pixels.
9636    */
9637   prog_data->uses_kill = shader->info.fs.uses_discard ||
9638      key->alpha_test_func;
9639   prog_data->uses_omask = !key->ignore_sample_mask_out &&
9640      (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
9641   prog_data->computed_depth_mode = computed_depth_mode(shader);
9642   prog_data->computed_stencil =
9643      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
9644
9645   prog_data->persample_dispatch =
9646      key->multisample_fbo &&
9647      (key->persample_interp ||
9648       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_ID) ||
9649       BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
9650       shader->info.fs.uses_sample_qualifier ||
9651       shader->info.outputs_read);
9652
9653   if (devinfo->ver >= 6) {
9654      prog_data->uses_sample_mask =
9655         BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
9656
9657      /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
9658       *
9659       *    "MSDISPMODE_PERSAMPLE is required in order to select
9660       *    POSOFFSET_SAMPLE"
9661       *
9662       * So we can only really get sample positions if we are doing real
9663       * per-sample dispatch.  If we need gl_SamplePosition and we don't have
9664       * persample dispatch, we hard-code it to 0.5.
9665       */
9666      prog_data->uses_pos_offset = prog_data->persample_dispatch &&
9667         BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS);
9668   }
9669
9670   prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;
9671
9672   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
9673   prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
9674   prog_data->inner_coverage = shader->info.fs.inner_coverage;
9675
9676   prog_data->barycentric_interp_modes =
9677      brw_compute_barycentric_interp_modes(devinfo, shader);
9678
9679   prog_data->per_coarse_pixel_dispatch =
9680      key->coarse_pixel &&
9681      !prog_data->uses_omask &&
9682      !prog_data->persample_dispatch &&
9683      !prog_data->uses_sample_mask &&
9684      (prog_data->computed_depth_mode == BRW_PSCDEPTH_OFF) &&
9685      !prog_data->computed_stencil;
9686
9687   prog_data->uses_src_w =
9688      BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
9689   prog_data->uses_src_depth =
9690      BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
9691      !prog_data->per_coarse_pixel_dispatch;
9692   prog_data->uses_depth_w_coefficients =
9693      BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
9694      prog_data->per_coarse_pixel_dispatch;
9695
9696   calculate_urb_setup(devinfo, key, prog_data, shader);
9697   brw_compute_flat_inputs(prog_data, shader);
9698}
9699
9700/**
9701 * Pre-gfx6, the register file of the EUs was shared between threads,
9702 * and each thread used some subset allocated on a 16-register block
9703 * granularity.  The unit states wanted these block counts.
9704 */
9705static inline int
9706brw_register_blocks(int reg_count)
9707{
9708   return ALIGN(reg_count, 16) / 16 - 1;
9709}
9710
9711const unsigned *
9712brw_compile_fs(const struct brw_compiler *compiler,
9713               void *mem_ctx,
9714               struct brw_compile_fs_params *params)
9715{
9716   struct nir_shader *nir = params->nir;
9717   const struct brw_wm_prog_key *key = params->key;
9718   struct brw_wm_prog_data *prog_data = params->prog_data;
9719   bool allow_spilling = params->allow_spilling;
9720   const bool debug_enabled =
9721      INTEL_DEBUG(params->debug_flag ? params->debug_flag : DEBUG_WM);
9722
9723   prog_data->base.stage = MESA_SHADER_FRAGMENT;
9724
9725   const struct intel_device_info *devinfo = compiler->devinfo;
9726   const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
9727
9728   brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size, true);
9729   brw_nir_lower_fs_inputs(nir, devinfo, key);
9730   brw_nir_lower_fs_outputs(nir);
9731
9732   if (devinfo->ver < 6)
9733      brw_setup_vue_interpolation(params->vue_map, nir, prog_data);
9734
9735   /* From the SKL PRM, Volume 7, "Alpha Coverage":
9736    *  "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
9737    *   hardware, regardless of the state setting for this feature."
9738    */
9739   if (devinfo->ver > 6 && key->alpha_to_coverage) {
9740      /* Run constant fold optimization in order to get the correct source
9741       * offset to determine render target 0 store instruction in
9742       * emit_alpha_to_coverage pass.
9743       */
9744      NIR_PASS_V(nir, nir_opt_constant_folding);
9745      NIR_PASS_V(nir, brw_nir_lower_alpha_to_coverage);
9746   }
9747
9748   if (!key->multisample_fbo)
9749      NIR_PASS_V(nir, brw_nir_demote_sample_qualifiers);
9750   NIR_PASS_V(nir, brw_nir_move_interpolation_to_top);
9751   brw_postprocess_nir(nir, compiler, true, debug_enabled,
9752                       key->base.robust_buffer_access);
9753
9754   brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data);
9755
9756   fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
9757   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
9758   float throughput = 0;
9759   bool has_spilled = false;
9760
9761   v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
9762                       &prog_data->base, nir, 8,
9763                       params->shader_time ? params->shader_time_index8 : -1,
9764                       debug_enabled);
9765   if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
9766      params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
9767      delete v8;
9768      return NULL;
9769   } else if (!INTEL_DEBUG(DEBUG_NO8)) {
9770      simd8_cfg = v8->cfg;
9771      prog_data->base.dispatch_grf_start_reg = v8->payload.num_regs;
9772      prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used);
9773      const performance &perf = v8->performance_analysis.require();
9774      throughput = MAX2(throughput, perf.throughput);
9775      has_spilled = v8->spilled_any_registers;
9776      allow_spilling = false;
9777   }
9778
9779   /* Limit dispatch width to simd8 with dual source blending on gfx8.
9780    * See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917
9781    */
9782   if (devinfo->ver == 8 && prog_data->dual_src_blend &&
9783       !INTEL_DEBUG(DEBUG_NO8)) {
9784      assert(!params->use_rep_send);
9785      v8->limit_dispatch_width(8, "gfx8 workaround: "
9786                               "using SIMD8 when dual src blending.\n");
9787   }
9788
9789   if (key->coarse_pixel) {
9790      if (prog_data->dual_src_blend) {
9791         v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
9792                                  " use SIMD8 messages.\n");
9793      }
9794      v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
9795                               " pixel shading.\n");
9796   }
9797
9798   if (!has_spilled &&
9799       v8->max_dispatch_width >= 16 &&
9800       (!INTEL_DEBUG(DEBUG_NO16) || params->use_rep_send)) {
9801      /* Try a SIMD16 compile */
9802      v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
9803                           &prog_data->base, nir, 16,
9804                           params->shader_time ? params->shader_time_index16 : -1,
9805                           debug_enabled);
9806      v16->import_uniforms(v8);
9807      if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
9808         brw_shader_perf_log(compiler, params->log_data,
9809                             "SIMD16 shader failed to compile: %s\n",
9810                             v16->fail_msg);
9811      } else {
9812         simd16_cfg = v16->cfg;
9813         prog_data->dispatch_grf_start_reg_16 = v16->payload.num_regs;
9814         prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used);
9815         const performance &perf = v16->performance_analysis.require();
9816         throughput = MAX2(throughput, perf.throughput);
9817         has_spilled = v16->spilled_any_registers;
9818         allow_spilling = false;
9819      }
9820   }
9821
9822   const bool simd16_failed = v16 && !simd16_cfg;
9823
9824   /* Currently, the compiler only supports SIMD32 on SNB+ */
9825   if (!has_spilled &&
9826       v8->max_dispatch_width >= 32 && !params->use_rep_send &&
9827       devinfo->ver >= 6 && !simd16_failed &&
9828       !INTEL_DEBUG(DEBUG_NO32)) {
9829      /* Try a SIMD32 compile */
9830      v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
9831                           &prog_data->base, nir, 32,
9832                           params->shader_time ? params->shader_time_index32 : -1,
9833                           debug_enabled);
9834      v32->import_uniforms(v8);
9835      if (!v32->run_fs(allow_spilling, false)) {
9836         brw_shader_perf_log(compiler, params->log_data,
9837                             "SIMD32 shader failed to compile: %s\n",
9838                             v32->fail_msg);
9839      } else {
9840         const performance &perf = v32->performance_analysis.require();
9841
9842         if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
9843            brw_shader_perf_log(compiler, params->log_data,
9844                                "SIMD32 shader inefficient\n");
9845         } else {
9846            simd32_cfg = v32->cfg;
9847            prog_data->dispatch_grf_start_reg_32 = v32->payload.num_regs;
9848            prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used);
9849            throughput = MAX2(throughput, perf.throughput);
9850         }
9851      }
9852   }
9853
9854   /* When the caller requests a repclear shader, they want SIMD16-only */
9855   if (params->use_rep_send)
9856      simd8_cfg = NULL;
9857
9858   /* Prior to Iron Lake, the PS had a single shader offset with a jump table
9859    * at the top to select the shader.  We've never implemented that.
9860    * Instead, we just give them exactly one shader and we pick the widest one
9861    * available.
9862    */
9863   if (compiler->devinfo->ver < 5) {
9864      if (simd32_cfg || simd16_cfg)
9865         simd8_cfg = NULL;
9866      if (simd32_cfg)
9867         simd16_cfg = NULL;
9868   }
9869
9870   /* If computed depth is enabled SNB only allows SIMD8. */
9871   if (compiler->devinfo->ver == 6 &&
9872       prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF)
9873      assert(simd16_cfg == NULL && simd32_cfg == NULL);
9874
9875   if (compiler->devinfo->ver <= 5 && !simd8_cfg) {
9876      /* Iron lake and earlier only have one Dispatch GRF start field.  Make
9877       * the data available in the base prog data struct for convenience.
9878       */
9879      if (simd16_cfg) {
9880         prog_data->base.dispatch_grf_start_reg =
9881            prog_data->dispatch_grf_start_reg_16;
9882      } else if (simd32_cfg) {
9883         prog_data->base.dispatch_grf_start_reg =
9884            prog_data->dispatch_grf_start_reg_32;
9885      }
9886   }
9887
9888   if (prog_data->persample_dispatch) {
9889      /* Starting with SandyBridge (where we first get MSAA), the different
9890       * pixel dispatch combinations are grouped into classifications A
9891       * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1).  On most hardware
9892       * generations, the only configurations supporting persample dispatch
9893       * are those in which only one dispatch width is enabled.
9894       *
9895       * The Gfx12 hardware spec has a similar dispatch grouping table, but
9896       * the following conflicting restriction applies (from the page on
9897       * "Structure_3DSTATE_PS_BODY"), so we need to keep the SIMD16 shader:
9898       *
9899       *  "SIMD32 may only be enabled if SIMD16 or (dual)SIMD8 is also
9900       *   enabled."
9901       */
9902      if (simd32_cfg || simd16_cfg)
9903         simd8_cfg = NULL;
9904      if (simd32_cfg && devinfo->ver < 12)
9905         simd16_cfg = NULL;
9906   }
9907
9908   fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,
9909                  v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT);
9910
9911   if (unlikely(debug_enabled)) {
9912      g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
9913                                     nir->info.label ?
9914                                        nir->info.label : "unnamed",
9915                                     nir->info.name));
9916   }
9917
9918   struct brw_compile_stats *stats = params->stats;
9919
9920   if (simd8_cfg) {
9921      prog_data->dispatch_8 = true;
9922      g.generate_code(simd8_cfg, 8, v8->shader_stats,
9923                      v8->performance_analysis.require(), stats);
9924      stats = stats ? stats + 1 : NULL;
9925   }
9926
9927   if (simd16_cfg) {
9928      prog_data->dispatch_16 = true;
9929      prog_data->prog_offset_16 = g.generate_code(
9930         simd16_cfg, 16, v16->shader_stats,
9931         v16->performance_analysis.require(), stats);
9932      stats = stats ? stats + 1 : NULL;
9933   }
9934
9935   if (simd32_cfg) {
9936      prog_data->dispatch_32 = true;
9937      prog_data->prog_offset_32 = g.generate_code(
9938         simd32_cfg, 32, v32->shader_stats,
9939         v32->performance_analysis.require(), stats);
9940      stats = stats ? stats + 1 : NULL;
9941   }
9942
9943   g.add_const_data(nir->constant_data, nir->constant_data_size);
9944
9945   delete v8;
9946   delete v16;
9947   delete v32;
9948
9949   return g.get_assembly();
9950}
9951
9952fs_reg *
9953fs_visitor::emit_cs_work_group_id_setup()
9954{
9955   assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL);
9956
9957   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
9958
9959   struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
9960   struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
9961   struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
9962
9963   bld.MOV(*reg, r0_1);
9964   bld.MOV(offset(*reg, bld, 1), r0_6);
9965   bld.MOV(offset(*reg, bld, 2), r0_7);
9966
9967   return reg;
9968}
9969
9970unsigned
9971brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
9972                             unsigned threads)
9973{
9974   assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
9975   assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
9976   return cs_prog_data->push.per_thread.size * threads +
9977          cs_prog_data->push.cross_thread.size;
9978}
9979
9980static void
9981fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
9982{
9983   block->dwords = dwords;
9984   block->regs = DIV_ROUND_UP(dwords, 8);
9985   block->size = block->regs * 32;
9986}
9987
9988static void
9989cs_fill_push_const_info(const struct intel_device_info *devinfo,
9990                        struct brw_cs_prog_data *cs_prog_data)
9991{
9992   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
9993   int subgroup_id_index = get_subgroup_id_param_index(devinfo, prog_data);
9994   bool cross_thread_supported = devinfo->verx10 >= 75;
9995
9996   /* The thread ID should be stored in the last param dword */
9997   assert(subgroup_id_index == -1 ||
9998          subgroup_id_index == (int)prog_data->nr_params - 1);
9999
10000   unsigned cross_thread_dwords, per_thread_dwords;
10001   if (!cross_thread_supported) {
10002      cross_thread_dwords = 0u;
10003      per_thread_dwords = prog_data->nr_params;
10004   } else if (subgroup_id_index >= 0) {
10005      /* Fill all but the last register with cross-thread payload */
10006      cross_thread_dwords = 8 * (subgroup_id_index / 8);
10007      per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
10008      assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
10009   } else {
10010      /* Fill all data using cross-thread payload */
10011      cross_thread_dwords = prog_data->nr_params;
10012      per_thread_dwords = 0u;
10013   }
10014
10015   fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
10016   fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
10017
10018   assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
10019          cs_prog_data->push.per_thread.size == 0);
10020   assert(cs_prog_data->push.cross_thread.dwords +
10021          cs_prog_data->push.per_thread.dwords ==
10022             prog_data->nr_params);
10023}
10024
10025static bool
10026filter_simd(const nir_instr *instr, const void * /* options */)
10027{
10028   if (instr->type != nir_instr_type_intrinsic)
10029      return false;
10030
10031   switch (nir_instr_as_intrinsic(instr)->intrinsic) {
10032   case nir_intrinsic_load_simd_width_intel:
10033   case nir_intrinsic_load_subgroup_id:
10034      return true;
10035
10036   default:
10037      return false;
10038   }
10039}
10040
10041static nir_ssa_def *
10042lower_simd(nir_builder *b, nir_instr *instr, void *options)
10043{
10044   uintptr_t simd_width = (uintptr_t)options;
10045
10046   switch (nir_instr_as_intrinsic(instr)->intrinsic) {
10047   case nir_intrinsic_load_simd_width_intel:
10048      return nir_imm_int(b, simd_width);
10049
10050   case nir_intrinsic_load_subgroup_id:
10051      /* If the whole workgroup fits in one thread, we can lower subgroup_id
10052       * to a constant zero.
10053       */
10054      if (!b->shader->info.workgroup_size_variable) {
10055         unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
10056                                         b->shader->info.workgroup_size[1] *
10057                                         b->shader->info.workgroup_size[2];
10058         if (local_workgroup_size <= simd_width)
10059            return nir_imm_int(b, 0);
10060      }
10061      return NULL;
10062
10063   default:
10064      return NULL;
10065   }
10066}
10067
10068static void
10069brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
10070{
10071   nir_shader_lower_instructions(nir, filter_simd, lower_simd,
10072                                 (void *)(uintptr_t)dispatch_width);
10073}
10074
10075static nir_shader *
10076compile_cs_to_nir(const struct brw_compiler *compiler,
10077                  void *mem_ctx,
10078                  const struct brw_cs_prog_key *key,
10079                  const nir_shader *src_shader,
10080                  unsigned dispatch_width,
10081                  bool debug_enabled)
10082{
10083   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
10084   brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true);
10085
10086   NIR_PASS_V(shader, brw_nir_lower_simd, dispatch_width);
10087
10088   /* Clean up after the local index and ID calculations. */
10089   NIR_PASS_V(shader, nir_opt_constant_folding);
10090   NIR_PASS_V(shader, nir_opt_dce);
10091
10092   brw_postprocess_nir(shader, compiler, true, debug_enabled,
10093                       key->base.robust_buffer_access);
10094
10095   return shader;
10096}
10097
10098const unsigned *
10099brw_compile_cs(const struct brw_compiler *compiler,
10100               void *mem_ctx,
10101               struct brw_compile_cs_params *params)
10102{
10103   const nir_shader *nir = params->nir;
10104   const struct brw_cs_prog_key *key = params->key;
10105   struct brw_cs_prog_data *prog_data = params->prog_data;
10106   int shader_time_index = params->shader_time ? params->shader_time_index : -1;
10107
10108   const bool debug_enabled =
10109      INTEL_DEBUG(params->debug_flag ? params->debug_flag : DEBUG_CS);
10110
10111   prog_data->base.stage = MESA_SHADER_COMPUTE;
10112   prog_data->base.total_shared = nir->info.shared_size;
10113
10114   /* Generate code for all the possible SIMD variants. */
10115   bool generate_all;
10116
10117   unsigned min_dispatch_width;
10118   unsigned max_dispatch_width;
10119
10120   if (nir->info.workgroup_size_variable) {
10121      generate_all = true;
10122      min_dispatch_width = 8;
10123      max_dispatch_width = 32;
10124   } else {
10125      generate_all = false;
10126      prog_data->local_size[0] = nir->info.workgroup_size[0];
10127      prog_data->local_size[1] = nir->info.workgroup_size[1];
10128      prog_data->local_size[2] = nir->info.workgroup_size[2];
10129      unsigned local_workgroup_size = prog_data->local_size[0] *
10130                                      prog_data->local_size[1] *
10131                                      prog_data->local_size[2];
10132
10133      /* Limit max_threads to 64 for the GPGPU_WALKER command */
10134      const uint32_t max_threads = compiler->devinfo->max_cs_workgroup_threads;
10135      min_dispatch_width = util_next_power_of_two(
10136         MAX2(8, DIV_ROUND_UP(local_workgroup_size, max_threads)));
10137      assert(min_dispatch_width <= 32);
10138      max_dispatch_width = 32;
10139   }
10140
10141   unsigned required_dispatch_width = 0;
10142   if ((int)key->base.subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) {
10143      /* These enum values are expressly chosen to be equal to the subgroup
10144       * size that they require.
10145       */
10146      required_dispatch_width = (unsigned)key->base.subgroup_size_type;
10147   }
10148
10149   if (nir->info.cs.subgroup_size > 0) {
10150      assert(required_dispatch_width == 0 ||
10151             required_dispatch_width == nir->info.cs.subgroup_size);
10152      required_dispatch_width = nir->info.cs.subgroup_size;
10153   }
10154
10155   if (required_dispatch_width > 0) {
10156      assert(required_dispatch_width == 8 ||
10157             required_dispatch_width == 16 ||
10158             required_dispatch_width == 32);
10159      if (required_dispatch_width < min_dispatch_width ||
10160          required_dispatch_width > max_dispatch_width) {
10161         params->error_str = ralloc_strdup(mem_ctx,
10162                                           "Cannot satisfy explicit subgroup size");
10163         return NULL;
10164      }
10165      min_dispatch_width = max_dispatch_width = required_dispatch_width;
10166   }
10167
10168   assert(min_dispatch_width <= max_dispatch_width);
10169
10170   fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
10171   fs_visitor *v = NULL;
10172
10173   if (!INTEL_DEBUG(DEBUG_NO8) &&
10174       min_dispatch_width <= 8 && max_dispatch_width >= 8) {
10175      nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
10176                                           nir, 8, debug_enabled);
10177      v8 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
10178                          &prog_data->base,
10179                          nir8, 8, shader_time_index, debug_enabled);
10180      if (!v8->run_cs(true /* allow_spilling */)) {
10181         params->error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
10182         delete v8;
10183         return NULL;
10184      }
10185
10186      /* We should always be able to do SIMD32 for compute shaders */
10187      assert(v8->max_dispatch_width >= 32);
10188
10189      v = v8;
10190      prog_data->prog_mask |= 1 << 0;
10191      if (v8->spilled_any_registers)
10192         prog_data->prog_spilled |= 1 << 0;
10193      cs_fill_push_const_info(compiler->devinfo, prog_data);
10194   }
10195
10196   if (!INTEL_DEBUG(DEBUG_NO16) &&
10197       (generate_all || !prog_data->prog_spilled) &&
10198       min_dispatch_width <= 16 && max_dispatch_width >= 16) {
10199      /* Try a SIMD16 compile */
10200      nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
10201                                            nir, 16, debug_enabled);
10202      v16 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
10203                           &prog_data->base,
10204                           nir16, 16, shader_time_index, debug_enabled);
10205      if (v8)
10206         v16->import_uniforms(v8);
10207
10208      const bool allow_spilling = generate_all || v == NULL;
10209      if (!v16->run_cs(allow_spilling)) {
10210         brw_shader_perf_log(compiler, params->log_data,
10211                             "SIMD16 shader failed to compile: %s\n",
10212                             v16->fail_msg);
10213         if (!v) {
10214            assert(v8 == NULL);
10215            params->error_str = ralloc_asprintf(
10216               mem_ctx, "Not enough threads for SIMD8 and "
10217               "couldn't generate SIMD16: %s", v16->fail_msg);
10218            delete v16;
10219            return NULL;
10220         }
10221      } else {
10222         /* We should always be able to do SIMD32 for compute shaders */
10223         assert(v16->max_dispatch_width >= 32);
10224
10225         v = v16;
10226         prog_data->prog_mask |= 1 << 1;
10227         if (v16->spilled_any_registers)
10228            prog_data->prog_spilled |= 1 << 1;
10229         cs_fill_push_const_info(compiler->devinfo, prog_data);
10230      }
10231   }
10232
10233   /* The SIMD32 is only enabled for cases it is needed unless forced.
10234    *
10235    * TODO: Use performance_analysis and drop this boolean.
10236    */
10237   const bool needs_32 = v == NULL ||
10238                         INTEL_DEBUG(DEBUG_DO32) ||
10239                         generate_all;
10240
10241   if (!INTEL_DEBUG(DEBUG_NO32) &&
10242       (generate_all || !prog_data->prog_spilled) &&
10243       needs_32 &&
10244       min_dispatch_width <= 32 && max_dispatch_width >= 32) {
10245      /* Try a SIMD32 compile */
10246      nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,
10247                                            nir, 32, debug_enabled);
10248      v32 = new fs_visitor(compiler, params->log_data, mem_ctx, &key->base,
10249                           &prog_data->base,
10250                           nir32, 32, shader_time_index, debug_enabled);
10251      if (v8)
10252         v32->import_uniforms(v8);
10253      else if (v16)
10254         v32->import_uniforms(v16);
10255
10256      const bool allow_spilling = generate_all || v == NULL;
10257      if (!v32->run_cs(allow_spilling)) {
10258         brw_shader_perf_log(compiler, params->log_data,
10259                             "SIMD32 shader failed to compile: %s\n",
10260                             v32->fail_msg);
10261         if (!v) {
10262            assert(v8 == NULL);
10263            assert(v16 == NULL);
10264            params->error_str = ralloc_asprintf(
10265               mem_ctx, "Not enough threads for SIMD16 and "
10266               "couldn't generate SIMD32: %s", v32->fail_msg);
10267            delete v32;
10268            return NULL;
10269         }
10270      } else {
10271         v = v32;
10272         prog_data->prog_mask |= 1 << 2;
10273         if (v32->spilled_any_registers)
10274            prog_data->prog_spilled |= 1 << 2;
10275         cs_fill_push_const_info(compiler->devinfo, prog_data);
10276      }
10277   }
10278
10279   if (unlikely(!v) && INTEL_DEBUG(DEBUG_NO8 | DEBUG_NO16 | DEBUG_NO32)) {
10280      params->error_str =
10281         ralloc_strdup(mem_ctx,
10282                       "Cannot satisfy INTEL_DEBUG flags SIMD restrictions");
10283      return NULL;
10284   }
10285
10286   assert(v);
10287
10288   const unsigned *ret = NULL;
10289
10290   fs_generator g(compiler, params->log_data, mem_ctx, &prog_data->base,
10291                  v->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
10292   if (unlikely(debug_enabled)) {
10293      char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
10294                                   nir->info.label ?
10295                                   nir->info.label : "unnamed",
10296                                   nir->info.name);
10297      g.enable_debug(name);
10298   }
10299
10300   struct brw_compile_stats *stats = params->stats;
10301   if (generate_all) {
10302      if (prog_data->prog_mask & (1 << 0)) {
10303         assert(v8);
10304         prog_data->prog_offset[0] =
10305            g.generate_code(v8->cfg, 8, v8->shader_stats,
10306                            v8->performance_analysis.require(), stats);
10307         stats = stats ? stats + 1 : NULL;
10308      }
10309
10310      if (prog_data->prog_mask & (1 << 1)) {
10311         assert(v16);
10312         prog_data->prog_offset[1] =
10313            g.generate_code(v16->cfg, 16, v16->shader_stats,
10314                            v16->performance_analysis.require(), stats);
10315         stats = stats ? stats + 1 : NULL;
10316      }
10317
10318      if (prog_data->prog_mask & (1 << 2)) {
10319         assert(v32);
10320         prog_data->prog_offset[2] =
10321            g.generate_code(v32->cfg, 32, v32->shader_stats,
10322                            v32->performance_analysis.require(), stats);
10323         stats = stats ? stats + 1 : NULL;
10324      }
10325   } else {
10326      /* Only one dispatch width will be valid, and will be at offset 0,
10327       * which is already the default value of prog_offset_* fields.
10328       */
10329      prog_data->prog_mask = 1 << (v->dispatch_width / 16);
10330      g.generate_code(v->cfg, v->dispatch_width, v->shader_stats,
10331                      v->performance_analysis.require(), stats);
10332   }
10333
10334   g.add_const_data(nir->constant_data, nir->constant_data_size);
10335
10336   ret = g.get_assembly();
10337
10338   delete v8;
10339   delete v16;
10340   delete v32;
10341
10342   return ret;
10343}
10344
10345static unsigned
10346brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo,
10347                                const struct brw_cs_prog_data *cs_prog_data,
10348                                unsigned group_size)
10349{
10350   const unsigned mask = cs_prog_data->prog_mask;
10351   assert(mask != 0);
10352
10353   static const unsigned simd8  = 1 << 0;
10354   static const unsigned simd16 = 1 << 1;
10355   static const unsigned simd32 = 1 << 2;
10356
10357   if (INTEL_DEBUG(DEBUG_DO32) && (mask & simd32))
10358      return 32;
10359
10360   const uint32_t max_threads = devinfo->max_cs_workgroup_threads;
10361
10362   if ((mask & simd8) && group_size <= 8 * max_threads) {
10363      /* Prefer SIMD16 if can do without spilling.  Matches logic in
10364       * brw_compile_cs.
10365       */
10366      if ((mask & simd16) && (~cs_prog_data->prog_spilled & simd16))
10367         return 16;
10368      return 8;
10369   }
10370
10371   if ((mask & simd16) && group_size <= 16 * max_threads)
10372      return 16;
10373
10374   assert(mask & simd32);
10375   assert(group_size <= 32 * max_threads);
10376   return 32;
10377}
10378
10379struct brw_cs_dispatch_info
10380brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
10381                         const struct brw_cs_prog_data *prog_data,
10382                         const unsigned *override_local_size)
10383{
10384   struct brw_cs_dispatch_info info = {};
10385
10386   const unsigned *sizes =
10387      override_local_size ? override_local_size :
10388                            prog_data->local_size;
10389
10390   info.group_size = sizes[0] * sizes[1] * sizes[2];
10391   info.simd_size =
10392      brw_cs_simd_size_for_group_size(devinfo, prog_data, info.group_size);
10393   info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
10394
10395   const uint32_t remainder = info.group_size & (info.simd_size - 1);
10396   if (remainder > 0)
10397      info.right_mask = ~0u >> (32 - remainder);
10398   else
10399      info.right_mask = ~0u >> (32 - info.simd_size);
10400
10401   return info;
10402}
10403
10404static uint8_t
10405compile_single_bs(const struct brw_compiler *compiler, void *log_data,
10406                  void *mem_ctx,
10407                  const struct brw_bs_prog_key *key,
10408                  struct brw_bs_prog_data *prog_data,
10409                  nir_shader *shader,
10410                  fs_generator *g,
10411                  struct brw_compile_stats *stats,
10412                  int *prog_offset,
10413                  char **error_str)
10414{
10415   const bool debug_enabled = INTEL_DEBUG(DEBUG_RT);
10416
10417   prog_data->base.stage = shader->info.stage;
10418   prog_data->max_stack_size = MAX2(prog_data->max_stack_size,
10419                                    shader->scratch_size);
10420
10421   const unsigned max_dispatch_width = 16;
10422   brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width, true);
10423   brw_postprocess_nir(shader, compiler, true, debug_enabled,
10424                       key->base.robust_buffer_access);
10425
10426   fs_visitor *v = NULL, *v8 = NULL, *v16 = NULL;
10427   bool has_spilled = false;
10428
10429   uint8_t simd_size = 0;
10430   if (!INTEL_DEBUG(DEBUG_NO8)) {
10431      v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
10432                          &prog_data->base, shader,
10433                          8, -1 /* shader time */, debug_enabled);
10434      const bool allow_spilling = true;
10435      if (!v8->run_bs(allow_spilling)) {
10436         if (error_str)
10437            *error_str = ralloc_strdup(mem_ctx, v8->fail_msg);
10438         delete v8;
10439         return 0;
10440      } else {
10441         v = v8;
10442         simd_size = 8;
10443         if (v8->spilled_any_registers)
10444            has_spilled = true;
10445      }
10446   }
10447
10448   if (!has_spilled && !INTEL_DEBUG(DEBUG_NO16)) {
10449      v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
10450                           &prog_data->base, shader,
10451                           16, -1 /* shader time */, debug_enabled);
10452      const bool allow_spilling = (v == NULL);
10453      if (!v16->run_bs(allow_spilling)) {
10454         brw_shader_perf_log(compiler, log_data,
10455                             "SIMD16 shader failed to compile: %s\n",
10456                             v16->fail_msg);
10457         if (v == NULL) {
10458            assert(v8 == NULL);
10459            if (error_str) {
10460               *error_str = ralloc_asprintf(
10461                  mem_ctx, "SIMD8 disabled and couldn't generate SIMD16: %s",
10462                  v16->fail_msg);
10463            }
10464            delete v16;
10465            return 0;
10466         }
10467      } else {
10468         v = v16;
10469         simd_size = 16;
10470         if (v16->spilled_any_registers)
10471            has_spilled = true;
10472      }
10473   }
10474
10475   if (unlikely(v == NULL)) {
10476      assert(INTEL_DEBUG(DEBUG_NO8 | DEBUG_NO16));
10477      if (error_str) {
10478         *error_str = ralloc_strdup(mem_ctx,
10479            "Cannot satisfy INTEL_DEBUG flags SIMD restrictions");
10480      }
10481      return false;
10482   }
10483
10484   assert(v);
10485
10486   int offset = g->generate_code(v->cfg, simd_size, v->shader_stats,
10487                                 v->performance_analysis.require(), stats);
10488   if (prog_offset)
10489      *prog_offset = offset;
10490   else
10491      assert(offset == 0);
10492
10493   delete v8;
10494   delete v16;
10495
10496   return simd_size;
10497}
10498
10499uint64_t
10500brw_bsr(const struct intel_device_info *devinfo,
10501        uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
10502{
10503   assert(offset % 64 == 0);
10504   assert(simd_size == 8 || simd_size == 16);
10505   assert(local_arg_offset % 8 == 0);
10506
10507   return offset |
10508          SET_BITS(simd_size == 8, 4, 4) |
10509          SET_BITS(local_arg_offset / 8, 2, 0);
10510}
10511
10512const unsigned *
10513brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
10514               void *mem_ctx,
10515               const struct brw_bs_prog_key *key,
10516               struct brw_bs_prog_data *prog_data,
10517               nir_shader *shader,
10518               unsigned num_resume_shaders,
10519               struct nir_shader **resume_shaders,
10520               struct brw_compile_stats *stats,
10521               char **error_str)
10522{
10523   const bool debug_enabled = INTEL_DEBUG(DEBUG_RT);
10524
10525   prog_data->base.stage = shader->info.stage;
10526   prog_data->max_stack_size = 0;
10527
10528   fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
10529                  false, shader->info.stage);
10530   if (unlikely(debug_enabled)) {
10531      char *name = ralloc_asprintf(mem_ctx, "%s %s shader %s",
10532                                   shader->info.label ?
10533                                      shader->info.label : "unnamed",
10534                                   gl_shader_stage_name(shader->info.stage),
10535                                   shader->info.name);
10536      g.enable_debug(name);
10537   }
10538
10539   prog_data->simd_size =
10540      compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,
10541                        shader, &g, stats, NULL, error_str);
10542   if (prog_data->simd_size == 0)
10543      return NULL;
10544
10545   uint64_t *resume_sbt = ralloc_array(mem_ctx, uint64_t, num_resume_shaders);
10546   for (unsigned i = 0; i < num_resume_shaders; i++) {
10547      if (INTEL_DEBUG(DEBUG_RT)) {
10548         char *name = ralloc_asprintf(mem_ctx, "%s %s resume(%u) shader %s",
10549                                      shader->info.label ?
10550                                         shader->info.label : "unnamed",
10551                                      gl_shader_stage_name(shader->info.stage),
10552                                      i, shader->info.name);
10553         g.enable_debug(name);
10554      }
10555
10556      /* TODO: Figure out shader stats etc. for resume shaders */
10557      int offset = 0;
10558      uint8_t simd_size =
10559         compile_single_bs(compiler, log_data, mem_ctx, key, prog_data,
10560                           resume_shaders[i], &g, NULL, &offset, error_str);
10561      if (simd_size == 0)
10562         return NULL;
10563
10564      assert(offset > 0);
10565      resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0);
10566   }
10567
10568   /* We only have one constant data so we want to make sure they're all the
10569    * same.
10570    */
10571   for (unsigned i = 0; i < num_resume_shaders; i++) {
10572      assert(resume_shaders[i]->constant_data_size ==
10573             shader->constant_data_size);
10574      assert(memcmp(resume_shaders[i]->constant_data,
10575                    shader->constant_data,
10576                    shader->constant_data_size) == 0);
10577   }
10578
10579   g.add_const_data(shader->constant_data, shader->constant_data_size);
10580   g.add_resume_sbt(num_resume_shaders, resume_sbt);
10581
10582   return g.get_assembly();
10583}
10584
10585/**
10586 * Test the dispatch mask packing assumptions of
10587 * brw_stage_has_packed_dispatch().  Call this from e.g. the top of
10588 * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
10589 * executed with an unexpected dispatch mask.
10590 */
10591static UNUSED void
10592brw_fs_test_dispatch_packing(const fs_builder &bld)
10593{
10594   const gl_shader_stage stage = bld.shader->stage;
10595
10596   if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
10597                                     bld.shader->stage_prog_data)) {
10598      const fs_builder ubld = bld.exec_all().group(1, 0);
10599      const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
10600      const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
10601                           brw_dmask_reg());
10602
10603      ubld.ADD(tmp, mask, brw_imm_ud(1));
10604      ubld.AND(tmp, mask, tmp);
10605
10606      /* This will loop forever if the dispatch mask doesn't have the expected
10607       * form '2^n-1', in which case tmp will be non-zero.
10608       */
10609      bld.emit(BRW_OPCODE_DO);
10610      bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
10611      set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
10612   }
10613}
10614
10615unsigned
10616fs_visitor::workgroup_size() const
10617{
10618   assert(stage == MESA_SHADER_COMPUTE);
10619   const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data);
10620   return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
10621}
10622