/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
#include "util/u_math.h"

namespace brw {

vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->writes_accumulator = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->predicate = BRW_PREDICATE_NONE;
   this->predicate_inverse = false;
   this->target = 0;
   this->shadow_compare = false;
   this->eot = false;
   this->ir = NULL;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_size = 0;
   this->flag_subreg = 0;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->exec_size = 8;
   this->group = 0;
   this->size_written = (dst.file == BAD_FILE ?
                         0 : this->exec_size * type_sz(dst.type));
   this->annotation = NULL;
}

vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   inst->ir = this->base_ir;
   inst->annotation = this->current_annotation;

   this->instructions.push_tail(inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
                          vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(block, new_inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1, const src_reg &src2)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
}


vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
}

#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
   }

#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1);                 \
   }

#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
                       BRW_OPCODE_##op, dst, src0, src1);               \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(devinfo->gen >= 6);                                        \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1, src2);           \
   }
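
/* For example, ALU2(ADD) expands to:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * Note that unlike the emit() helpers above, these only construct the
 * instruction; the caller is still responsible for emit()ing it.
 */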

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
ALU1(DIM)

/** Gen4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(enum brw_predicate predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gen6 IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum brw_conditional_mod condition)
{
   assert(devinfo->gen == 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
{
   vec4_instruction *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
   inst->mlen = 2;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
   inst->mlen = 3;

   return inst;
}

src_reg
vec4_visitor::fix_3src_operand(const src_reg &src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
   return src_reg(expanded);
}
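
/* For example, a caller building a MAD whose first argument is a vec4
 * uniform passes it through fix_3src_operand(), which emits a
 * VEC4_OPCODE_UNPACK_UNIFORM copy into a fresh vec4 temporary and returns
 * that temporary, so the three-source MAD only ever sees a GRF operand.
 */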

src_reg
vec4_visitor::resolve_source_modifiers(const src_reg &src)
{
   if (!src.abs && !src.negate)
      return src;

   dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
   resolved.type = src.type;
   emit(MOV(resolved, src));

   return src_reg(resolved);
}

src_reg
vec4_visitor::fix_math_operand(const src_reg &src)
{
   if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
      return src;

   /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gen6.
    *
    * For gen7, keep the operand as-is, except if immediate, which gen7 still
    * can't use.
    */

   if (devinfo->gen == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

vec4_instruction *
vec4_visitor::emit_math(enum opcode opcode,
                        const dst_reg &dst,
                        const src_reg &src0, const src_reg &src1)
{
   vec4_instruction *math =
      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));

   if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* MATH on Gen6 must be align1, so we can't do writemasks. */
      math->dst = dst_reg(this, glsl_type::vec4_type);
      math->dst.type = dst.type;
      math = emit(MOV(dst, src_reg(math->dst)));
   } else if (devinfo->gen < 6) {
      math->base_mrf = 1;
      math->mlen = src1.file == BAD_FILE ? 1 : 2;
   }

   return math;
}

void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->gen < 7) {
      unreachable("ir_unop_pack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride.  We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests.  However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z          y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
   emit(SHL(dst, tmp_src, brw_imm_ud(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
   emit(OR(dst, src_reg(dst), tmp_src));
}
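
/* Worked example: packHalf2x16(vec2(1.0, -2.0)).  f32to16 leaves
 * tmp.x = 0x00003c00 (half(1.0)) and tmp.y = 0x0000c000 (half(-2.0));
 * the SHL of tmp.y produces 0xc0000000 in dst, and the final OR with tmp.x
 * yields the packed result 0xc0003c00.
 */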

void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->gen < 7) {
      unreachable("ir_unop_unpack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gen7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}
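
/* Worked example: unpackHalf2x16(0xc0003c00u).  The AND extracts the low
 * word (0x3c00) into tmp.x, the SHR moves the high word (0xc000) into
 * tmp.y, and f16to32 converts them to the floats (1.0, -2.0).
 */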

void
vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_UB;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
}
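
/* The VF immediate above encodes <0.0, 8.0, 16.0, 24.0> as the restricted
 * 8-bit floats 0x00, 0x60, 0x70 and 0x78; the type-converting MOV turns them
 * into the integer shift counts <0, 8, 16, 24>.
 *
 * Worked example: unpackUnorm4x8(0x80ff3300u) shifts out the bytes
 * <0x00, 0x33, 0xff, 0x80>, converts them to floats, and scales by 1/255,
 * giving approximately (0.0, 0.2, 1.0, 0.502).
 */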

void
vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_B;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));

   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
}

void
vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg saturated(this, glsl_type::vec4_type);
   vec4_instruction *inst = emit(MOV(saturated, src0));
   inst->saturate = true;

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg u(this, glsl_type::uvec4_type);
   emit(MOV(u, src_reg(rounded)));

   src_reg bytes(u);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

void
vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));

   dst_reg min(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg i(this, glsl_type::ivec4_type);
   emit(MOV(i, src_reg(rounded)));

   src_reg bytes(i);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}
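
/* Worked example: packSnorm4x8(vec4(1.0, -1.0, 0.5, 2.0)).  Clamping to
 * [-1, 1] turns the 2.0 into 1.0; scaling by 127 and round-to-even gives
 * <127, -127, 64, 127>; and VEC4_OPCODE_PACK_BYTES packs the low bytes into
 * 0x7f40817f (x in the least significant byte).
 */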

/*
 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 * false) elements needed to pack a type.
 */
static int
type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_FLOAT16:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_DOUBLE:
   case GLSL_TYPE_UINT16:
   case GLSL_TYPE_INT16:
   case GLSL_TYPE_UINT8:
   case GLSL_TYPE_INT8:
   case GLSL_TYPE_UINT64:
   case GLSL_TYPE_INT64:
      if (type->is_matrix()) {
         const glsl_type *col_type = type->column_type();
         unsigned col_slots =
            (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
         return type->matrix_columns * col_slots;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess.  Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size_xvec4(type->fields.array, as_vec4, bindless) *
             type->length;
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_INTERFACE:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
                                 bindless);
      }
      return size;
   case GLSL_TYPE_SUBROUTINE:
      return 1;

   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return bindless ? 1 : 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
      return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_FUNCTION:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Returns the minimum number of vec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single vec4); for matrices, the
 * number of columns; for array and struct, the sum of the vec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 */
extern "C" int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return type_size_xvec4(type, true, bindless);
}
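
/* For example, type_size_vec4() returns 1 for a float or a vec4, 4 for a
 * mat4 or a vec2[4] array, and 2 for a dvec3, which is a dual-slot type
 * needing two vec4 slots.
 */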

/**
 * Returns the minimum number of dvec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single dvec4); for matrices, the
 * number of columns; for array and struct, the sum of the dvec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 *
 * Measuring double-precision vertex inputs as dvec4 is required because
 * ARB_vertex_attrib_64bit states that these use the same number of locations
 * as the single-precision version. That is, two consecutive dvec4s would be
 * located in locations "x" and "x+1", not "x" and "x+2".
 *
 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
 * remap_vs_attrs() takes into account both the location and whether the type
 * fits in one or two vec4 slots.
 */
extern "C" int
type_size_dvec4(const struct glsl_type *type, bool bindless)
{
   return type_size_xvec4(type, false, bindless);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false));

   if (type->is_array() || type->is_struct()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
{
   assert(size > 0);

   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);

   this->swizzle = BRW_SWIZZLE_NOOP;

   this->type = brw_type_for_base_type(type);
}

dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false));

   if (type->is_array() || type->is_struct()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}

vec4_instruction *
vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
   inst->conditional_mod = conditionalmod;
   return inst;
}

vec4_instruction *
vec4_visitor::emit_lrp(const dst_reg &dst,
                       const src_reg &x, const src_reg &y, const src_reg &a)
{
   if (devinfo->gen >= 6 && devinfo->gen <= 10) {
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
                      fix_3src_operand(x)));
   } else {
      /* Earlier generations don't support three source operations, so we
       * need to emit x*(1-a) + y*a.
       */
      dst_reg y_times_a           = dst_reg(this, glsl_type::vec4_type);
      dst_reg one_minus_a         = dst_reg(this, glsl_type::vec4_type);
      dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
      y_times_a.writemask           = dst.writemask;
      one_minus_a.writemask         = dst.writemask;
      x_times_one_minus_a.writemask = dst.writemask;

      emit(MUL(y_times_a, y, a));
      emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
      emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
      return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
   }
}
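
/* E.g., lrp(2.0, 4.0, 0.25) = 2.0 * 0.75 + 4.0 * 0.25 = 2.5; on gen6+ this
 * becomes a single LRP instruction with the arguments emitted as (a, y, x).
 */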

/**
 * Emits the instructions needed to perform a pull constant load.
 * before_block and before_inst can be NULL, in which case the instruction
 * will be appended to the end of the instruction list.
 */
void
vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                          src_reg surf_index,
                                          src_reg offset_reg,
                                          bblock_t *before_block,
                                          vec4_instruction *before_inst)
{
   assert((before_inst == NULL && before_block == NULL) ||
          (before_inst && before_block));

   vec4_instruction *pull;

   if (devinfo->gen >= 9) {
      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
      src_reg header(this, glsl_type::uvec4_type, 2);

      pull = new(mem_ctx)
         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
                          dst_reg(header));

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
                                 offset_reg.type);
      pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           dst,
                                           surf_index,
                                           header);
      pull->mlen = 2;
      pull->header_size = 1;
   } else if (devinfo->gen >= 7) {
      dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);

      grf_offset.type = offset_reg.type;

      pull = MOV(grf_offset, offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           dst,
                                           surf_index,
                                           src_reg(grf_offset));
      pull->mlen = 1;
   } else {
      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
                                           dst,
                                           surf_index,
                                           offset_reg);
      pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
      pull->mlen = 1;
   }

   if (before_inst)
      emit_before(before_block, before_inst, pull);
   else
      emit(pull);
}

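/**
 * Pick the index of an arbitrary live channel
 * (SHADER_OPCODE_FIND_LIVE_CHANNEL) and broadcast that channel's value of
 * \p src to every channel of the result (SHADER_OPCODE_BROADCAST), yielding
 * a value that is uniform across the dispatched channels.
 */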
src_reg
vec4_visitor::emit_uniformize(const src_reg &src)
{
   const src_reg chan_index(this, glsl_type::uint_type);
   const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
                              src.type);

   emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
      ->force_writemask_all = true;
   emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
      ->force_writemask_all = true;

   return src_reg(dst);
}

src_reg
vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
                             src_reg coordinate, src_reg surface)
{
   vec4_instruction *inst =
      new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
                                    dst_reg(this, glsl_type::uvec4_type));
   inst->base_mrf = 2;
   inst->src[1] = surface;
   inst->src[2] = brw_imm_ud(0); /* sampler */

   int param_base;

   if (devinfo->gen >= 9) {
      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
      vec4_instruction *header_inst = new(mem_ctx)
         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
                          dst_reg(MRF, inst->base_mrf));

      emit(header_inst);

      inst->mlen = 2;
      inst->header_size = 1;
      param_base = inst->base_mrf + 1;
   } else {
      inst->mlen = 1;
      param_base = inst->base_mrf;
   }

   /* parameters are: u, v, r, lod; lod will always be zero due to api
    * restrictions
    */
   int coord_mask = (1 << coordinate_type->vector_elements) - 1;
   int zero_mask = 0xf & ~coord_mask;

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
            coordinate));

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
            brw_imm_d(0)));

   emit(inst);
   return src_reg(inst->dst);
}

bool
vec4_visitor::is_high_sampler(src_reg sampler)
{
   if (devinfo->gen < 8 && !devinfo->is_haswell)
      return false;

   return sampler.file != IMM || sampler.ud >= 16;
}

void
vec4_visitor::emit_texture(ir_texture_opcode op,
                           dst_reg dest,
                           const glsl_type *dest_type,
                           src_reg coordinate,
                           int coord_components,
                           src_reg shadow_comparator,
                           src_reg lod, src_reg lod2,
                           src_reg sample_index,
                           uint32_t constant_offset,
                           src_reg offset_value,
                           src_reg mcs,
                           uint32_t surface,
                           src_reg surface_reg,
                           src_reg sampler_reg)
{
   enum opcode opcode;
   switch (op) {
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
                             SHADER_OPCODE_TXF_CMS); break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = offset_value.file != BAD_FILE
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   case ir_samples_identical: {
      /* There are some challenges implementing this for vec4, and it seems
       * unlikely to be used anyway.  For now, just return false always.
       */
      emit(MOV(dest, brw_imm_ud(0u)));
      return;
   }
   default:
      unreachable("Unrecognized tex op");
   }

   vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);

   inst->offset = constant_offset;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Gen9+ for selecting SIMD4x2
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
    */
   inst->header_size =
      (devinfo->gen < 5 || devinfo->gen >= 9 ||
       inst->offset != 0 || op == ir_tg4 ||
       op == ir_texture_samples ||
       is_high_sampler(sampler_reg)) ? 1 : 0;
   inst->base_mrf = 2;
   inst->mlen = inst->header_size;
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = shadow_comparator.file != BAD_FILE;

   inst->src[1] = surface_reg;
   inst->src[2] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_size;

   if (op == ir_txs || op == ir_query_levels) {
      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
      inst->mlen++;
   } else if (op == ir_texture_samples) {
      inst->dst.writemask = WRITEMASK_X;
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << coord_components) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
               coordinate));
      inst->mlen++;

      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                  brw_imm_d(0)));
      }
      /* Load the shadow comparator */
      if (shadow_comparator.file != BAD_FILE && op != ir_txd &&
          (op != ir_tg4 || offset_value.file == BAD_FILE)) {
         emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
                          WRITEMASK_X),
                  shadow_comparator));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (op == ir_tex || op == ir_txl) {
         int mrf, writemask;
         if (devinfo->gen >= 5) {
            mrf = param_base + 1;
            if (shadow_comparator.file != BAD_FILE) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* devinfo->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
      } else if (op == ir_txf) {
         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
      } else if (op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                  sample_index));
         if (opcode == SHADER_OPCODE_TXF_CMS_W) {
            /* MCS data is stored in the first two channels of 'mcs', but we
             * need to get it into the .y and .z channels of the second vec4
             * of params.
             */
            mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
            emit(MOV(dst_reg(MRF, param_base + 1,
                             glsl_type::uint_type, WRITEMASK_YZ),
                     mcs));
         } else if (devinfo->gen >= 7) {
            /* MCS data is in the first channel of 'mcs', but we need to get
             * it into the .y channel of the second vec4 of params, so
             * replicate .x across the whole vec4 and then mask off everything
             * except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type,
                             WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (op == ir_txd) {
         const brw_reg_type type = lod.type;

         if (devinfo->gen >= 5) {
            lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
            inst->mlen++;

            if (dest_type->vector_elements == 3 ||
                shadow_comparator.file != BAD_FILE) {
               lod.swizzle = BRW_SWIZZLE_ZZZZ;
               lod2.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
               inst->mlen++;

               if (shadow_comparator.file != BAD_FILE) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   shadow_comparator.type, WRITEMASK_Z),
                           shadow_comparator));
               }
            }
         } else /* devinfo->gen == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
            inst->mlen += 2;
         }
      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
         if (shadow_comparator.file != BAD_FILE) {
            emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type,
                             WRITEMASK_W),
                     shadow_comparator));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type,
                          WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (op == ir_txs && devinfo->gen < 7) {
      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
      emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
                  src_reg(inst->dst), brw_imm_d(1));
   }

   if (devinfo->gen == 6 && op == ir_tg4) {
      emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
   }

   if (op == ir_query_levels) {
      /* # levels is in .w */
      src_reg swizzled(dest);
      swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
                                      SWIZZLE_W, SWIZZLE_W);
      emit(MOV(dest, swizzled));
   }
}

/**
 * Apply workarounds for Gen6 gather with UINT/SINT
 */
void
vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
{
   if (!wa)
      return;

   int width = (wa & WA_8BIT) ? 8 : 16;
   dst_reg dst_f = dst;
   dst_f.type = BRW_REGISTER_TYPE_F;

   /* Convert from UNORM to UINT */
   emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
   emit(MOV(dst, src_reg(dst_f)));

   if (wa & WA_SIGN) {
      /* Reinterpret the UINT value as a signed INT value by
       * shifting the sign bit into place, then shifting back
       * preserving sign.
       */
      emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
      emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
   }
}
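
/* Worked example for an 8-bit surface with WA_8BIT | WA_SIGN: a texel of
 * 0xff comes back from the gather as the UNORM value 1.0.  The MUL by 255
 * recovers 255.0, the MOV converts it to the integer 255, and the SHL/ASR
 * pair by 24 bits sign-extends it to -1, the value a true SINT fetch would
 * have returned.
 */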

void
vec4_visitor::gs_emit_vertex(int /* stream_id */)
{
   unreachable("not reached");
}

void
vec4_visitor::gs_end_primitive()
{
   unreachable("not reached");
}

void
vec4_visitor::emit_ndc_computation()
{
   if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
      return;

   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
   output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}

void
vec4_visitor::emit_psiz_and_flags(dst_reg reg)
{
   if (devinfo->gen < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
        devinfo->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, brw_imm_ud(0u)));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);

         emit(CMP(dst_null_f(),
                  src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]),
                  brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
      }

      if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
         emit(CMP(dst_null_f(),
                  src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]),
                  brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
         emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (devinfo->has_negative_rhw_bug &&
          output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (devinfo->gen < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
      if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
         dst_reg reg_w = reg;
         reg_w.writemask = WRITEMASK_W;
         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
         reg_as_src.type = reg_w.type;
         reg_as_src.swizzle = brw_swizzle_for_size(1);
         emit(MOV(reg_w, reg_as_src));
      }
      if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
         dst_reg reg_y = reg;
         reg_y.writemask = WRITEMASK_Y;
         reg_y.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
      }
      if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
         dst_reg reg_z = reg;
         reg_z.writemask = WRITEMASK_Z;
         reg_z.type = BRW_REGISTER_TYPE_D;
         output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
      }
   }
}

vec4_instruction *
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
{
   assert(varying < VARYING_SLOT_MAX);

   unsigned num_comps = output_num_components[varying][component];
   if (num_comps == 0)
      return NULL;

   assert(output_reg[varying][component].type == reg.type);
   current_annotation = output_reg_annotation[varying];
   if (output_reg[varying][component].file != BAD_FILE) {
      src_reg src = src_reg(output_reg[varying][component]);
      src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
      reg.writemask =
         brw_writemask_for_component_packing(num_comps, component);
      return emit(MOV(reg, src));
   }
   return NULL;
}

void
vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
{
   reg.type = BRW_REGISTER_TYPE_F;
   output_reg[varying][0].type = reg.type;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
   {
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(reg);
      break;
   }
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
      break;
   case VARYING_SLOT_EDGE: {
      /* This is present when doing unfilled polygons.  We're supposed to copy
       * the edge flag from the user-provided vertex array
       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
       * of that attribute (starts as 1.0f).  This is then used in clipping to
       * determine which edges should be drawn as wireframe.
       */
      current_annotation = "edge flag";
      int edge_attr = util_bitcount64(nir->info.inputs_read &
                                        BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
      emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
                                    glsl_type::float_type, WRITEMASK_XYZW))));
      break;
   }
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      for (int i = 0; i < 4; i++) {
         emit_generic_urb_slot(reg, varying, i);
      }
      break;
   }
}

static unsigned
align_interleaved_urb_mlen(const struct gen_device_info *devinfo, unsigned mlen)
{
   if (devinfo->gen >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}
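
/* For example, a write of a header plus four slot registers arrives here as
 * mlen = 5 and stays 5 (four data registers already align to 256 bits),
 * while a header plus five slots arrives as mlen = 6 and is padded to 7.
 * mlen includes the message header register, which is why the test keeps
 * mlen odd rather than even.
 */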


/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gen6's
    * requirements for length alignment.
    */
   assert((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   if (devinfo->gen < 6) {
      emit_ndc_computation();
   }

   /* We may need to split this up into several URB writes, so do them in a
    * loop.
    */
   int slot = 0;
   bool complete = false;
   do {
      /* URB offset is in URB row increments, and each of our MRFs is half of
       * one of those, since we're doing interleaved writes.
       */
      int offset = slot / 2;

      mrf = base_mrf + 1;
      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         emit_urb_slot(dst_reg(MRF, mrf++),
                       prog_data->vue_map.slot_to_varying[slot]);

         /* If this was max_usable_mrf, we can't fit anything more into this
          * URB WRITE. Same thing if we reached the maximum length available.
          */
         if (mrf > max_usable_mrf ||
             align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) >
             BRW_MAX_MSG_LENGTH) {
            slot++;
            break;
         }
      }

      complete = slot >= prog_data->vue_map.num_slots;
      current_annotation = "URB write";
      vec4_instruction *inst = emit_urb_write_opcode(complete);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
      inst->offset += offset;
   } while (!complete);
}


src_reg
vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gen6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (devinfo->gen < 6)
      message_header_scale *= 16;

   if (reladdr) {
      /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
       * to multiply the reladdr by 2. Notice that the reg_offset part
       * is in units of 16 bytes and is used to select the low/high 16-byte
       * chunk of a full dvec4, so we don't want to multiply that part.
       */
      src_reg index = src_reg(this, glsl_type::int_type);
      if (type_sz(inst->dst.type) < 8) {
         emit_before(block, inst, ADD(dst_reg(index), *reladdr,
                                      brw_imm_d(reg_offset)));
         emit_before(block, inst, MUL(dst_reg(index), index,
                                      brw_imm_d(message_header_scale)));
      } else {
         emit_before(block, inst, MUL(dst_reg(index), *reladdr,
                                      brw_imm_d(message_header_scale * 2)));
         emit_before(block, inst, ADD(dst_reg(index), index,
                                      brw_imm_d(reg_offset *
                                                message_header_scale)));
      }
      return index;
   } else {
      return brw_imm_d(reg_offset * message_header_scale);
   }
}
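
/* For example, a non-reladdr access to vec4 slot 3 yields the immediate
 * offset 6 on gen6+ (vec4 slots scaled by 2 for the interleaved layout),
 * and 96 pre-gen6, where the header wants the same offset expressed in
 * bytes (3 * 2 * 16).
 */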

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   assert(orig_src.offset % REG_SIZE == 0);
   int reg_offset = base_offset + orig_src.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
                                      reg_offset);

   if (type_sz(orig_src.type) < 8) {
      emit_before(block, inst, SCRATCH_READ(temp, index));
   } else {
      dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
      dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
      emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
      index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
      vec4_instruction *last_read =
         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
      emit_before(block, inst, last_read);
      shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
   }
}
1501
1502/**
1503 * Emits an instruction after @inst to store the value to be written
1504 * to @orig_dst to scratch space at @base_offset, from @temp.
1505 *
1506 * @base_offset is measured in 32-byte units (the size of a register).
1507 */
1508void
1509vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1510                                 int base_offset)
1511{
1512   assert(inst->dst.offset % REG_SIZE == 0);
1513   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1514   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1515                                      reg_offset);
1516
1517   /* Create a temporary register to store *inst's result in.
1518    *
1519    * We have to be careful in MOVing from our temporary result register in
1520    * the scratch write.  If we swizzle from channels of the temporary that
1521    * weren't initialized, it will confuse live interval analysis, which will
1522    * make spilling fail to make progress.
1523    */
1524   bool is_64bit = type_sz(inst->dst.type) == 8;
1525   const glsl_type *alloc_type =
1526      is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1527   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1528                                       inst->dst.type),
1529                                brw_swizzle_for_mask(inst->dst.writemask));
1530
1531   if (!is_64bit) {
1532      dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1533				          inst->dst.writemask));
1534      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1535      if (inst->opcode != BRW_OPCODE_SEL)
1536         write->predicate = inst->predicate;
1537      write->ir = inst->ir;
1538      write->annotation = inst->annotation;
1539      inst->insert_after(block, write);
1540   } else {
1541      dst_reg shuffled = dst_reg(this, alloc_type);
1542      vec4_instruction *last =
1543         shuffle_64bit_data(shuffled, temp, true, block, inst);
1544      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1545
      uint8_t mask = 0;
      if (inst->dst.writemask & WRITEMASK_X)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_Y)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }

      mask = 0;
      if (inst->dst.writemask & WRITEMASK_Z)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_W)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                            reg_offset + 1);
         vec4_instruction *write =
            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }
   }

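   /* Finally, rewrite *inst to write into the temporary; the scratch
    * write(s) emitted above copy its value out to scratch.
    */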
   inst->dst.file = temp.file;
   inst->dst.nr = temp.nr;
   inst->dst.offset %= REG_SIZE;
   inst->dst.reladdr = NULL;
}

/**
 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
 * adds the scratch read(s) before \p inst. The function also checks for
 * recursive reladdr scratch accesses, issuing the corresponding scratch
 * loads and rewriting reladdr references accordingly.
 *
 * \return \p src if it did not require a scratch load, otherwise, the
 * register holding the result of the scratch load that the caller should
 * use to rewrite src.
 */
src_reg
vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                   vec4_instruction *inst, src_reg src)
{
   /* Resolve recursive reladdr scratch access by calling ourselves
    * with src.reladdr
    */
   if (src.reladdr)
      *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                          *src.reladdr);

   /* Now handle scratch access on src */
   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
      dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
         glsl_type::dvec4_type : glsl_type::vec4_type);
      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
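      /* Rewrite src to point at the temporary that now holds the value;
       * only the sub-register offset survives, and any reladdr has been
       * folded into the scratch read above.
       */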
      src.nr = temp.nr;
      src.offset %= REG_SIZE;
      src.reladdr = NULL;
   }

   return src;
}

/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers.  So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->alloc.count];
   memset(scratch_loc, -1, sizeof(scratch_loc));

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->dst.reladdr) {
         if (scratch_loc[inst->dst.nr] == -1) {
            scratch_loc[inst->dst.nr] = last_scratch;
            last_scratch += this->alloc.sizes[inst->dst.nr];
         }

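         /* The address registers themselves may also be accessed
          * indirectly, so walk the reladdr chain and send any VGRF that is
          * itself indirectly addressed to scratch as well.
          */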
         for (src_reg *iter = inst->dst.reladdr;
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }

      for (int i = 0; i < 3; i++) {
         for (src_reg *iter = &inst->src[i];
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* Set up the annotation tracking for newly generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      /* First handle scratch access on the dst. Notice we have to handle
       * the case where the dst's reladdr also points to scratch space.
       */
      if (inst->dst.reladdr)
         *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                                   *inst->dst.reladdr);

      /* Now that we have handled any (possibly recursive) reladdr scratch
       * accesses for dst we can safely do the scratch write for dst itself.
       */
      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);

      /* Now handle scratch access on any src. In this case, since inst->src[i]
       * already is a src_reg, we can just call emit_resolve_reladdr with
       * inst->src[i] and it will take care of handling scratch loads for
       * both src and src.reladdr (recursively).
       */
      for (int i = 0; i < 3; i++) {
         inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
                                             inst->src[i]);
      }
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
                                      dst_reg temp, src_reg orig_src,
                                      int base_offset, src_reg indirect)
{
   assert(orig_src.offset % 16 == 0);
   const unsigned index = prog_data->base.binding_table.pull_constants_start;

   /* For 64-bit loads we need to emit two 32-bit load messages, and we also
    * need to shuffle the 32-bit data result into proper 64-bit data. To do
    * that we emit the 32-bit loads into a temporary and shuffle the result
    * into the original destination.
    */
   dst_reg orig_temp = temp;
   bool is_64bit = type_sz(orig_src.type) == 8;
   if (is_64bit) {
      assert(type_sz(temp.type) == 8);
      dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
      temp = retype(temp_df, BRW_REGISTER_TYPE_F);
   }

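   /* Each load message fetches one 16-byte vec4 slot, so a 64-bit source
    * takes two loads to gather the whole dvec4 before the shuffle at the
    * end.
    */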
   src_reg src = orig_src;
   for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
      int reg_offset = base_offset + src.offset / 16;

      src_reg offset;
      if (indirect.file != BAD_FILE) {
         offset = src_reg(this, glsl_type::uint_type);
         emit_before(block, inst, ADD(dst_reg(offset), indirect,
                                      brw_imm_ud(reg_offset * 16)));
      } else if (devinfo->gen >= 8) {
         /* Store the offset in a GRF so we can send-from-GRF. */
         offset = src_reg(this, glsl_type::uint_type);
         emit_before(block, inst, MOV(dst_reg(offset),
                                      brw_imm_ud(reg_offset * 16)));
      } else {
         offset = brw_imm_d(reg_offset * 16);
      }

      emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
                                  brw_imm_ud(index),
                                  offset,
                                  block, inst);

      src = byte_offset(src, 16);
   }

   if (is_64bit) {
      temp = retype(temp, BRW_REGISTER_TYPE_DF);
      shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
   }
}

/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   /* The Vulkan driver doesn't support pull constants other than UBOs, so
    * everything has to be pushed regardless.
    */
   if (!compiler->supports_pull_constants) {
      split_uniform_registers();
      return;
   }

   /* Allocate the pull_params array */
   assert(stage_prog_data->nr_pull_params == 0);
   stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
                                              this->uniforms * 4);

   int pull_constant_loc[this->uniforms];
   memset(pull_constant_loc, -1, sizeof(pull_constant_loc));

   /* First, walk through the instructions and determine which things need to
    * be pulled.  We mark something as needing to be pulled by setting
    * pull_constant_loc to 0.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We only care about MOV_INDIRECT of a uniform */
      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
          inst->src[0].file != UNIFORM)
         continue;

      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

      for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
         pull_constant_loc[uniform_nr + j] = 0;
   }

   /* Next, we walk the list of uniforms and assign real pull constant
    * locations and set their corresponding entries in pull_param.
    */
   for (int j = 0; j < this->uniforms; j++) {
      if (pull_constant_loc[j] < 0)
         continue;

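      /* pull_param entries are dwords while pull constant locations are
       * handed out in vec4 units, hence the division by 4.
       */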
      pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;

      for (int i = 0; i < 4; i++) {
         stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
            = stage_prog_data->param[j * 4 + i];
      }
   }

   /* Finally, we can walk through the instructions and lower MOV_INDIRECT
    * instructions to actual uniform pulls.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* We only care about MOV_INDIRECT of a uniform */
      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
          inst->src[0].file != UNIFORM)
         continue;

      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

      assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);

      emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
                              pull_constant_loc[uniform_nr], inst->src[1]);
      inst->remove(block);
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects.  This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}

void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

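   /* Materialize the negation with an explicit MOV: the negated source is
    * copied into a temporary and the caller's reg is rewritten to read the
    * plain (already negated) temporary instead.
    */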
   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}

vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                           void *log_data,
                           const struct brw_sampler_prog_key_data *key_tex,
                           struct brw_vue_prog_data *prog_data,
                           const nir_shader *shader,
                           void *mem_ctx,
                           bool no_spills,
                           int shader_time_index)
   : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
     key_tex(key_tex),
     prog_data(prog_data),
     fail_msg(NULL),
     first_non_payload_grf(0),
     need_all_constants_in_pull_buffer(false),
     no_spills(no_spills),
     shader_time_index(shader_time_index),
     last_scratch(0)
{
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   memset(this->output_num_components, 0, sizeof(this->output_num_components));

   this->virtual_grf_start = NULL;
   this->virtual_grf_end = NULL;
   this->live_intervals = NULL;

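   /* Gen7+ has no MRF file; the top of the GRF space stands in for it, so
    * cap the allocatable GRFs below that region.
    */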
   this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;

   this->nir_locals = NULL;
   this->nir_ssa_values = NULL;
}


void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (debug_enabled) {
      fprintf(stderr, "%s", msg);
   }
}

} /* namespace brw */