1/*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24/**
25 * @file brw_vec4_copy_propagation.cpp
26 *
27 * Implements tracking of values copied between registers, and
28 * optimizations based on that: copy propagation and constant
29 * propagation.
30 */
31
32#include "brw_vec4.h"
33#include "brw_cfg.h"
34#include "brw_eu.h"
35
36namespace brw {
37
38struct copy_entry {
39   src_reg *value[4];
40   int saturatemask;
41};
42
43static bool
44is_direct_copy(vec4_instruction *inst)
45{
46   return (inst->opcode == BRW_OPCODE_MOV &&
47	   !inst->predicate &&
48	   inst->dst.file == VGRF &&
49	   inst->dst.offset % REG_SIZE == 0 &&
50	   !inst->dst.reladdr &&
51	   !inst->src[0].reladdr &&
52	   (inst->dst.type == inst->src[0].type ||
53            (inst->dst.type == BRW_REGISTER_TYPE_F &&
54             inst->src[0].type == BRW_REGISTER_TYPE_VF)));
55}
56
57static bool
58is_dominated_by_previous_instruction(vec4_instruction *inst)
59{
60   return (inst->opcode != BRW_OPCODE_DO &&
61	   inst->opcode != BRW_OPCODE_WHILE &&
62	   inst->opcode != BRW_OPCODE_ELSE &&
63	   inst->opcode != BRW_OPCODE_ENDIF);
64}
65
66static bool
67is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch)
68{
69   const src_reg *src = values[ch];
70
71   /* consider GRF only */
72   assert(inst->dst.file == VGRF);
73   if (!src || src->file != VGRF)
74      return false;
75
76   return regions_overlap(*src, REG_SIZE, inst->dst, inst->size_written) &&
77          (inst->dst.offset != src->offset ||
78           inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch)));
79}
80
81/**
82 * Get the origin of a copy as a single register if all components present in
83 * the given readmask originate from the same register and have compatible
84 * regions, otherwise return a BAD_FILE register.
85 */
86static src_reg
87get_copy_value(const copy_entry &entry, unsigned readmask)
88{
89   unsigned swz[4] = {};
90   src_reg value;
91
92   for (unsigned i = 0; i < 4; i++) {
93      if (readmask & (1 << i)) {
94         if (entry.value[i]) {
95            src_reg src = *entry.value[i];
96
97            if (src.file == IMM) {
98               swz[i] = i;
99            } else {
100               swz[i] = BRW_GET_SWZ(src.swizzle, i);
101               /* Overwrite the original swizzle so the src_reg::equals call
102                * below doesn't care about it, the correct swizzle will be
103                * calculated once the swizzles of all components are known.
104                */
105               src.swizzle = BRW_SWIZZLE_XYZW;
106            }
107
108            if (value.file == BAD_FILE) {
109               value = src;
110            } else if (!value.equals(src)) {
111               return src_reg();
112            }
113         } else {
114            return src_reg();
115         }
116      }
117   }
118
119   return swizzle(value,
120                  brw_compose_swizzle(brw_swizzle_for_mask(readmask),
121                                      BRW_SWIZZLE4(swz[0], swz[1],
122                                                   swz[2], swz[3])));
123}
124
125static bool
126try_constant_propagate(vec4_instruction *inst,
127                       int arg, const copy_entry *entry)
128{
129   /* For constant propagation, we only handle the same constant
130    * across all 4 channels.  Some day, we should handle the 8-bit
131    * float vector format, which would let us constant propagate
132    * vectors better.
133    * We could be more aggressive here -- some channels might not get used
134    * based on the destination writemask.
135    */
136   src_reg value =
137      get_copy_value(*entry,
138                     brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
139                                                   WRITEMASK_XYZW));
140
141   if (value.file != IMM)
142      return false;
143
144   /* 64-bit types can't be used except for one-source instructions, which
145    * higher levels should have constant folded away, so there's no point in
146    * propagating immediates here.
147    */
148   if (type_sz(value.type) == 8 || type_sz(inst->src[arg].type) == 8)
149      return false;
150
151   if (value.type == BRW_REGISTER_TYPE_VF) {
152      /* The result of bit-casting the component values of a vector float
153       * cannot in general be represented as an immediate.
154       */
155      if (inst->src[arg].type != BRW_REGISTER_TYPE_F)
156         return false;
157   } else {
158      value.type = inst->src[arg].type;
159   }
160
161   if (inst->src[arg].abs) {
162      if (!brw_abs_immediate(value.type, &value.as_brw_reg()))
163         return false;
164   }
165
166   if (inst->src[arg].negate) {
167      if (!brw_negate_immediate(value.type, &value.as_brw_reg()))
168         return false;
169   }
170
171   value = swizzle(value, inst->src[arg].swizzle);
172
173   switch (inst->opcode) {
174   case BRW_OPCODE_MOV:
175   case SHADER_OPCODE_BROADCAST:
176      inst->src[arg] = value;
177      return true;
178
179   case VEC4_OPCODE_UNTYPED_ATOMIC:
180      if (arg == 1) {
181         inst->src[arg] = value;
182         return true;
183      }
184      break;
185
186   case SHADER_OPCODE_POW:
187   case SHADER_OPCODE_INT_QUOTIENT:
188   case SHADER_OPCODE_INT_REMAINDER:
189         break;
190   case BRW_OPCODE_DP2:
191   case BRW_OPCODE_DP3:
192   case BRW_OPCODE_DP4:
193   case BRW_OPCODE_DPH:
194   case BRW_OPCODE_BFI1:
195   case BRW_OPCODE_ASR:
196   case BRW_OPCODE_SHL:
197   case BRW_OPCODE_SHR:
198   case BRW_OPCODE_SUBB:
199      if (arg == 1) {
200         inst->src[arg] = value;
201         return true;
202      }
203      break;
204
205   case BRW_OPCODE_MACH:
206   case BRW_OPCODE_MUL:
207   case SHADER_OPCODE_MULH:
208   case BRW_OPCODE_ADD:
209   case BRW_OPCODE_OR:
210   case BRW_OPCODE_AND:
211   case BRW_OPCODE_XOR:
212   case BRW_OPCODE_ADDC:
213      if (arg == 1) {
214	 inst->src[arg] = value;
215	 return true;
216      } else if (arg == 0 && inst->src[1].file != IMM) {
217	 /* Fit this constant in by commuting the operands.  Exception: we
218	  * can't do this for 32-bit integer MUL/MACH because it's asymmetric.
219	  */
220	 if ((inst->opcode == BRW_OPCODE_MUL ||
221              inst->opcode == BRW_OPCODE_MACH) &&
222	     (inst->src[1].type == BRW_REGISTER_TYPE_D ||
223	      inst->src[1].type == BRW_REGISTER_TYPE_UD))
224	    break;
225	 inst->src[0] = inst->src[1];
226	 inst->src[1] = value;
227	 return true;
228      }
229      break;
230   case GS_OPCODE_SET_WRITE_OFFSET:
231      /* This is just a multiply by a constant with special strides.
232       * The generator will handle immediates in both arguments (generating
233       * a single MOV of the product).  So feel free to propagate in src0.
234       */
235      inst->src[arg] = value;
236      return true;
237
238   case BRW_OPCODE_CMP:
239      if (arg == 1) {
240	 inst->src[arg] = value;
241	 return true;
242      } else if (arg == 0 && inst->src[1].file != IMM) {
243	 enum brw_conditional_mod new_cmod;
244
245	 new_cmod = brw_swap_cmod(inst->conditional_mod);
246	 if (new_cmod != BRW_CONDITIONAL_NONE) {
247	    /* Fit this constant in by swapping the operands and
248	     * flipping the test.
249	     */
250	    inst->src[0] = inst->src[1];
251	    inst->src[1] = value;
252	    inst->conditional_mod = new_cmod;
253	    return true;
254	 }
255      }
256      break;
257
258   case BRW_OPCODE_SEL:
259      if (arg == 1) {
260	 inst->src[arg] = value;
261	 return true;
262      } else if (arg == 0 && inst->src[1].file != IMM) {
263	 inst->src[0] = inst->src[1];
264	 inst->src[1] = value;
265
266	 /* If this was predicated, flipping operands means
267	  * we also need to flip the predicate.
268	  */
269	 if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
270	    inst->predicate_inverse = !inst->predicate_inverse;
271	 }
272	 return true;
273      }
274      break;
275
276   default:
277      break;
278   }
279
280   return false;
281}
282
283static bool
284is_align1_opcode(unsigned opcode)
285{
286   switch (opcode) {
287   case VEC4_OPCODE_DOUBLE_TO_F32:
288   case VEC4_OPCODE_DOUBLE_TO_D32:
289   case VEC4_OPCODE_DOUBLE_TO_U32:
290   case VEC4_OPCODE_TO_DOUBLE:
291   case VEC4_OPCODE_PICK_LOW_32BIT:
292   case VEC4_OPCODE_PICK_HIGH_32BIT:
293   case VEC4_OPCODE_SET_LOW_32BIT:
294   case VEC4_OPCODE_SET_HIGH_32BIT:
295      return true;
296   default:
297      return false;
298   }
299}
300
301static bool
302try_copy_propagate(const struct intel_device_info *devinfo,
303                   vec4_instruction *inst, int arg,
304                   const copy_entry *entry, int attributes_per_reg)
305{
306   /* Build up the value we are propagating as if it were the source of a
307    * single MOV
308    */
309   src_reg value =
310      get_copy_value(*entry,
311                     brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
312                                                   WRITEMASK_XYZW));
313
314   /* Check that we can propagate that value */
315   if (value.file != UNIFORM &&
316       value.file != VGRF &&
317       value.file != ATTR)
318      return false;
319
320   /* Instructions that write 2 registers also need to read 2 registers. Make
321    * sure we don't break that restriction by copy propagating from a uniform.
322    */
323   if (inst->size_written > REG_SIZE && is_uniform(value))
324      return false;
325
326   /* There is a regioning restriction such that if execsize == width
327    * and hstride != 0 then the vstride can't be 0. When we split instrutions
328    * that take a single-precision source (like F->DF conversions) we end up
329    * with a 4-wide source on an instruction with an execution size of 4.
330    * If we then copy-propagate the source from a uniform we also end up with a
331    * vstride of 0 and we violate the restriction.
332    */
333   if (inst->exec_size == 4 && value.file == UNIFORM &&
334       type_sz(value.type) == 4)
335      return false;
336
337   /* If the type of the copy value is different from the type of the
338    * instruction then the swizzles and writemasks involved don't have the same
339    * meaning and simply replacing the source would produce different semantics.
340    */
341   if (type_sz(value.type) != type_sz(inst->src[arg].type))
342      return false;
343
344   if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE)
345      return false;
346
347   bool has_source_modifiers = value.negate || value.abs;
348
349   /* gfx6 math and gfx7+ SENDs from GRFs ignore source modifiers on
350    * instructions.
351    */
352   if (has_source_modifiers && !inst->can_do_source_mods(devinfo))
353      return false;
354
355   /* Reject cases that would violate register regioning restrictions. */
356   if ((value.file == UNIFORM || value.swizzle != BRW_SWIZZLE_XYZW) &&
357       ((devinfo->ver == 6 && inst->is_math()) ||
358        inst->is_send_from_grf() ||
359        inst->uses_indirect_addressing())) {
360      return false;
361   }
362
363   if (has_source_modifiers &&
364       value.type != inst->src[arg].type &&
365       !inst->can_change_types())
366      return false;
367
368   if (has_source_modifiers &&
369       (inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_WRITE ||
370        inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT))
371      return false;
372
373   unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle,
374                                                   value.swizzle);
375
376   /* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles
377    * so copy-propagation won't be safe if the composed swizzle is anything
378    * other than the identity.
379    */
380   if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW)
381      return false;
382
383   if (inst->is_3src(devinfo) &&
384       (value.file == UNIFORM ||
385        (value.file == ATTR && attributes_per_reg != 1)) &&
386       !brw_is_single_value_swizzle(composed_swizzle))
387      return false;
388
389   if (inst->is_send_from_grf())
390      return false;
391
392   /* we can't generally copy-propagate UD negations becuse we
393    * end up accessing the resulting values as signed integers
394    * instead. See also resolve_ud_negate().
395    */
396   if (value.negate &&
397       value.type == BRW_REGISTER_TYPE_UD)
398      return false;
399
400   /* Don't report progress if this is a noop. */
401   if (value.equals(inst->src[arg]))
402      return false;
403
404   const unsigned dst_saturate_mask = inst->dst.writemask &
405      brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask);
406
407   if (dst_saturate_mask) {
408      /* We either saturate all or nothing. */
409      if (dst_saturate_mask != inst->dst.writemask)
410         return false;
411
412      /* Limit saturate propagation only to SEL with src1 bounded within 0.0
413       * and 1.0, otherwise skip copy propagate altogether.
414       */
415      switch(inst->opcode) {
416      case BRW_OPCODE_SEL:
417         if (arg != 0 ||
418             inst->src[0].type != BRW_REGISTER_TYPE_F ||
419             inst->src[1].file != IMM ||
420             inst->src[1].type != BRW_REGISTER_TYPE_F ||
421             inst->src[1].f < 0.0 ||
422             inst->src[1].f > 1.0) {
423            return false;
424         }
425         if (!inst->saturate)
426            inst->saturate = true;
427         break;
428      default:
429         return false;
430      }
431   }
432
433   /* Build the final value */
434   if (inst->src[arg].abs) {
435      value.negate = false;
436      value.abs = true;
437   }
438   if (inst->src[arg].negate)
439      value.negate = !value.negate;
440
441   value.swizzle = composed_swizzle;
442   if (has_source_modifiers &&
443       value.type != inst->src[arg].type) {
444      assert(inst->can_change_types());
445      for (int i = 0; i < 3; i++) {
446         inst->src[i].type = value.type;
447      }
448      inst->dst.type = value.type;
449   } else {
450      value.type = inst->src[arg].type;
451   }
452
453   inst->src[arg] = value;
454   return true;
455}
456
457bool
458vec4_visitor::opt_copy_propagation(bool do_constant_prop)
459{
460   /* If we are in dual instanced or single mode, then attributes are going
461    * to be interleaved, so one register contains two attribute slots.
462    */
463   const int attributes_per_reg =
464      prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
465   bool progress = false;
466   struct copy_entry entries[alloc.total_size];
467
468   memset(&entries, 0, sizeof(entries));
469
470   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
471      /* This pass only works on basic blocks.  If there's flow
472       * control, throw out all our information and start from
473       * scratch.
474       *
475       * This should really be fixed by using a structure like in
476       * src/glsl/opt_copy_propagation.cpp to track available copies.
477       */
478      if (!is_dominated_by_previous_instruction(inst)) {
479	 memset(&entries, 0, sizeof(entries));
480	 continue;
481      }
482
483      /* For each source arg, see if each component comes from a copy
484       * from the same type file (IMM, VGRF, UNIFORM), and try
485       * optimizing out access to the copy result
486       */
487      for (int i = 2; i >= 0; i--) {
488	 /* Copied values end up in GRFs, and we don't track reladdr
489	  * accesses.
490	  */
491	 if (inst->src[i].file != VGRF ||
492	     inst->src[i].reladdr)
493	    continue;
494
495         /* We only handle register-aligned single GRF copies. */
496         if (inst->size_read(i) != REG_SIZE ||
497             inst->src[i].offset % REG_SIZE)
498            continue;
499
500         const unsigned reg = (alloc.offsets[inst->src[i].nr] +
501                               inst->src[i].offset / REG_SIZE);
502         const copy_entry &entry = entries[reg];
503
504         if (do_constant_prop && try_constant_propagate(inst, i, &entry))
505            progress = true;
506         else if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg))
507	    progress = true;
508      }
509
510      /* Track available source registers. */
511      if (inst->dst.file == VGRF) {
512	 const int reg =
513            alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE;
514
515	 /* Update our destination's current channel values.  For a direct copy,
516	  * the value is the newly propagated source.  Otherwise, we don't know
517	  * the new value, so clear it.
518	  */
519	 bool direct_copy = is_direct_copy(inst);
520         entries[reg].saturatemask &= ~inst->dst.writemask;
521	 for (int i = 0; i < 4; i++) {
522	    if (inst->dst.writemask & (1 << i)) {
523               entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL;
524               entries[reg].saturatemask |=
525                  inst->saturate && direct_copy ? 1 << i : 0;
526            }
527	 }
528
529	 /* Clear the records for any registers whose current value came from
530	  * our destination's updated channels, as the two are no longer equal.
531	  */
532	 if (inst->dst.reladdr)
533	    memset(&entries, 0, sizeof(entries));
534	 else {
535	    for (unsigned i = 0; i < alloc.total_size; i++) {
536	       for (int j = 0; j < 4; j++) {
537		  if (is_channel_updated(inst, entries[i].value, j)) {
538		     entries[i].value[j] = NULL;
539		     entries[i].saturatemask &= ~(1 << j);
540                  }
541	       }
542	    }
543	 }
544      }
545   }
546
547   if (progress)
548      invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
549                          DEPENDENCY_INSTRUCTION_DETAIL);
550
551   return progress;
552}
553
554} /* namespace brw */
555