/*
 * Copyright © 2018 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

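/*
 * Lowering pass that legalizes the regioning controls of each instruction:
 * destination and source regions, source and destination modifiers, and
 * execution types the hardware cannot handle directly are rewritten in
 * terms of temporary registers and additional MOV instructions emitted
 * around the original instruction.
 */
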
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_fs_builder.h"

using namespace brw;

namespace {
   /* From the SKL PRM Vol 2a, "Move":
    *
    * "A mov with the same source and destination type, no source modifier,
    *  and no saturation is a raw move. A packed byte destination region (B
    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
    *  using raw move."
    */
   bool
   is_byte_raw_mov(const fs_inst *inst)
   {
      return type_sz(inst->dst.type) == 1 &&
             inst->opcode == BRW_OPCODE_MOV &&
             inst->src[0].type == inst->dst.type &&
             !inst->saturate &&
             !inst->src[0].negate &&
             !inst->src[0].abs;
   }

   /*
    * Return an acceptable byte stride for the destination of an instruction
    * that requires it to have some particular alignment.
    */
   unsigned
   required_dst_byte_stride(const fs_inst *inst)
   {
      if (inst->dst.is_accumulator()) {
         /* If the destination is an accumulator, insist that we leave the
          * stride alone.  We cannot "fix" accumulator destinations by writing
          * to a temporary and emitting a MOV into the original destination.
          * For multiply instructions (our one use of the accumulator), the
          * MUL writes the full 66 bits of the accumulator whereas the MOV we
          * would emit only writes 33 bits and leaves the top 33 bits
          * undefined.
          *
          * It's safe to just require the original stride here because the
          * lowering pass will detect the mismatch in has_invalid_src_region
          * and fix the sources of the multiply instead of the destination.
          */
         return inst->dst.stride * type_sz(inst->dst.type);
      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
          !is_byte_raw_mov(inst)) {
         return get_exec_type_size(inst);
      } else {
         /* Calculate the maximum byte stride and the minimum/maximum type
          * size across all source and destination operands we are required to
          * lower.
          */
         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
         unsigned min_size = type_sz(inst->dst.type);
         unsigned max_size = type_sz(inst->dst.type);

         for (unsigned i = 0; i < inst->sources; i++) {
            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
               const unsigned size = type_sz(inst->src[i].type);
               max_stride = MAX2(max_stride, inst->src[i].stride * size);
               min_size = MIN2(min_size, size);
               max_size = MAX2(max_size, size);
            }
         }

         /* All operands involved in lowering need to fit in the calculated
          * stride.
          */
         assert(max_size <= 4 * min_size);

         /* Attempt to use the largest byte stride among all present operands,
          * but never exceed a stride of 4 since that would lead to illegal
          * destination regions during lowering.
          */
         return MIN2(max_stride, 4 * min_size);
      }
   }

   /*
    * Return an acceptable byte sub-register offset for the destination of an
    * instruction that requires it to be aligned to the sub-register offset of
    * the sources.
    */
   unsigned
   required_dst_byte_offset(const fs_inst *inst)
   {
      for (unsigned i = 0; i < inst->sources; i++) {
         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
            if (reg_offset(inst->src[i]) % REG_SIZE !=
                reg_offset(inst->dst) % REG_SIZE)
               return 0;
      }

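      /* Every variable source agrees with the destination's sub-register
       * offset, so the current offset is already acceptable.
       */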
      return reg_offset(inst->dst) % REG_SIZE;
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_region(const intel_device_info *devinfo, const fs_inst *inst,
                          unsigned i)
   {
      if (is_unordered(inst) || inst->is_control_source(i))
         return false;

      /* Empirical testing shows that Broadwell has a bug affecting half-float
       * MAD instructions when any of its sources has a non-zero offset, such
       * as:
       *
       * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
       *
       * We used to generate code like this for SIMD8 executions where we
       * used to pack components Y and W of a vector at offset 16B of a SIMD
       * register. The problem doesn't occur if the stride of the source is 0.
       */
      if (devinfo->ver == 8 &&
          inst->opcode == BRW_OPCODE_MAD &&
          inst->src[i].type == BRW_REGISTER_TYPE_HF &&
          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
          inst->src[i].stride != 0) {
         return true;
      }

      const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
      const unsigned src_byte_stride = inst->src[i].stride *
         type_sz(inst->src[i].type);
      const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
      const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;

      return has_dst_aligned_region_restriction(devinfo, inst) &&
             !is_uniform(inst->src[i]) &&
             (src_byte_stride != dst_byte_stride ||
              src_byte_offset != dst_byte_offset);
   }

   /*
    * Return whether the instruction has an unsupported channel bit layout
    * specified for the destination region.
    */
   bool
   has_invalid_dst_region(const intel_device_info *devinfo,
                          const fs_inst *inst)
   {
      if (is_unordered(inst)) {
         return false;
      } else {
         const brw_reg_type exec_type = get_exec_type(inst);
         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
            type_sz(inst->dst.type) < type_sz(exec_type);

         return (has_dst_aligned_region_restriction(devinfo, inst) &&
                 (required_dst_byte_stride(inst) != dst_byte_stride ||
                  required_dst_byte_offset(inst) != dst_byte_offset)) ||
                (is_narrowing_conversion &&
                 required_dst_byte_stride(inst) != dst_byte_stride);
      }
   }

   /**
    * Return a non-zero value if the execution type of the instruction is
    * unsupported.  The destination and sources matching the returned mask
    * will be bit-cast to an integer type of appropriate size, lowering any
    * source or destination modifiers into separate MOV instructions.
    */
   unsigned
   has_invalid_exec_type(const intel_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case SHADER_OPCODE_SHUFFLE:
      case SHADER_OPCODE_QUAD_SWIZZLE:
         return has_dst_aligned_region_restriction(devinfo, inst) ?
                0x1 : 0;

      case SHADER_OPCODE_BROADCAST:
      case SHADER_OPCODE_MOV_INDIRECT:
         return (((devinfo->verx10 == 70) ||
                  devinfo->is_cherryview || intel_device_info_is_9lp(devinfo) ||
                  devinfo->verx10 >= 125) && type_sz(inst->src[0].type) > 4) ||
                (devinfo->verx10 >= 125 &&
                 brw_reg_type_is_floating_point(inst->src[0].type)) ?
                0x1 : 0;

      default:
         return 0;
      }
   }

   /*
    * Return whether the instruction has unsupported source modifiers
    * specified for the i-th source region.
    */
   bool
   has_invalid_src_modifiers(const intel_device_info *devinfo,
                             const fs_inst *inst, unsigned i)
   {
      return (!inst->can_do_source_mods(devinfo) &&
              (inst->src[i].negate || inst->src[i].abs)) ||
             ((has_invalid_exec_type(devinfo, inst) & (1u << i)) &&
              (inst->src[i].negate || inst->src[i].abs ||
               inst->src[i].type != get_exec_type(inst)));
   }

   /*
    * Return whether the instruction has an unsupported type conversion
    * specified for the destination.
    */
   bool
   has_invalid_conversion(const intel_device_info *devinfo, const fs_inst *inst)
   {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         return false;
      case BRW_OPCODE_SEL:
         return inst->dst.type != get_exec_type(inst);
      default:
         /* FIXME: We assume the opcodes not explicitly mentioned before just
          * work fine with arbitrary conversions, unless they need to be
          * bit-cast.
          */
         return has_invalid_exec_type(devinfo, inst) &&
                inst->dst.type != get_exec_type(inst);
      }
   }

   /**
    * Return whether the instruction has unsupported destination modifiers.
    */
   bool
   has_invalid_dst_modifiers(const intel_device_info *devinfo, const fs_inst *inst)
   {
      return (has_invalid_exec_type(devinfo, inst) &&
              (inst->saturate || inst->conditional_mod)) ||
             has_invalid_conversion(devinfo, inst);
   }

   /**
    * Return whether the instruction has non-standard semantics for the
    * conditional mod which don't cause the flag register to be updated with
    * the comparison result.
    */
   bool
   has_inconsistent_cmod(const fs_inst *inst)
   {
      return inst->opcode == BRW_OPCODE_SEL ||
             inst->opcode == BRW_OPCODE_CSEL ||
             inst->opcode == BRW_OPCODE_IF ||
             inst->opcode == BRW_OPCODE_WHILE;
   }

   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
}

namespace brw {
   /**
    * Remove any modifiers from the \p i-th source region of the instruction,
    * including negate, abs and any implicit type conversion to the execution
    * type.  Instead any source modifiers will be implemented as a separate
    * MOV instruction prior to the original instruction.
    */
   bool
   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
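      /* The MOV emitted below converts the source to the execution type.
       * For an integer MUL that could turn a W/UW operand into a DWord
       * operand, which isn't supported on hardware without integer DWord
       * multiply, so assert that this cannot happen here.
       */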
      assert(v->devinfo->has_integer_dword_mul ||
             inst->opcode != BRW_OPCODE_MUL ||
             brw_reg_type_is_floating_point(get_exec_type(inst)) ||
             MIN2(type_sz(inst->src[0].type), type_sz(inst->src[1].type)) >= 4 ||
             type_sz(inst->src[i].type) == get_exec_type_size(inst));

      const fs_builder ibld(v, block, inst);
      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));

      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
      inst->src[i] = tmp;

      return true;
   }
}

namespace {
   /**
    * Remove any modifiers from the destination region of the instruction,
    * including saturate, conditional mod and any implicit type conversion
    * from the execution type.  Instead any destination modifiers will be
    * implemented as a separate MOV instruction after the original
    * instruction.
    */
   bool
   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const fs_builder ibld(v, block, inst);
      const brw_reg_type type = get_exec_type(inst);
      /* Not strictly necessary, but if possible use a temporary with the same
       * channel alignment as the current destination in order to avoid
       * violating the restrictions enforced later on by lower_src_region()
       * and lower_dst_region(), which would introduce additional copy
       * instructions into the program unnecessarily.
       */
      const unsigned stride =
         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
      fs_reg tmp = ibld.vgrf(type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a MOV taking care of all the destination modifiers. */
      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
      mov->saturate = inst->saturate;
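      /* The conditional mod and predicate are only transferred to the MOV
       * when they have the usual semantics:  SEL, CSEL, IF and WHILE use the
       * conditional mod without updating the flag register, and SEL consumes
       * its predicate to select between sources, so in those cases they
       * remain on the original instruction.
       */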
      if (!has_inconsistent_cmod(inst))
         mov->conditional_mod = inst->conditional_mod;
      if (inst->opcode != BRW_OPCODE_SEL) {
         mov->predicate = inst->predicate;
         mov->predicate_inverse = inst->predicate_inverse;
      }
      mov->flag_subreg = inst->flag_subreg;
      lower_instruction(v, block, mov);

      /* Point the original instruction at the temporary, and clean up any
       * destination modifiers.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);
      inst->saturate = false;
      if (!has_inconsistent_cmod(inst))
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

      assert(!inst->flags_written(v->devinfo) || !mov->predicate);
      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the \p i-th source region
    * of the instruction.  Instead implement the region as a series of integer
    * copies into a temporary with the same channel layout as the destination.
    */
   bool
   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
   {
      assert(inst->components_read(i) == 1);
      const fs_builder ibld(v, block, inst);
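      /* Allocate a temporary whose byte stride matches the destination
       * region, so the lowered copies leave the data in the channel layout
       * the instruction expects to read.
       */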
      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
                              type_sz(inst->src[i].type);
      assert(stride > 0);
      fs_reg tmp = ibld.vgrf(inst->src[i].type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies with any source modifiers
       * cleaned up (because their semantics are dependent on the type).
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
      fs_reg raw_src = inst->src[i];
      raw_src.negate = false;
      raw_src.abs = false;

      for (unsigned j = 0; j < n; j++)
         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any source modifiers in the instruction.
       */
      fs_reg lower_src = tmp;
      lower_src.negate = inst->src[i].negate;
      lower_src.abs = inst->src[i].abs;
      inst->src[i] = lower_src;

      return true;
   }

   /**
    * Remove any non-trivial shuffling of data from the destination region of
    * the instruction.  Instead implement the region as a series of integer
    * copies from a temporary with a channel layout compatible with the
    * sources.
    */
   bool
   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      /* We cannot replace the result of an integer multiply which writes the
       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
       * value whereas the MOV will act on only 32 or 33 bits of the
       * accumulator.
       */
      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
             brw_reg_type_is_floating_point(inst->dst.type));

      const fs_builder ibld(v, block, inst);
      const unsigned stride = required_dst_byte_stride(inst) /
                              type_sz(inst->dst.type);
      assert(stride > 0);
      fs_reg tmp = ibld.vgrf(inst->dst.type, stride);
      ibld.UNDEF(tmp);
      tmp = horiz_stride(tmp, stride);

      /* Emit a series of 32-bit integer copies from the temporary into the
       * original destination.
       */
      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
                                                 false);
      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);

      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
         /* Note that in general we cannot simply predicate the copies on the
          * same flag register as the original instruction, since it may have
          * been overwritten by the instruction itself.  Instead initialize
          * the temporary with the previous contents of the destination
          * register.
          */
         for (unsigned j = 0; j < n; j++)
            ibld.MOV(subscript(tmp, raw_type, j),
                     subscript(inst->dst, raw_type, j));
      }

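      /* Copy the temporary into the original destination after the lowered
       * instruction has executed.
       */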
      for (unsigned j = 0; j < n; j++)
         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
                                        subscript(tmp, raw_type, j));

      /* Point the original instruction at the temporary, making sure to keep
       * any destination modifiers in the instruction.
       */
      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
      inst->dst = tmp;
      inst->size_written = inst->dst.component_size(inst->exec_size);

      return true;
   }

   /**
    * Bit-cast sources and destination of the instruction to an appropriate
    * integer type, to be used in cases where the instruction doesn't support
    * some other execution type.
    */
   bool
   lower_exec_type(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      assert(inst->dst.type == get_exec_type(inst));
      const unsigned mask = has_invalid_exec_type(v->devinfo, inst);
      const brw_reg_type raw_type = brw_int_type(type_sz(inst->dst.type), false);

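      /* Reinterpret the sources selected by the mask, and the destination, as
       * an unsigned integer type of the same size.  This doesn't change the
       * bit layout of the data, it only gives the instruction an execution
       * type the hardware can handle.
       */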
      for (unsigned i = 0; i < inst->sources; i++) {
         if (mask & (1u << i)) {
            assert(inst->src[i].type == inst->dst.type);
            inst->src[i].type = raw_type;
         }
      }

      inst->dst.type = raw_type;

      return true;
   }

   /**
    * Legalize the source and destination regioning controls of the specified
    * instruction.
    */
   bool
   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
   {
      const intel_device_info *devinfo = v->devinfo;
      bool progress = false;

      if (has_invalid_dst_modifiers(devinfo, inst))
         progress |= lower_dst_modifiers(v, block, inst);

      if (has_invalid_dst_region(devinfo, inst))
         progress |= lower_dst_region(v, block, inst);

      for (unsigned i = 0; i < inst->sources; i++) {
         if (has_invalid_src_modifiers(devinfo, inst, i))
            progress |= lower_src_modifiers(v, block, inst, i);

         if (has_invalid_src_region(devinfo, inst, i))
            progress |= lower_src_region(v, block, inst, i);
      }

      if (has_invalid_exec_type(devinfo, inst))
         progress |= lower_exec_type(v, block, inst);

      return progress;
   }
}

bool
fs_visitor::lower_regioning()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
      progress |= lower_instruction(this, block, inst);

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}