1/*
2 * Copyright © 2018 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "brw_fs.h"
25#include "brw_cfg.h"
26#include "brw_fs_builder.h"
27
28using namespace brw;
29
30namespace {
31   /* From the SKL PRM Vol 2a, "Move":
32    *
33    * "A mov with the same source and destination type, no source modifier,
34    *  and no saturation is a raw move. A packed byte destination region (B
35    *  or UB type with HorzStride == 1 and ExecSize > 1) can only be written
36    *  using raw move."
37    */
38   bool
39   is_byte_raw_mov(const fs_inst *inst)
40   {
41      return type_sz(inst->dst.type) == 1 &&
42             inst->opcode == BRW_OPCODE_MOV &&
43             inst->src[0].type == inst->dst.type &&
44             !inst->saturate &&
45             !inst->src[0].negate &&
46             !inst->src[0].abs;
47   }
48
49   /*
50    * Return an acceptable byte stride for the destination of an instruction
51    * that requires it to have some particular alignment.
52    */
53   unsigned
54   required_dst_byte_stride(const fs_inst *inst)
55   {
56      if (inst->dst.is_accumulator()) {
57         /* If the destination is an accumulator, insist that we leave the
58          * stride alone.  We cannot "fix" accumulator destinations by writing
59          * to a temporary and emitting a MOV into the original destination.
60          * For multiply instructions (our one use of the accumulator), the
61          * MUL writes the full 66 bits of the accumulator whereas the MOV we
62          * would emit only writes 33 bits and leaves the top 33 bits
63          * undefined.
64          *
65          * It's safe to just require the original stride here because the
66          * lowering pass will detect the mismatch in has_invalid_src_region
67          * and fix the sources of the multiply instead of the destination.
68          */
69         return inst->dst.stride * type_sz(inst->dst.type);
70      } else if (type_sz(inst->dst.type) < get_exec_type_size(inst) &&
71          !is_byte_raw_mov(inst)) {
72         return get_exec_type_size(inst);
73      } else {
74         /* Calculate the maximum byte stride and the minimum/maximum type
75          * size across all source and destination operands we are required to
76          * lower.
77          */
78         unsigned max_stride = inst->dst.stride * type_sz(inst->dst.type);
79         unsigned min_size = type_sz(inst->dst.type);
80         unsigned max_size = type_sz(inst->dst.type);
81
82         for (unsigned i = 0; i < inst->sources; i++) {
83            if (!is_uniform(inst->src[i]) && !inst->is_control_source(i)) {
84               const unsigned size = type_sz(inst->src[i].type);
85               max_stride = MAX2(max_stride, inst->src[i].stride * size);
86               min_size = MIN2(min_size, size);
87               max_size = MAX2(max_size, size);
88            }
89         }
90
91         /* All operands involved in lowering need to fit in the calculated
92          * stride.
93          */
94         assert(max_size <= 4 * min_size);
95
96         /* Attempt to use the largest byte stride among all present operands,
97          * but never exceed a stride of 4 since that would lead to illegal
98          * destination regions during lowering.
99          */
100         return MIN2(max_stride, 4 * min_size);
101      }
102   }
103
104   /*
105    * Return an acceptable byte sub-register offset for the destination of an
106    * instruction that requires it to be aligned to the sub-register offset of
107    * the sources.
108    */
109   unsigned
110   required_dst_byte_offset(const fs_inst *inst)
111   {
112      for (unsigned i = 0; i < inst->sources; i++) {
113         if (!is_uniform(inst->src[i]) && !inst->is_control_source(i))
114            if (reg_offset(inst->src[i]) % REG_SIZE !=
115                reg_offset(inst->dst) % REG_SIZE)
116               return 0;
117      }
118
119      return reg_offset(inst->dst) % REG_SIZE;
120   }
121
122   /*
123    * Return whether the instruction has an unsupported channel bit layout
124    * specified for the i-th source region.
125    */
126   bool
127   has_invalid_src_region(const gen_device_info *devinfo, const fs_inst *inst,
128                          unsigned i)
129   {
130      if (is_unordered(inst) || inst->is_control_source(i))
131         return false;
132
133      /* Empirical testing shows that Broadwell has a bug affecting half-float
134       * MAD instructions when any of its sources has a non-zero offset, such
135       * as:
136       *
137       * mad(8) g18<1>HF -g17<4,4,1>HF g14.8<4,4,1>HF g11<4,4,1>HF { align16 1Q };
138       *
139       * We used to generate code like this for SIMD8 executions where we
140       * used to pack components Y and W of a vector at offset 16B of a SIMD
141       * register. The problem doesn't occur if the stride of the source is 0.
142       */
143      if (devinfo->gen == 8 &&
144          inst->opcode == BRW_OPCODE_MAD &&
145          inst->src[i].type == BRW_REGISTER_TYPE_HF &&
146          reg_offset(inst->src[i]) % REG_SIZE > 0 &&
147          inst->src[i].stride != 0) {
148         return true;
149      }
150
151      const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
152      const unsigned src_byte_stride = inst->src[i].stride *
153         type_sz(inst->src[i].type);
154      const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
155      const unsigned src_byte_offset = reg_offset(inst->src[i]) % REG_SIZE;
156
157      return has_dst_aligned_region_restriction(devinfo, inst) &&
158             !is_uniform(inst->src[i]) &&
159             (src_byte_stride != dst_byte_stride ||
160              src_byte_offset != dst_byte_offset);
161   }
162
163   /*
164    * Return whether the instruction has an unsupported channel bit layout
165    * specified for the destination region.
166    */
167   bool
168   has_invalid_dst_region(const gen_device_info *devinfo,
169                          const fs_inst *inst)
170   {
171      if (is_unordered(inst)) {
172         return false;
173      } else {
174         const brw_reg_type exec_type = get_exec_type(inst);
175         const unsigned dst_byte_offset = reg_offset(inst->dst) % REG_SIZE;
176         const unsigned dst_byte_stride = inst->dst.stride * type_sz(inst->dst.type);
177         const bool is_narrowing_conversion = !is_byte_raw_mov(inst) &&
178            type_sz(inst->dst.type) < type_sz(exec_type);
179
180         return (has_dst_aligned_region_restriction(devinfo, inst) &&
181                 (required_dst_byte_stride(inst) != dst_byte_stride ||
182                  required_dst_byte_offset(inst) != dst_byte_offset)) ||
183                (is_narrowing_conversion &&
184                 required_dst_byte_stride(inst) != dst_byte_stride);
185      }
186   }
187
188   /*
189    * Return whether the instruction has unsupported source modifiers
190    * specified for the i-th source region.
191    */
192   bool
193   has_invalid_src_modifiers(const gen_device_info *devinfo, const fs_inst *inst,
194                             unsigned i)
195   {
196      return !inst->can_do_source_mods(devinfo) &&
197             (inst->src[i].negate || inst->src[i].abs);
198   }
199
200   /*
201    * Return whether the instruction has an unsupported type conversion
202    * specified for the destination.
203    */
204   bool
205   has_invalid_conversion(const gen_device_info *devinfo, const fs_inst *inst)
206   {
207      switch (inst->opcode) {
208      case BRW_OPCODE_MOV:
209         return false;
210      case BRW_OPCODE_SEL:
211         return inst->dst.type != get_exec_type(inst);
212      case SHADER_OPCODE_BROADCAST:
213      case SHADER_OPCODE_MOV_INDIRECT:
214         /* The source and destination types of these may be hard-coded to
215          * integer at codegen time due to hardware limitations of 64-bit
216          * types.
217          */
218         return ((devinfo->gen == 7 && !devinfo->is_haswell) ||
219                 devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) &&
220                type_sz(inst->src[0].type) > 4 &&
221                inst->dst.type != inst->src[0].type;
222      default:
223         /* FIXME: We assume the opcodes don't explicitly mentioned before
224          * just work fine with arbitrary conversions.
225          */
226         return false;
227      }
228   }
229
230   /**
231    * Return whether the instruction has non-standard semantics for the
232    * conditional mod which don't cause the flag register to be updated with
233    * the comparison result.
234    */
235   bool
236   has_inconsistent_cmod(const fs_inst *inst)
237   {
238      return inst->opcode == BRW_OPCODE_SEL ||
239             inst->opcode == BRW_OPCODE_CSEL ||
240             inst->opcode == BRW_OPCODE_IF ||
241             inst->opcode == BRW_OPCODE_WHILE;
242   }
243
244   bool
245   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst);
246}
247
248namespace brw {
249   /**
250    * Remove any modifiers from the \p i-th source region of the instruction,
251    * including negate, abs and any implicit type conversion to the execution
252    * type.  Instead any source modifiers will be implemented as a separate
253    * MOV instruction prior to the original instruction.
254    */
255   bool
256   lower_src_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
257   {
258      assert(inst->components_read(i) == 1);
259      const fs_builder ibld(v, block, inst);
260      const fs_reg tmp = ibld.vgrf(get_exec_type(inst));
261
262      lower_instruction(v, block, ibld.MOV(tmp, inst->src[i]));
263      inst->src[i] = tmp;
264
265      return true;
266   }
267}
268
269namespace {
270   /**
271    * Remove any modifiers from the destination region of the instruction,
272    * including saturate, conditional mod and any implicit type conversion
273    * from the execution type.  Instead any destination modifiers will be
274    * implemented as a separate MOV instruction after the original
275    * instruction.
276    */
277   bool
278   lower_dst_modifiers(fs_visitor *v, bblock_t *block, fs_inst *inst)
279   {
280      const fs_builder ibld(v, block, inst);
281      const brw_reg_type type = get_exec_type(inst);
282      /* Not strictly necessary, but if possible use a temporary with the same
283       * channel alignment as the current destination in order to avoid
284       * violating the restrictions enforced later on by lower_src_region()
285       * and lower_dst_region(), which would introduce additional copy
286       * instructions into the program unnecessarily.
287       */
288      const unsigned stride =
289         type_sz(inst->dst.type) * inst->dst.stride <= type_sz(type) ? 1 :
290         type_sz(inst->dst.type) * inst->dst.stride / type_sz(type);
291      const fs_reg tmp = horiz_stride(ibld.vgrf(type, stride), stride);
292
293      /* Emit a MOV taking care of all the destination modifiers. */
294      fs_inst *mov = ibld.at(block, inst->next).MOV(inst->dst, tmp);
295      mov->saturate = inst->saturate;
296      if (!has_inconsistent_cmod(inst))
297         mov->conditional_mod = inst->conditional_mod;
298      if (inst->opcode != BRW_OPCODE_SEL) {
299         mov->predicate = inst->predicate;
300         mov->predicate_inverse = inst->predicate_inverse;
301      }
302      mov->flag_subreg = inst->flag_subreg;
303      lower_instruction(v, block, mov);
304
305      /* Point the original instruction at the temporary, and clean up any
306       * destination modifiers.
307       */
308      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
309      inst->dst = tmp;
310      inst->size_written = inst->dst.component_size(inst->exec_size);
311      inst->saturate = false;
312      if (!has_inconsistent_cmod(inst))
313         inst->conditional_mod = BRW_CONDITIONAL_NONE;
314
315      assert(!inst->flags_written() || !mov->predicate);
316      return true;
317   }
318
319   /**
320    * Remove any non-trivial shuffling of data from the \p i-th source region
321    * of the instruction.  Instead implement the region as a series of integer
322    * copies into a temporary with the same channel layout as the destination.
323    */
324   bool
325   lower_src_region(fs_visitor *v, bblock_t *block, fs_inst *inst, unsigned i)
326   {
327      assert(inst->components_read(i) == 1);
328      const fs_builder ibld(v, block, inst);
329      const unsigned stride = type_sz(inst->dst.type) * inst->dst.stride /
330                              type_sz(inst->src[i].type);
331      assert(stride > 0);
332      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->src[i].type, stride),
333                                      stride);
334
335      /* Emit a series of 32-bit integer copies with any source modifiers
336       * cleaned up (because their semantics are dependent on the type).
337       */
338      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
339                                                 false);
340      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
341      fs_reg raw_src = inst->src[i];
342      raw_src.negate = false;
343      raw_src.abs = false;
344
345      for (unsigned j = 0; j < n; j++)
346         ibld.MOV(subscript(tmp, raw_type, j), subscript(raw_src, raw_type, j));
347
348      /* Point the original instruction at the temporary, making sure to keep
349       * any source modifiers in the instruction.
350       */
351      fs_reg lower_src = tmp;
352      lower_src.negate = inst->src[i].negate;
353      lower_src.abs = inst->src[i].abs;
354      inst->src[i] = lower_src;
355
356      return true;
357   }
358
359   /**
360    * Remove any non-trivial shuffling of data from the destination region of
361    * the instruction.  Instead implement the region as a series of integer
362    * copies from a temporary with a channel layout compatible with the
363    * sources.
364    */
365   bool
366   lower_dst_region(fs_visitor *v, bblock_t *block, fs_inst *inst)
367   {
368      /* We cannot replace the result of an integer multiply which writes the
369       * accumulator because MUL+MACH pairs act on the accumulator as a 66-bit
370       * value whereas the MOV will act on only 32 or 33 bits of the
371       * accumulator.
372       */
373      assert(inst->opcode != BRW_OPCODE_MUL || !inst->dst.is_accumulator() ||
374             brw_reg_type_is_floating_point(inst->dst.type));
375
376      const fs_builder ibld(v, block, inst);
377      const unsigned stride = required_dst_byte_stride(inst) /
378                              type_sz(inst->dst.type);
379      assert(stride > 0);
380      const fs_reg tmp = horiz_stride(ibld.vgrf(inst->dst.type, stride),
381                                      stride);
382
383      /* Emit a series of 32-bit integer copies from the temporary into the
384       * original destination.
385       */
386      const brw_reg_type raw_type = brw_int_type(MIN2(type_sz(tmp.type), 4),
387                                                 false);
388      const unsigned n = type_sz(tmp.type) / type_sz(raw_type);
389
390      if (inst->predicate && inst->opcode != BRW_OPCODE_SEL) {
391         /* Note that in general we cannot simply predicate the copies on the
392          * same flag register as the original instruction, since it may have
393          * been overwritten by the instruction itself.  Instead initialize
394          * the temporary with the previous contents of the destination
395          * register.
396          */
397         for (unsigned j = 0; j < n; j++)
398            ibld.MOV(subscript(tmp, raw_type, j),
399                     subscript(inst->dst, raw_type, j));
400      }
401
402      for (unsigned j = 0; j < n; j++)
403         ibld.at(block, inst->next).MOV(subscript(inst->dst, raw_type, j),
404                                        subscript(tmp, raw_type, j));
405
406      /* Point the original instruction at the temporary, making sure to keep
407       * any destination modifiers in the instruction.
408       */
409      assert(inst->size_written == inst->dst.component_size(inst->exec_size));
410      inst->dst = tmp;
411      inst->size_written = inst->dst.component_size(inst->exec_size);
412
413      return true;
414   }
415
416   /**
417    * Legalize the source and destination regioning controls of the specified
418    * instruction.
419    */
420   bool
421   lower_instruction(fs_visitor *v, bblock_t *block, fs_inst *inst)
422   {
423      const gen_device_info *devinfo = v->devinfo;
424      bool progress = false;
425
426      if (has_invalid_conversion(devinfo, inst))
427         progress |= lower_dst_modifiers(v, block, inst);
428
429      if (has_invalid_dst_region(devinfo, inst))
430         progress |= lower_dst_region(v, block, inst);
431
432      for (unsigned i = 0; i < inst->sources; i++) {
433         if (has_invalid_src_modifiers(devinfo, inst, i))
434            progress |= lower_src_modifiers(v, block, inst, i);
435
436         if (has_invalid_src_region(devinfo, inst, i))
437            progress |= lower_src_region(v, block, inst, i);
438      }
439
440      return progress;
441   }
442}
443
444bool
445fs_visitor::lower_regioning()
446{
447   bool progress = false;
448
449   foreach_block_and_inst_safe(block, fs_inst, inst, cfg)
450      progress |= lower_instruction(this, block, inst);
451
452   if (progress)
453      invalidate_live_intervals();
454
455   return progress;
456}
457