/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_bank_conflicts.cpp
 *
 * This file contains a GRF bank conflict mitigation pass.  The pass is
 * intended to be run after register allocation and works by rearranging the
 * layout of the GRF space (without altering the semantics of the program) in
 * a way that minimizes the number of GRF bank conflicts incurred by ternary
 * instructions.
 *
 * Unfortunately there is close to no information about bank conflicts in the
 * hardware spec, but experimentally on Gen7-Gen9 ternary instructions seem to
 * incur an average bank conflict penalty of one cycle per SIMD8 op whenever
 * the second and third sources are stored in the same GRF bank (\sa bank_of()
 * for the exact bank layout), which prevents the EU from fetching both
 * sources during the same cycle, unless the EU logic manages to optimize out
 * the read cycle of a duplicate source register (\sa
 * is_conflict_optimized_out()).
 *
 * The asymptotic run-time of the algorithm is dominated by the
 * shader_conflict_weight_matrix() computation below, which is O(n) in the
 * number of instructions of the program.  However, for small and medium-sized
 * programs the run-time is likely to be dominated by
 * optimize_reg_permutation() instead, which is O(m^3) in the number of GRF
 * atoms of the program (\sa partitioning).  That number is bounded (since the
 * program uses a bounded number of registers post-regalloc) and of the order
 * of 100, so optimize_reg_permutation() is vectorized in order to keep the
 * cubic term within reasonable bounds for m close to its theoretical maximum.
 */

#include "brw_fs.h"
#include "brw_cfg.h"

#ifdef __SSE2__

#include <emmintrin.h>

/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on pairs of independent SSE2 integer vectors à
 * la SIMD16 for somewhat improved throughput.  SSE2 is supported by virtually
 * all platforms that care about bank conflicts, so this path should almost
 * always be available in practice.
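 *
 * With two 128-bit vectors of int16_t components, vector_width works out to
 * 16, so each of the operations below processes 16 weights at a time.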
 */
namespace {
   /**
    * SIMD integer vector data type.
    */
   struct vector_type {
      __m128i v[2];
   };

   /**
    * Scalar data type matching the representation of a single component of \p
    * vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      scalar_type x;
      memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
      return x;
   }

   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
            _mm_adds_epi16(v.v[0], w.v[0]),
            _mm_adds_epi16(v.v[1], w.v[1])
         }};
      return u;
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
            _mm_subs_epi16(v.v[0], w.v[0]),
            _mm_subs_epi16(v.v[1], w.v[1])
         }};
      return u;
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
            _mm_and_si128(v.v[0], w.v[0]),
            _mm_and_si128(v.v[1], w.v[1])
         }};
      return u;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(const vector_type &v)
   {
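      /* Saturating horizontal reduction: fold the two 8-lane vectors into
       * one, then halve the number of live lanes (8 -> 4 -> 2 -> 1) by
       * adding each lane to a shuffled counterpart, leaving the total in
       * the bottom lane.
       */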
      const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
      const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
      const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
      const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
      return _mm_extract_epi16(v1, 0);
   }
}

#else

/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on scalar values and doesn't rely on
 * any vector extensions.  This is mainly intended for debugging and
 * to keep this file building on exotic platforms.
 */
namespace {
   /**
    * SIMD integer vector data type.
    */
   typedef int16_t vector_type;

   /**
    * Scalar data type matching the representation of a single component of \p
    * vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 1;

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      v = x;
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      return v;
   }

   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(vector_type v, vector_type w)
   {
      return v & w;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(vector_type v)
   {
      return v;
   }
}

#endif

/**
 * Swap \p x and \p y.
 */
#define SWAP(x, y) do {                          \
      __typeof(y) _swap_tmp = y;                 \
      y = x;                                     \
      x = _swap_tmp;                             \
   } while (0)

namespace {
   /**
    * Variable-length vector type intended to represent cycle-count costs for
    * arbitrary atom-to-bank assignments.  It's indexed by a pair of integers
    * (i, p), where i is an atom index and p in {0, 1} indicates the parity of
    * the conflict (respectively, whether the cost is incurred whenever the
    * atoms are assigned the same bank b or opposite-parity banks b and b^1).
    * \sa shader_conflict_weight_matrix()
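    *
    * The (i, p) component is stored at flat index 2 * i + p, i.e. in lane
    * (2 * i + p) % vector_width of vector v[(2 * i + p) / vector_width], as
    * implemented by the set() and get() accessors below.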
    */
   struct weight_vector_type {
      weight_vector_type() : v(NULL), size(0) {}

      weight_vector_type(unsigned n) : v(alloc(n)), size(n) {}

      weight_vector_type(const weight_vector_type &u) :
         v(alloc(u.size)), size(u.size)
      {
         memcpy(v, u.v,
                DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
      }

      ~weight_vector_type()
      {
         free(v);
      }

      weight_vector_type &
      operator=(weight_vector_type u)
      {
         SWAP(v, u.v);
         SWAP(size, u.size);
         return *this;
      }

      vector_type *v;
      unsigned size;

   private:
      static vector_type *
      alloc(unsigned n)
      {
         const unsigned align = MAX2(sizeof(void *), __alignof__(vector_type));
         const unsigned size = DIV_ROUND_UP(n, vector_width) * sizeof(vector_type);
         void *p;
         if (posix_memalign(&p, align, size))
            return NULL;
         memset(p, 0, size);
         return reinterpret_cast<vector_type *>(p);
      }
   };

   /**
    * Set the (i, p)-th component of weight vector \p v to \p x.
    */
   void
   set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
   {
      set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
   }

   /**
    * Get the (i, p)-th component of weight vector \p v.
    */
   scalar_type
   get(const weight_vector_type &v, unsigned i, unsigned p)
   {
      return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
   }

   /**
    * Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
    */
   void
   swap(weight_vector_type &v,
        unsigned i, unsigned p,
        unsigned j, unsigned q)
   {
      const scalar_type tmp = get(v, i, p);
      set(v, i, p, get(v, j, q));
      set(v, j, q, tmp);
   }
}

namespace {
   /**
    * Object that represents the partitioning of an arbitrary register space
    * into indivisible units (referred to as atoms below) that can potentially
    * be rearranged independently from other registers.  The partitioning is
    * inferred from a number of contiguity requirements specified using
    * require_contiguous().  This allows efficient look-up of the atom index a
    * given register address belongs to, or conversely the range of register
    * addresses that belong to a given atom.
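    *
    * For example, on a 5-register file, require_contiguous(1, 2) followed by
    * require_contiguous(2, 2) fuses registers 1-3 into a single atom,
    * leaving the partitioning { {0}, {1, 2, 3}, {4} } with
    * atom_of_reg(2) == 1, reg_of_atom(1) == 1 and size_of_atom(1) == 3.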
    */
   struct partitioning {
      /**
       * Create a (for the moment unrestricted) partitioning of a register
       * file of size \p n.  The units are arbitrary.
       */
      partitioning(unsigned n) :
         max_reg(n),
         offsets(new unsigned[n + num_terminator_atoms]),
         atoms(new unsigned[n + num_terminator_atoms])
      {
         for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
            offsets[i] = i;
            atoms[i] = i;
         }
      }

      partitioning(const partitioning &p) :
         max_reg(p.max_reg),
         offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
         atoms(new unsigned[p.max_reg + num_terminator_atoms])
      {
         memcpy(offsets, p.offsets,
                sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
         memcpy(atoms, p.atoms,
                sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
      }

      ~partitioning()
      {
         delete[] offsets;
         delete[] atoms;
      }

      partitioning &
      operator=(partitioning p)
      {
         SWAP(max_reg, p.max_reg);
         SWAP(offsets, p.offsets);
         SWAP(atoms, p.atoms);
         return *this;
      }

      /**
       * Require register range [reg, reg + n[ to be considered part of the
       * same atom.
       */
      void
      require_contiguous(unsigned reg, unsigned n)
      {
         unsigned r = atoms[reg];

         /* Renumber atoms[reg...] = { r... } and their offsets[r...] for the
          * case that the specified contiguity requirement leads to the fusion
          * (yay) of one or more existing atoms.
          */
         for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
            if (offsets[atoms[reg1]] < reg + n) {
               atoms[reg1] = r;
            } else {
               if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
                  r++;

               offsets[r] = offsets[atoms[reg1]];
               atoms[reg1] = r;
            }
         }
      }

      /**
       * Get the atom index register address \p reg belongs to.
       */
      unsigned
      atom_of_reg(unsigned reg) const
      {
         return atoms[reg];
      }

      /**
       * Get the base register address that belongs to atom \p r.
       */
      unsigned
      reg_of_atom(unsigned r) const
      {
         return offsets[r];
      }

      /**
       * Get the size of atom \p r in register address units.
       */
      unsigned
      size_of_atom(unsigned r) const
      {
         assert(r < num_atoms());
         return reg_of_atom(r + 1) - reg_of_atom(r);
      }

      /**
       * Get the number of atoms the whole register space is partitioned into.
       */
      unsigned
      num_atoms() const
      {
         return atoms[max_reg];
      }

   private:
      /**
       * Number of trailing atoms inserted for convenience so among other
       * things we don't need to special-case the last element in
       * size_of_atom().
       */
      static const unsigned num_terminator_atoms = 1;
      unsigned max_reg;
      unsigned *offsets;
      unsigned *atoms;
   };

   /**
    * Only GRF sources (whether they have been register-allocated or not) can
    * possibly incur bank conflicts.
    */
   bool
   is_grf(const fs_reg &r)
   {
      return r.file == VGRF || r.file == FIXED_GRF;
   }

   /**
    * Register offset of \p r in GRF units.  Useful because the representation
    * of GRFs post-register allocation is somewhat inconsistent and depends on
    * whether the register already had a fixed GRF offset prior to register
    * allocation or whether it was part of a VGRF allocation.
    */
   unsigned
   reg_of(const fs_reg &r)
   {
      assert(is_grf(r));
      if (r.file == VGRF)
         return r.nr + r.offset / REG_SIZE;
      else
         return reg_offset(r) / REG_SIZE;
   }

   /**
    * Calculate the finest partitioning of the GRF space compatible with the
    * register contiguity requirements derived from all instructions part of
    * the program.
    */
   partitioning
   shader_reg_partitioning(const fs_visitor *v)
   {
      partitioning p(BRW_MAX_GRF);

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (is_grf(inst->dst))
            p.require_contiguous(reg_of(inst->dst), regs_written(inst));

         for (int i = 0; i < inst->sources; i++) {
            if (is_grf(inst->src[i]))
               p.require_contiguous(reg_of(inst->src[i]), regs_read(inst, i));
         }
      }

      return p;
   }

   /**
    * Return the set of GRF atoms that should be left untouched at their
    * original location to avoid violating hardware or software assumptions.
    */
   bool *
   shader_reg_constraints(const fs_visitor *v, const partitioning &p)
   {
      bool *constrained = new bool[p.num_atoms()]();

      /* These are read implicitly by some send-message instructions without
       * any indication at the IR level.  Assume they are unsafe to move
       * around.
       */
      for (unsigned reg = 0; reg < 2; reg++)
         constrained[p.atom_of_reg(reg)] = true;

      /* From the Intel Broadwell PRM, vol 07, section "Instruction Set
       * Reference", subsection "EUISA Instructions", Send Message (page 990):
       *
       * "r127 must not be used for return address when there is a src and
       * dest overlap in send instruction."
       *
       * Register allocation ensures that, so don't move r127 around to avoid
       * breaking that property.
       */
      if (v->devinfo->gen >= 8)
         constrained[p.atom_of_reg(127)] = true;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         /* Assume that anything referenced via fixed GRFs is baked into the
          * hardware's fixed-function logic and may be unsafe to move around.
          * Also take into account the source GRF restrictions of EOT
          * send-message instructions.
          */
         if (inst->dst.file == FIXED_GRF)
            constrained[p.atom_of_reg(reg_of(inst->dst))] = true;

         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == FIXED_GRF ||
                (is_grf(inst->src[i]) && inst->eot))
               constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
         }

         /* The location of the Gen7 MRF hack registers is hard-coded in the
          * rest of the compiler back-end.  Don't attempt to move them around.
          */
         if (v->devinfo->gen >= 7) {
            assert(inst->dst.file != MRF);

            for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
               const unsigned reg = GEN7_MRF_HACK_START + inst->base_mrf + i;
               constrained[p.atom_of_reg(reg)] = true;
            }
         }
      }

      return constrained;
   }

   /**
    * Return whether the hardware will be able to prevent a bank conflict by
    * optimizing out the read cycle of a source register.  The formula was
    * found experimentally.
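    *
    * For example, on Gen9+ a MAD whose second and third sources are the same
    * GRF only needs to read that register once, so no conflict can arise
    * regardless of the bank assignment.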
    */
   bool
   is_conflict_optimized_out(const gen_device_info *devinfo, const fs_inst *inst)
   {
      return devinfo->gen >= 9 &&
         ((is_grf(inst->src[0]) && (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
                                    reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
          reg_of(inst->src[1]) == reg_of(inst->src[2]));
   }

   /**
    * Return a matrix that allows reasonably efficient computation of the
    * cycle-count cost of bank conflicts incurred throughout the whole program
    * for any given atom-to-bank assignment.
    *
    * More precisely, if C_r_s_p is the result of this function, the total
    * cost of all bank conflicts involving any given atom r can be readily
    * recovered as follows:
    *
    *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
    *
    * where d_i_j is the Kronecker delta, and B_r indicates the bank
    * assignment of r.  \sa delta_conflicts() for a vectorized implementation
    * of the expression above.
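    *
    * For example, if the ternary instructions whose second and third sources
    * map to atoms r and s with equal parity account for a total of w
    * conflict cycles, then C_r_s_0 == w and that cost is incurred exactly
    * when both atoms are assigned the same bank (B_r == B_s).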
    *
    * FINISHME: Teach this about the Gen10+ bank conflict rules, which are
    *           somewhat more relaxed than on previous generations.  In the
    *           meantime optimizing based on Gen9 weights is likely to be more
    *           helpful than not optimizing at all.
    */
   weight_vector_type *
   shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
   {
      weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
      for (unsigned r = 0; r < p.num_atoms(); r++)
         conflicts[r] = weight_vector_type(2 * p.num_atoms());

      /* Crude approximation of the number of times the current basic block
       * will be executed at run-time.
       */
      unsigned block_scale = 1;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (inst->opcode == BRW_OPCODE_DO) {
            block_scale *= 10;

         } else if (inst->opcode == BRW_OPCODE_WHILE) {
            block_scale /= 10;

         } else if (inst->is_3src(v->devinfo) &&
                    is_grf(inst->src[1]) && is_grf(inst->src[2])) {
            const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
            const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));

            /* Estimate of the cycle-count cost of incurring a bank conflict
             * for this instruction.  This is only true on the average, for a
             * sequence of back-to-back ternary instructions, since the EU
             * front-end only seems to be able to issue a new instruction at
             * an even cycle.  The cost of a bank conflict incurred by an
             * isolated ternary instruction may be higher.
             */
            const unsigned exec_size = inst->dst.component_size(inst->exec_size);
            const unsigned cycle_scale = block_scale * DIV_ROUND_UP(exec_size,
                                                                    REG_SIZE);

            /* Neglect same-atom conflicts (since they're either trivial or
             * impossible to avoid without splitting the atom), and conflicts
             * known to be optimized out by the hardware.
             */
            if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
               /* Calculate the parity of the sources relative to the start of
                * their respective atoms.  If their parity is the same (and
                * none of the atoms straddle the 2KB mark), the instruction
                * will incur a conflict iff both atoms are assigned the same
                * bank b.  If their parity is opposite, the instruction will
                * incur a conflict iff they are assigned opposite banks (b and
                * b^1).
                */
               const bool p_r = 1 & (reg_of(inst->src[1]) - p.reg_of_atom(r));
               const bool p_s = 1 & (reg_of(inst->src[2]) - p.reg_of_atom(s));
               const unsigned p = p_r ^ p_s;

               /* Calculate the updated cost of a hypothetical conflict
                * between atoms r and s.  Note that the weight matrix is
                * symmetric with respect to indices r and s by construction.
                */
               const scalar_type w = MIN2(unsigned(max_scalar),
                                          get(conflicts[r], s, p) + cycle_scale);
               set(conflicts[r], s, p, w);
               set(conflicts[s], r, p, w);
            }
         }
      }

      return conflicts;
   }

   /**
    * Return the set of GRF atoms that could potentially lead to bank
    * conflicts if laid out unfavorably in the GRF space according to
    * the specified \p conflicts matrix (\sa
    * shader_conflict_weight_matrix()).
    */
   bool *
   have_any_conflicts(const partitioning &p,
                      const weight_vector_type *conflicts)
   {
      bool *any_conflicts = new bool[p.num_atoms()]();

      for (unsigned r = 0; r < p.num_atoms(); r++) {
         const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
         for (unsigned s = 0; s < m; s++)
            any_conflicts[r] |= sums(conflicts[r].v[s]);
      }

      return any_conflicts;
   }

   /**
    * Calculate the difference between two S(B) cost estimates as defined
    * above (\sa shader_conflict_weight_matrix()).  This represents the
    * (partial) cycle-count benefit from moving an atom r from bank p to n.
    * The respective bank assignments Bp and Bn are encoded as the \p
    * bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
    * according to the formula:
    *
    *  bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
    *
    * Notice the similarity with the delta function in the S(B) expression
    * above, and how bank_mask(B) can be precomputed for every possible
    * selection of r since bank_mask(B) only depends on it via B_r that may
    * only assume one of four different values, so the caller can keep every
    * possible bank_mask(B) vector in memory without much hassle (\sa
    * bank_characteristics()).
    */
   int
   delta_conflicts(const weight_vector_type &bank_mask_p,
                   const weight_vector_type &bank_mask_n,
                   const weight_vector_type &conflicts)
   {
      const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
      vector_type s_p = {}, s_n = {};

      for (unsigned r = 0; r < m; r++) {
         s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
         s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
      }

      return sums(subs(s_p, s_n));
   }

   /**
    * Register atom permutation, represented as the start GRF offset each atom
    * is mapped into.
    */
   struct permutation {
      permutation() : v(NULL), size(0) {}

      permutation(unsigned n) :
         v(new unsigned[n]()), size(n) {}

      permutation(const permutation &p) :
         v(new unsigned[p.size]), size(p.size)
      {
         memcpy(v, p.v, p.size * sizeof(unsigned));
      }

      ~permutation()
      {
         delete[] v;
      }

      permutation &
      operator=(permutation p)
      {
         SWAP(v, p.v);
         SWAP(size, p.size);
         return *this;
      }

      unsigned *v;
      unsigned size;
   };

   /**
    * Return an identity permutation of GRF atoms.
    */
   permutation
   identity_reg_permutation(const partitioning &p)
   {
      permutation map(p.num_atoms());

      for (unsigned r = 0; r < map.size; r++)
         map.v[r] = p.reg_of_atom(r);

      return map;
   }

   /**
    * Return the bank index of GRF address \p reg, numbered according to the
    * table:
    *        Even Odd
    *    Lo    0   1
    *    Hi    2   3
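    *
    * E.g. r0 maps to bank 0, r1 to bank 1 and r64 (the first GRF past the
    * 2KB boundary) to bank 2: bit 6 of the address selects the Lo/Hi half
    * and bit 0 selects the Even/Odd pair.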
    */
   unsigned
   bank_of(unsigned reg)
   {
      return (reg & 0x40) >> 5 | (reg & 1);
   }

   /**
    * Return bitmasks suitable for use as bank mask arguments for the
    * delta_conflicts() computation.  Note that this is just the (negative)
    * characteristic function of each bank, if you regard it as a set
    * containing all atoms assigned to it according to the \p map array.
    */
   weight_vector_type *
   bank_characteristics(const permutation &map)
   {
      weight_vector_type *banks = new weight_vector_type[4];

      for (unsigned b = 0; b < 4; b++) {
         banks[b] = weight_vector_type(2 * map.size);

         for (unsigned j = 0; j < map.size; j++) {
            for (unsigned p = 0; p < 2; p++)
               set(banks[b], j, p,
                   (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
         }
      }

      return banks;
   }

   /**
    * Return an improved permutation of GRF atoms based on \p map, greedily
    * attempting to reduce the total cycle-count cost of bank conflicts.
    *
    * Note that this doesn't attempt to merge multiple atoms into one (which
    * might allow a better result in some cases) -- it simply reorders
    * existing atoms in the GRF space without affecting their identity.
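    *
    * Concretely, for each unconstrained atom r the loop below looks for the
    * equally-sized unconstrained atom s currently assigned to a different
    * bank whose exchange with r yields the largest estimated cycle-count
    * benefit, and swaps their locations if such a candidate exists.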
    */
   permutation
   optimize_reg_permutation(const partitioning &p,
                            const bool *constrained,
                            const weight_vector_type *conflicts,
                            permutation map)
   {
      const bool *any_conflicts = have_any_conflicts(p, conflicts);
      weight_vector_type *banks = bank_characteristics(map);

      for (unsigned r = 0; r < map.size; r++) {
         const unsigned bank_r = bank_of(map.v[r]);

         if (!constrained[r]) {
            unsigned best_s = r;
            int best_benefit = 0;

            for (unsigned s = 0; s < map.size; s++) {
               const unsigned bank_s = bank_of(map.v[s]);

               if (bank_r != bank_s && !constrained[s] &&
                   p.size_of_atom(r) == p.size_of_atom(s) &&
                   (any_conflicts[r] || any_conflicts[s])) {
                  const int benefit =
                     delta_conflicts(banks[bank_r], banks[bank_s], conflicts[r]) +
                     delta_conflicts(banks[bank_s], banks[bank_r], conflicts[s]);

                  if (benefit > best_benefit) {
                     best_s = s;
                     best_benefit = benefit;
                  }
               }
            }

            if (best_s != r) {
               for (unsigned b = 0; b < 4; b++) {
                  for (unsigned p = 0; p < 2; p++)
                     swap(banks[b], r, p, best_s, p);
               }

               SWAP(map.v[r], map.v[best_s]);
            }
         }
      }

      delete[] banks;
      delete[] any_conflicts;
      return map;
   }

   /**
    * Apply the GRF atom permutation given by \p map to register \p r and
    * return the result.
    */
   fs_reg
   transform(const partitioning &p, const permutation &map, fs_reg r)
   {
      if (r.file == VGRF) {
         const unsigned reg = reg_of(r);
         const unsigned s = p.atom_of_reg(reg);
         r.nr = map.v[s] + reg - p.reg_of_atom(s);
         r.offset = r.offset % REG_SIZE;
      }

      return r;
   }
}

bool
fs_visitor::opt_bank_conflicts()
{
   assert(grf_used || !"Must be called after register allocation");

   /* No ternary instructions -- No bank conflicts. */
   if (devinfo->gen < 6)
      return false;

   const partitioning p = shader_reg_partitioning(this);
   const bool *constrained = shader_reg_constraints(this, p);
   const weight_vector_type *conflicts =
      shader_conflict_weight_matrix(this, p);
   const permutation map =
      optimize_reg_permutation(p, constrained, conflicts,
                               identity_reg_permutation(p));

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      inst->dst = transform(p, map, inst->dst);

      for (int i = 0; i < inst->sources; i++)
         inst->src[i] = transform(p, map, inst->src[i]);
   }

   delete[] conflicts;
   delete[] constrained;
   return true;
}

/**
 * Estimate the number of GRF bank conflict cycles incurred by an instruction.
 *
 * Note that this neglects conflict cycles prior to register allocation
 * because we don't know which bank each VGRF is going to end up aligned to.
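 *
 * For instance, a conflicting SIMD16 MAD with a 32-bit destination writes two
 * GRFs worth of data, so this returns an estimate of two penalty cycles.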
 */
unsigned
fs_visitor::bank_conflict_cycles(const fs_inst *inst) const
{
   if (grf_used && inst->is_3src(devinfo) &&
       is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
       bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
       !is_conflict_optimized_out(devinfo, inst)) {
      return DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
   } else {
      return 0;
   }
}