/*
 * Copyright © 2017 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_bank_conflicts.cpp
 *
 * This file contains a GRF bank conflict mitigation pass. The pass is
 * intended to be run after register allocation and works by rearranging the
 * layout of the GRF space (without altering the semantics of the program) in
 * a way that minimizes the number of GRF bank conflicts incurred by ternary
 * instructions.
 *
 * Unfortunately there is close to no information about bank conflicts in the
 * hardware spec, but experimentally on Gen7-Gen9 ternary instructions seem
 * to incur an average bank conflict penalty of one cycle per SIMD8 op
 * whenever the second and third source are stored in the same GRF bank (\sa
 * bank_of() for the exact bank layout), which cannot be fetched during the
 * same cycle by the EU, unless the EU logic manages to optimize out the read
 * cycle of a duplicate source register (\sa is_conflict_optimized_out()).
 *
 * The asymptotic run-time of the algorithm is dominated by the
 * shader_conflict_weight_matrix() computation below, which is O(n) on the
 * number of instructions in the program. However, for small and medium-sized
 * programs the run-time is likely to be dominated by
 * optimize_reg_permutation(), which is O(m^3) on the number of GRF atoms of
 * the program (\sa partitioning), a number which is bounded (since the
 * program uses a bounded number of registers post-regalloc) and of the order
 * of 100. For that reason optimize_reg_permutation() is vectorized in order
 * to keep the cubic term within reasonable bounds for m close to its
 * theoretical maximum.
 */

#include "brw_fs.h"
#include "brw_cfg.h"

#ifdef __SSE2__

#include <emmintrin.h>

/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on pairs of independent SSE2 integer vectors
 * à la SIMD16 for somewhat improved throughput. SSE2 is supported by
 * virtually all platforms that care about bank conflicts, so this path
 * should almost always be available in practice.
 */
namespace {
   /**
    * SIMD integer vector data type.
    */
   struct vector_type {
      __m128i v[2];
   };
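
   /* Note that with two 128-bit vectors of int16_t components the
    * vector_width constant defined below works out to 2 * 16 / 2 == 16
    * lanes, which is where the "à la SIMD16" analogy in the comment above
    * comes from.
    */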

   /**
    * Scalar data type matching the representation of a single component of
    * \p vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      scalar_type x;
      memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
      return x;
   }

   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_adds_epi16(v.v[0], w.v[0]),
         _mm_adds_epi16(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_subs_epi16(v.v[0], w.v[0]),
         _mm_subs_epi16(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(const vector_type &v, const vector_type &w)
   {
      const vector_type u = {{
         _mm_and_si128(v.v[0], w.v[0]),
         _mm_and_si128(v.v[1], w.v[1])
      }};
      return u;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(const vector_type &v)
   {
      /* Fold the two halves of the vector together, then repeatedly add
       * each half of the result to the other half: 0x4e swaps the 64-bit
       * halves of each __m128i, 0xb1 swaps adjacent 32-bit words, and the
       * final shufflelo swaps adjacent 16-bit components of the low half,
       * leaving the saturated total in component 0.
       */
      const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
      const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
      const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
      const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
      return _mm_extract_epi16(v1, 0);
   }
}

#else

/**
 * Thin layer around vector intrinsics so they can be easily replaced with
 * e.g. the fall-back scalar path, an implementation with different vector
 * width or using different SIMD architectures (AVX-512?!).
 *
 * This implementation operates on scalar values and doesn't rely on
 * any vector extensions. This is mainly intended for debugging and
 * to keep this file building on exotic platforms.
 */
namespace {
   /**
    * SIMD integer vector data type.
    */
   typedef int16_t vector_type;

   /**
    * Scalar data type matching the representation of a single component of
    * \p vector_type.
    */
   typedef int16_t scalar_type;

   /**
    * Maximum integer value representable as a \p scalar_type.
    */
   const scalar_type max_scalar = INT16_MAX;

   /**
    * Number of components of a \p vector_type.
    */
   const unsigned vector_width = 1;

   /**
    * Set the i-th component of vector \p v to \p x.
    */
   void
   set(vector_type &v, unsigned i, scalar_type x)
   {
      assert(i < vector_width);
      v = x;
   }

   /**
    * Get the i-th component of vector \p v.
    */
   scalar_type
   get(const vector_type &v, unsigned i)
   {
      assert(i < vector_width);
      return v;
   }
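
   /* For illustration, the saturating semantics of adds() and subs() below
    * clamp instead of wrapping around: e.g. adds(30000, 10000) yields 32767
    * (INT16_MAX) rather than overflowing, matching the behavior of the
    * _mm_adds_epi16()-based SSE2 path above and keeping accumulated
    * conflict weights from wrapping around in large programs.
    */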

   /**
    * Add two vectors with saturation.
    */
   vector_type
   adds(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
   }

   /**
    * Subtract two vectors with saturation.
    */
   vector_type
   subs(vector_type v, vector_type w)
   {
      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
   }

   /**
    * Compute the bitwise conjunction of two vectors.
    */
   vector_type
   mask(vector_type v, vector_type w)
   {
      return v & w;
   }

   /**
    * Reduce the components of a vector using saturating addition.
    */
   scalar_type
   sums(vector_type v)
   {
      return v;
   }
}

#endif

/**
 * Swap \p x and \p y.
 */
#define SWAP(x, y) do {                          \
      __typeof(y) _swap_tmp = y;                 \
      y = x;                                     \
      x = _swap_tmp;                             \
   } while (0)

namespace {
   /**
    * Variable-length vector type intended to represent cycle-count costs
    * for arbitrary atom-to-bank assignments. It's indexed by a pair of
    * integers (i, p), where i is an atom index and p in {0, 1} indicates
    * the parity of the conflict (respectively, whether the cost is incurred
    * whenever the atoms are assigned the same bank b or opposite-parity
    * banks b and b^1).
    * \sa shader_conflict_weight_matrix()
    */
   struct weight_vector_type {
      weight_vector_type() : v(NULL), size(0) {}

      weight_vector_type(unsigned n) : v(alloc(n)), size(n) {}

      weight_vector_type(const weight_vector_type &u) :
         v(alloc(u.size)), size(u.size)
      {
         memcpy(v, u.v,
                DIV_ROUND_UP(u.size, vector_width) * sizeof(vector_type));
      }

      ~weight_vector_type()
      {
         free(v);
      }

      weight_vector_type &
      operator=(weight_vector_type u)
      {
         SWAP(v, u.v);
         SWAP(size, u.size);
         return *this;
      }

      vector_type *v;
      unsigned size;

   private:
      static vector_type *
      alloc(unsigned n)
      {
         const unsigned align = MAX2(sizeof(void *),
                                     __alignof__(vector_type));
         const unsigned size =
            DIV_ROUND_UP(n, vector_width) * sizeof(vector_type);
         void *p;
         if (posix_memalign(&p, align, size))
            return NULL;
         memset(p, 0, size);
         return reinterpret_cast<vector_type *>(p);
      }
   };

   /**
    * Set the (i, p)-th component of weight vector \p v to \p x.
    */
   void
   set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
   {
      set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
   }

   /**
    * Get the (i, p)-th component of weight vector \p v.
    */
   scalar_type
   get(const weight_vector_type &v, unsigned i, unsigned p)
   {
      return get(v.v[(2 * i + p) / vector_width],
                 (2 * i + p) % vector_width);
   }
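
   /* A minimal worked example of the flat layout used by the accessors
    * above: component (i, p) lives at flat index 2 * i + p, so with the
    * SSE2 path's vector_width of 16 the component (10, 1) maps to flat
    * index 21, i.e. lane 21 % 16 == 5 of vector element v.v[21 / 16 == 1].
    */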

   /**
    * Swap the (i, p)-th and (j, q)-th components of weight vector \p v.
    */
   void
   swap(weight_vector_type &v,
        unsigned i, unsigned p,
        unsigned j, unsigned q)
   {
      const scalar_type tmp = get(v, i, p);
      set(v, i, p, get(v, j, q));
      set(v, j, q, tmp);
   }
}

namespace {
   /**
    * Object that represents the partitioning of an arbitrary register
    * space into indivisible units (referred to as atoms below) that can
    * potentially be rearranged independently from other registers. The
    * partitioning is inferred from a number of contiguity requirements
    * specified using require_contiguous(). This allows efficient look-up
    * of the atom index a given register address belongs to, or conversely
    * the range of register addresses that belong to a given atom.
    */
   struct partitioning {
      /**
       * Create a (for the moment unrestricted) partitioning of a register
       * file of size \p n. The units are arbitrary.
       */
      partitioning(unsigned n) :
         max_reg(n),
         offsets(new unsigned[n + num_terminator_atoms]),
         atoms(new unsigned[n + num_terminator_atoms])
      {
         for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
            offsets[i] = i;
            atoms[i] = i;
         }
      }

      partitioning(const partitioning &p) :
         max_reg(p.max_reg),
         offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
         atoms(new unsigned[p.max_reg + num_terminator_atoms])
      {
         memcpy(offsets, p.offsets,
                sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
         memcpy(atoms, p.atoms,
                sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
      }

      ~partitioning()
      {
         delete[] offsets;
         delete[] atoms;
      }

      partitioning &
      operator=(partitioning p)
      {
         SWAP(max_reg, p.max_reg);
         SWAP(offsets, p.offsets);
         SWAP(atoms, p.atoms);
         return *this;
      }

      /**
       * Require register range [reg, reg + n[ to be considered part of
       * the same atom.
       */
      void
      require_contiguous(unsigned reg, unsigned n)
      {
         unsigned r = atoms[reg];

         /* Renumber atoms[reg...] = { r... } and their offsets[r...] for
          * the case that the specified contiguity requirement leads to the
          * fusion (yay) of one or more existing atoms.
          */
         for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
            if (offsets[atoms[reg1]] < reg + n) {
               atoms[reg1] = r;
            } else {
               if (offsets[atoms[reg1 - 1]] != offsets[atoms[reg1]])
                  r++;

               offsets[r] = offsets[atoms[reg1]];
               atoms[reg1] = r;
            }
         }
      }
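
      /* As a minimal worked example, consider a freshly constructed
       * partitioning(4), where every register starts out as its own atom:
       * { 0 } { 1 } { 2 } { 3 }. Calling require_contiguous(1, 2) fuses
       * the atoms of registers 1 and 2 into a single one, giving the
       * partitioning { 0 } { 1 2 } { 3 }, after which atom_of_reg(2)
       * returns the same index as atom_of_reg(1) and size_of_atom() of
       * that atom returns 2.
       */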

      /**
       * Get the atom index register address \p reg belongs to.
       */
      unsigned
      atom_of_reg(unsigned reg) const
      {
         return atoms[reg];
      }

      /**
       * Get the base register address that belongs to atom \p r.
       */
      unsigned
      reg_of_atom(unsigned r) const
      {
         return offsets[r];
      }

      /**
       * Get the size of atom \p r in register address units.
       */
      unsigned
      size_of_atom(unsigned r) const
      {
         assert(r < num_atoms());
         return reg_of_atom(r + 1) - reg_of_atom(r);
      }

      /**
       * Get the number of atoms the whole register space is partitioned
       * into.
       */
      unsigned
      num_atoms() const
      {
         return atoms[max_reg];
      }

   private:
      /**
       * Number of trailing atoms inserted for convenience so among other
       * things we don't need to special-case the last element in
       * size_of_atom().
       */
      static const unsigned num_terminator_atoms = 1;
      unsigned max_reg;
      unsigned *offsets;
      unsigned *atoms;
   };

   /**
    * Only GRF sources (whether they have been register-allocated or not)
    * can possibly incur bank conflicts.
    */
   bool
   is_grf(const fs_reg &r)
   {
      return r.file == VGRF || r.file == FIXED_GRF;
   }

   /**
    * Register offset of \p r in GRF units. Useful because the
    * representation of GRFs post-register allocation is somewhat
    * inconsistent and depends on whether the register already had a fixed
    * GRF offset prior to register allocation or whether it was part of a
    * VGRF allocation.
    */
   unsigned
   reg_of(const fs_reg &r)
   {
      assert(is_grf(r));
      if (r.file == VGRF)
         return r.nr + r.offset / REG_SIZE;
      else
         return reg_offset(r) / REG_SIZE;
   }
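
   /* For illustration: with the 32-byte REG_SIZE of these parts, a VGRF
    * with nr == 10 and a byte offset of 96 is normalized by reg_of() to
    * GRF address 10 + 96 / 32 == 13, putting both representations on the
    * same footing for the bank and atom calculations below.
    */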

   /**
    * Calculate the finest partitioning of the GRF space compatible with
    * the register contiguity requirements derived from all instructions
    * part of the program.
    */
   partitioning
   shader_reg_partitioning(const fs_visitor *v)
   {
      partitioning p(BRW_MAX_GRF);

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (is_grf(inst->dst))
            p.require_contiguous(reg_of(inst->dst), regs_written(inst));

         for (int i = 0; i < inst->sources; i++) {
            if (is_grf(inst->src[i]))
               p.require_contiguous(reg_of(inst->src[i]),
                                    regs_read(inst, i));
         }
      }

      return p;
   }

   /**
    * Return the set of GRF atoms that should be left untouched at their
    * original location to avoid violating hardware or software
    * assumptions.
    */
   bool *
   shader_reg_constraints(const fs_visitor *v, const partitioning &p)
   {
      bool *constrained = new bool[p.num_atoms()]();

      /* These are read implicitly by some send-message instructions
       * without any indication at the IR level. Assume they are unsafe to
       * move around.
       */
      for (unsigned reg = 0; reg < 2; reg++)
         constrained[p.atom_of_reg(reg)] = true;

      /* Per the Intel Broadwell PRM, vol 07, section "Instruction Set
       * Reference", subsection "EUISA Instructions", Send Message
       * (page 990):
       *
       * "r127 must not be used for return address when there is a src and
       * dest overlap in send instruction."
       *
       * Register allocation ensures that, so don't move r127 around to
       * avoid breaking that property.
       */
      if (v->devinfo->gen >= 8)
         constrained[p.atom_of_reg(127)] = true;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         /* Assume that anything referenced via fixed GRFs is baked into
          * the hardware's fixed-function logic and may be unsafe to move
          * around. Also take into account the source GRF restrictions of
          * EOT send-message instructions.
          */
         if (inst->dst.file == FIXED_GRF)
            constrained[p.atom_of_reg(reg_of(inst->dst))] = true;

         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == FIXED_GRF ||
                (is_grf(inst->src[i]) && inst->eot))
               constrained[p.atom_of_reg(reg_of(inst->src[i]))] = true;
         }

         /* The location of the Gen7 MRF hack registers is hard-coded in
          * the rest of the compiler back-end. Don't attempt to move them
          * around.
          */
         if (v->devinfo->gen >= 7) {
            assert(inst->dst.file != MRF);

            for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
               const unsigned reg = GEN7_MRF_HACK_START +
                                    inst->base_mrf + i;
               constrained[p.atom_of_reg(reg)] = true;
            }
         }
      }

      return constrained;
   }

   /**
    * Return whether the hardware will be able to prevent a bank conflict
    * by optimizing out the read cycle of a source register. The formula
    * was found experimentally.
    */
   bool
   is_conflict_optimized_out(const gen_device_info *devinfo,
                             const fs_inst *inst)
   {
      return devinfo->gen >= 9 &&
             ((is_grf(inst->src[0]) &&
               (reg_of(inst->src[0]) == reg_of(inst->src[1]) ||
                reg_of(inst->src[0]) == reg_of(inst->src[2]))) ||
              reg_of(inst->src[1]) == reg_of(inst->src[2]));
   }
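
   /* E.g. a Gen9 ternary instruction whose second and third sources both
    * read r10 would nominally conflict with itself, but since the two
    * reads are of the same register the EU can service them with a single
    * read cycle, so the formula above treats the conflict as optimized
    * out.
    */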

   /**
    * Return a matrix that allows reasonably efficient computation of the
    * cycle-count cost of bank conflicts incurred throughout the whole
    * program for any given atom-to-bank assignment.
    *
    * More precisely, if C_r_s_p is the result of this function, the total
    * cost of all bank conflicts involving any given atom r can be readily
    * recovered as follows:
    *
    *  S(B) = Sum_s_p(d_(p^B_r)_(B_s) * C_r_s_p)
    *
    * where d_i_j is the Kronecker delta, and B_r indicates the bank
    * assignment of r. \sa delta_conflicts() for a vectorized
    * implementation of the expression above.
    *
    * FINISHME: Teach this about the Gen10+ bank conflict rules, which are
    *           somewhat more relaxed than on previous generations. In the
    *           meantime optimizing based on Gen9 weights is likely to be
    *           more helpful than not optimizing at all.
    */
   weight_vector_type *
   shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
   {
      weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
      for (unsigned r = 0; r < p.num_atoms(); r++)
         conflicts[r] = weight_vector_type(2 * p.num_atoms());

      /* Crude approximation of the number of times the current basic
       * block will be executed at run-time.
       */
      unsigned block_scale = 1;

      foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
         if (inst->opcode == BRW_OPCODE_DO) {
            block_scale *= 10;

         } else if (inst->opcode == BRW_OPCODE_WHILE) {
            block_scale /= 10;

         } else if (inst->is_3src(v->devinfo) &&
                    is_grf(inst->src[1]) && is_grf(inst->src[2])) {
            const unsigned r = p.atom_of_reg(reg_of(inst->src[1]));
            const unsigned s = p.atom_of_reg(reg_of(inst->src[2]));

            /* Estimate of the cycle-count cost of incurring a bank
             * conflict for this instruction. This is only true on the
             * average, for a sequence of back-to-back ternary
             * instructions, since the EU front-end only seems to be able
             * to issue a new instruction at an even cycle. The cost of a
             * bank conflict incurred by an isolated ternary instruction
             * may be higher.
             */
            const unsigned exec_size =
               inst->dst.component_size(inst->exec_size);
            const unsigned cycle_scale =
               block_scale * DIV_ROUND_UP(exec_size, REG_SIZE);

            /* Neglect same-atom conflicts (since they're either trivial
             * or impossible to avoid without splitting the atom), and
             * conflicts known to be optimized out by the hardware.
             */
            if (r != s && !is_conflict_optimized_out(v->devinfo, inst)) {
               /* Calculate the parity of the sources relative to the
                * start of their respective atoms. If their parity is the
                * same (and none of the atoms straddle the 2KB mark), the
                * instruction will incur a conflict iff both atoms are
                * assigned the same bank b. If their parity is opposite,
                * the instruction will incur a conflict iff they are
                * assigned opposite banks (b and b^1).
                */
               const bool p_r = 1 & (reg_of(inst->src[1]) -
                                     p.reg_of_atom(r));
               const bool p_s = 1 & (reg_of(inst->src[2]) -
                                     p.reg_of_atom(s));
               const unsigned p = p_r ^ p_s;

               /* Calculate the updated cost of a hypothetical conflict
                * between atoms r and s. Note that the weight matrix is
                * symmetric with respect to indices r and s by
                * construction.
                */
               const scalar_type w =
                  MIN2(unsigned(max_scalar),
                       get(conflicts[r], s, p) + cycle_scale);
               set(conflicts[r], s, p, w);
               set(conflicts[s], r, p, w);
            }
         }
      }

      return conflicts;
   }
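
   /* To make the parity bookkeeping above concrete: if src1 reads r11 from
    * an atom starting at r10 (parity 1) and src2 reads r12 from an atom
    * starting at r12 (parity 0), then p = 1 ^ 0 = 1 and the instruction is
    * recorded as conflicting whenever the two atoms end up in
    * opposite-parity banks, since relocating the atoms preserves the
    * sources' offsets within them.
    */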

   /**
    * Return the set of GRF atoms that could potentially lead to bank
    * conflicts if laid out unfavorably in the GRF space according to the
    * specified \p conflicts matrix (\sa shader_conflict_weight_matrix()).
    */
   bool *
   have_any_conflicts(const partitioning &p,
                      const weight_vector_type *conflicts)
   {
      bool *any_conflicts = new bool[p.num_atoms()]();

      for (unsigned r = 0; r < p.num_atoms(); r++) {
         const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
         for (unsigned s = 0; s < m; s++)
            any_conflicts[r] |= sums(conflicts[r].v[s]);
      }

      return any_conflicts;
   }

   /**
    * Calculate the difference between two S(B) cost estimates as defined
    * above (\sa shader_conflict_weight_matrix()). This represents the
    * (partial) cycle-count benefit from moving an atom r from bank p to n.
    * The respective bank assignments Bp and Bn are encoded as the \p
    * bank_mask_p and \p bank_mask_n bitmasks for efficient computation,
    * according to the formula:
    *
    *  bank_mask(B)_s_p = -d_(p^B_r)_(B_s)
    *
    * Notice the similarity with the delta function in the S(B) expression
    * above, and how bank_mask(B) can be precomputed for every possible
    * selection of r, since bank_mask(B) only depends on r via B_r, which
    * may only assume one of four different values, so the caller can keep
    * every possible bank_mask(B) vector in memory without much hassle
    * (\sa bank_characteristics()).
    */
   int
   delta_conflicts(const weight_vector_type &bank_mask_p,
                   const weight_vector_type &bank_mask_n,
                   const weight_vector_type &conflicts)
   {
      const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
      vector_type s_p = {}, s_n = {};

      for (unsigned r = 0; r < m; r++) {
         s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
         s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
      }

      return sums(subs(s_p, s_n));
   }

   /**
    * Register atom permutation, represented as the start GRF offset each
    * atom is mapped into.
    */
   struct permutation {
      permutation() : v(NULL), size(0) {}

      permutation(unsigned n) :
         v(new unsigned[n]()), size(n) {}

      permutation(const permutation &p) :
         v(new unsigned[p.size]), size(p.size)
      {
         memcpy(v, p.v, p.size * sizeof(unsigned));
      }

      ~permutation()
      {
         delete[] v;
      }

      permutation &
      operator=(permutation p)
      {
         SWAP(v, p.v);
         SWAP(size, p.size);
         return *this;
      }

      unsigned *v;
      unsigned size;
   };

   /**
    * Return an identity permutation of GRF atoms.
    */
   permutation
   identity_reg_permutation(const partitioning &p)
   {
      permutation map(p.num_atoms());

      for (unsigned r = 0; r < map.size; r++)
         map.v[r] = p.reg_of_atom(r);

      return map;
   }

   /**
    * Return the bank index of GRF address \p reg, numbered according to
    * the table:
    *          Even     Odd
    *    Lo    0        1
    *    Hi    2        3
    */
   unsigned
   bank_of(unsigned reg)
   {
      return (reg & 0x40) >> 5 | (reg & 1);
   }
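
   /* A few sample values of the mapping above: bank_of(10) == 0 (even
    * address in the low 2KB half of the file), bank_of(13) == 1 (odd, low
    * half) and bank_of(0x47) == 3 (odd, high half). Note in particular
    * that the conflicting case called out in the file header -- second and
    * third sources in the same bank -- would arise e.g. with sources in
    * r10 and r12, both of which map to bank 0.
    */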

   /**
    * Return bitmasks suitable for use as bank mask arguments for the
    * delta_conflicts() computation. Note that this is just the (negative)
    * characteristic function of each bank, if you regard it as a set
    * containing all atoms assigned to it according to the \p map array.
    */
   weight_vector_type *
   bank_characteristics(const permutation &map)
   {
      weight_vector_type *banks = new weight_vector_type[4];

      for (unsigned b = 0; b < 4; b++) {
         banks[b] = weight_vector_type(2 * map.size);

         for (unsigned j = 0; j < map.size; j++) {
            for (unsigned p = 0; p < 2; p++)
               set(banks[b], j, p,
                   (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
         }
      }

      return banks;
   }

   /**
    * Return an improved permutation of GRF atoms based on \p map
    * attempting to reduce the total cycle-count cost of bank conflicts
    * greedily.
    *
    * Note that this doesn't attempt to merge multiple atoms into one,
    * which may allow it to do a better job in some cases -- it simply
    * reorders existing atoms in the GRF space without affecting their
    * identity.
    */
   permutation
   optimize_reg_permutation(const partitioning &p,
                            const bool *constrained,
                            const weight_vector_type *conflicts,
                            permutation map)
   {
      const bool *any_conflicts = have_any_conflicts(p, conflicts);
      weight_vector_type *banks = bank_characteristics(map);

      for (unsigned r = 0; r < map.size; r++) {
         const unsigned bank_r = bank_of(map.v[r]);

         if (!constrained[r]) {
            unsigned best_s = r;
            int best_benefit = 0;

            for (unsigned s = 0; s < map.size; s++) {
               const unsigned bank_s = bank_of(map.v[s]);

               if (bank_r != bank_s && !constrained[s] &&
                   p.size_of_atom(r) == p.size_of_atom(s) &&
                   (any_conflicts[r] || any_conflicts[s])) {
                  const int benefit =
                     delta_conflicts(banks[bank_r], banks[bank_s],
                                     conflicts[r]) +
                     delta_conflicts(banks[bank_s], banks[bank_r],
                                     conflicts[s]);

                  if (benefit > best_benefit) {
                     best_s = s;
                     best_benefit = benefit;
                  }
               }
            }

            if (best_s != r) {
               for (unsigned b = 0; b < 4; b++) {
                  for (unsigned p = 0; p < 2; p++)
                     swap(banks[b], r, p, best_s, p);
               }

               SWAP(map.v[r], map.v[best_s]);
            }
         }
      }

      delete[] banks;
      delete[] any_conflicts;
      return map;
   }

   /**
    * Apply the GRF atom permutation given by \p map to register \p r and
    * return the result.
    */
   fs_reg
   transform(const partitioning &p, const permutation &map, fs_reg r)
   {
      if (r.file == VGRF) {
         const unsigned reg = reg_of(r);
         const unsigned s = p.atom_of_reg(reg);
         r.nr = map.v[s] + reg - p.reg_of_atom(s);
         r.offset = r.offset % REG_SIZE;
      }

      return r;
   }
}

bool
fs_visitor::opt_bank_conflicts()
{
   assert(grf_used || !"Must be called after register allocation");

   /* No ternary instructions -- No bank conflicts. */
   if (devinfo->gen < 6)
      return false;

   const partitioning p = shader_reg_partitioning(this);
   const bool *constrained = shader_reg_constraints(this, p);
   const weight_vector_type *conflicts =
      shader_conflict_weight_matrix(this, p);
   const permutation map =
      optimize_reg_permutation(p, constrained, conflicts,
                               identity_reg_permutation(p));

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      inst->dst = transform(p, map, inst->dst);

      for (int i = 0; i < inst->sources; i++)
         inst->src[i] = transform(p, map, inst->src[i]);
   }

   delete[] conflicts;
   delete[] constrained;
   return true;
}

/**
 * Estimate the number of GRF bank conflict cycles incurred by an
 * instruction.
 *
 * Note that this neglects conflict cycles prior to register allocation
 * because we don't know which bank each VGRF is going to end up aligned
 * to.
 */
unsigned
fs_visitor::bank_conflict_cycles(const fs_inst *inst) const
{
   if (grf_used && inst->is_3src(devinfo) &&
       is_grf(inst->src[1]) && is_grf(inst->src[2]) &&
       bank_of(reg_of(inst->src[1])) == bank_of(reg_of(inst->src[2])) &&
       !is_conflict_optimized_out(devinfo, inst)) {
      return DIV_ROUND_UP(inst->dst.component_size(inst->exec_size),
                          REG_SIZE);
   } else {
      return 0;
   }
}
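
/* As a worked example of the estimate above: assuming a packed (stride-1)
 * float destination, a SIMD16 ternary instruction writes 16 * 4 == 64
 * bytes, so a conflict between its second and third sources is charged
 * DIV_ROUND_UP(64, REG_SIZE) == 2 cycles with the 32-byte GRFs of these
 * parts.
 */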