/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor
       * in basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }
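
      /*
       * An illustrative usage sketch, not part of the interface proper:
       * given a backend_shader "s" (a hypothetical local) with a SIMD16
       * dispatch width, a builder appending at the end of the program could
       * be used like this:
       *
       *    const fs_builder bld = fs_builder(s, 16).at_end();
       *    const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
       *    bld.ADD(tmp, src0, src1);
       *
       * Each emitted instruction inherits the builder's execution controls,
       * so no per-instruction SIMD state needs to be threaded through by
       * hand.
       */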

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used
       * for predication and control flow masking in multiples of \p n
       * channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         assert(force_writemask_all ||
                (n <= dispatch_width() && i < dispatch_width() / n));
         fs_builder bld = *this;
         bld._dispatch_width = n;
         bld._group += i * n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this
       * IR) and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            return brw_imm_d(0xffffffff);
         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
            return brw_flag_reg(0, 1);
         } else {
            assert(shader->devinfo->gen >= 6 && dispatch_width() <= 16);
            return retype(brw_vec1_grf((_group >= 16 ? 2 : 1), 7),
                          BRW_REGISTER_TYPE_UD);
         }
      }
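
      /*
       * A sketch of how group()/half() above are commonly combined with the
       * half() register helper to split a SIMD16 operation into two SIMD8
       * halves (illustrative; "dst" and "src" are hypothetical SIMD16
       * registers):
       *
       *    for (unsigned i = 0; i < 2; i++)
       *       bld.half(i).MOV(half(dst, i), half(src, i));
       *
       * The second half then executes in channel group 8-15 with the
       * corresponding predication and control flow masking.
       */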

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(src1)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(src1),
                                    fix_3src_operand(src2)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, src1, src2));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }
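
      /*
       * Because builders are small value types, execution controls are
       * usually adjusted by chaining temporaries, e.g. (a sketch; the
       * set_predicate() helper comes from the IR headers):
       *
       *    set_predicate(BRW_PREDICATE_NORMAL,
       *                  bld.group(8, 0).annotate("discard jump")
       *                     .emit(BRW_OPCODE_IF));
       */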

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                     fix_unsigned_negate(src1)));
      }

      /**
       * Copy any live channel from \p src to the first channel of the
       * result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagation to move the result all the way into the
          * consuming instruction (typically a surface index or sampler
          * index for a send).  This uses 1 or 3 extra hw registers in 16 or
          * 32 wide dispatch.  Once we teach const/copy propagation about
          * scalars we should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)->flag_subreg = 2;
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }
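
      /*
       * A typical emit_uniformize() consumer is a message with a
       * dynamically non-uniform surface index, e.g. (illustrative):
       *
       *    const fs_reg surface = bld.emit_uniformize(nonuniform_index);
       *
       * which can then be fed to a send as a scalar, with the whole
       * operation wrapped in a loop over the distinct index values.
       */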

      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               src_reg left_comp = component(left, half_width - 1);
               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
            }
            return;
         }

         if (cluster_size > 1) {
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            dst_reg left = horiz_stride(tmp, 2);
            dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);

            /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
             *
             *    "When source or destination datatype is 64b or operation is
             *     integer DWord multiply, regioning in Align1 must follow
             *     these rules:
             *
             *     [...]
             *
             *     3. Source and Destination offset must be the same, except
             *        the case of scalar source."
             *
             * In order to work around this, we create a temporary register
             * and shift left over to match right.  If we have a 64-bit type,
             * we have to use two integer MOVs instead of a 64-bit MOV.
             */
            if (need_matching_subreg_offset(opcode, tmp.type)) {
               dst_reg tmp2 = vgrf(tmp.type);
               dst_reg new_left = horiz_stride(horiz_offset(tmp2, 1), 2);
               if (type_sz(tmp.type) > 4) {
                  ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 0),
                           subscript(left, BRW_REGISTER_TYPE_D, 0));
                  ubld.MOV(subscript(new_left, BRW_REGISTER_TYPE_D, 1),
                           subscript(left, BRW_REGISTER_TYPE_D, 1));
               } else {
                  ubld.MOV(new_left, left);
               }
               left = new_left;
            }
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4 &&
                !need_matching_subreg_offset(opcode, tmp.type)) {
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);

               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               right = horiz_stride(horiz_offset(tmp, 3), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);

               for (unsigned i = 0; i < dispatch_width(); i += 4) {
                  src_reg left = component(tmp, i + 1);
                  dst_reg right = horiz_offset(tmp, i + 2);
                  set_condmod(mod, ubld.emit(opcode, right, left, right));
               }
            }
         }

         if (cluster_size > 4) {
            const fs_builder ubld = exec_all().group(4, 0);
            src_reg left = component(tmp, 3);
            dst_reg right = horiz_offset(tmp, 4);
            set_condmod(mod, ubld.emit(opcode, right, left, right));

            if (dispatch_width() > 8) {
               left = component(tmp, 8 + 3);
               right = horiz_offset(tmp, 8 + 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }
         }

         if (cluster_size > 8 && dispatch_width() > 8) {
            const fs_builder ubld = exec_all().group(8, 0);
            src_reg left = component(tmp, 7);
            dst_reg right = horiz_offset(tmp, 8);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }
      }
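
      /*
       * As an illustration of emit_scan() semantics: if tmp initially holds
       * one value per channel, then after
       *
       *    bld.emit_scan(BRW_OPCODE_SEL, tmp, 4, BRW_CONDITIONAL_GE);
       *
       * every channel holds the inclusive scan (here: the running maximum)
       * of its cluster of four channels, so channels 3, 7, 11, ... hold
       * their cluster's reduction result.
       */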

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0) const                 \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);                       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(src1, BRW_REGISTER_TYPE_F),
                                 src2));
      }
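
      /*
       * For example, a floating-point select equivalent to
       * "dst = (x != 0.0f) ? a : b" could be emitted as (illustrative):
       *
       *    bld.CSEL(dst, a, b, x, BRW_CONDITIONAL_NZ);
       */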

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0),
             * so we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }

      backend_shader *shader;

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
            return src;
         } else {
            dst_reg expanded = vgrf(src.type);
            MOV(expanded, src);
            return expanded;
         }
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use
          * IMM operands to math.
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }
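
      /*
       * For example, on gen6 a call such as (illustrative)
       *
       *    bld.emit(SHADER_OPCODE_POW, dst, x, brw_imm_f(2.0f))
       *
       * routes both sources through fix_math_operand(), which copies the
       * immediate into a fresh VGRF before the math instruction is created.
       */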

      /* From the Cherryview PRM Vol. 7, "Register Region Restrictions":
       *
       *    "When source or destination datatype is 64b or operation is
       *     integer DWord multiply, regioning in Align1 must follow
       *     these rules:
       *
       *     [...]
       *
       *     3. Source and Destination offset must be the same, except
       *        the case of scalar source."
       *
       * This helper just detects when we're in this case.
       */
      bool
      need_matching_subreg_offset(enum opcode opcode,
                                  enum brw_reg_type type) const
      {
         if (!shader->devinfo->is_cherryview &&
             !gen_device_info_is_9lp(shader->devinfo))
            return false;

         if (type_sz(type) > 4)
            return true;

         if (opcode == BRW_OPCODE_MUL &&
             !brw_reg_type_is_floating_point(type))
            return true;

         return false;
      }

      bblock_t *block;
      exec_node *cursor;

      unsigned _dispatch_width;
      unsigned _group;
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif