/* -*- c++ -*- */
/*
 * Copyright © 2010-2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_FS_BUILDER_H
#define BRW_FS_BUILDER_H

#include "brw_ir_fs.h"
#include "brw_shader.h"

namespace brw {
   /**
    * Toolbox to assemble an FS IR program out of individual instructions.
    *
    * This object is meant to have an interface consistent with
    * brw::vec4_builder.  They cannot be fully interchangeable because
    * brw::fs_builder generates scalar code while brw::vec4_builder generates
    * vector code.
    *
    * Builders are cheap value objects: the modifier methods (at(), group(),
    * exec_all(), annotate(), ...) return a copy with the requested code
    * generation parameters changed, leaving this builder untouched.
    */
   class fs_builder {
   public:
      /** Type used in this IR to represent a source of an instruction. */
      typedef fs_reg src_reg;

      /** Type used in this IR to represent the destination of an instruction. */
      typedef fs_reg dst_reg;

      /** Type used in this IR to represent an instruction. */
      typedef fs_inst instruction;

      /**
       * Construct an fs_builder that inserts instructions into \p shader.
       * \p dispatch_width gives the native execution width of the program.
       *
       * The cursor starts out NULL, so instructions are appended at the end
       * of the shader's instruction list (see emit(instruction *)).
       */
      fs_builder(backend_shader *shader,
                 unsigned dispatch_width) :
         shader(shader), block(NULL), cursor(NULL),
         _dispatch_width(dispatch_width),
         _group(0),
         force_writemask_all(false),
         annotation()
      {
      }

      /**
       * Construct an fs_builder that inserts instructions into \p shader
       * before instruction \p inst in basic block \p block.  The default
       * execution controls and debug annotation are initialized from the
       * instruction passed as argument.
       */
      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
         shader(shader), block(block), cursor(inst),
         _dispatch_width(inst->exec_size),
         _group(inst->group),
         force_writemask_all(inst->force_writemask_all)
      {
         annotation.str = inst->annotation;
         annotation.ir = inst->ir;
      }

      /**
       * Construct an fs_builder that inserts instructions before \p cursor in
       * basic block \p block, inheriting other code generation parameters
       * from this.
       */
      fs_builder
      at(bblock_t *block, exec_node *cursor) const
      {
         fs_builder bld = *this;
         bld.block = block;
         bld.cursor = cursor;
         return bld;
      }

      /**
       * Construct an fs_builder appending instructions at the end of the
       * instruction list of the shader, inheriting other code generation
       * parameters from this.
       */
      fs_builder
      at_end() const
      {
         /* A NULL block with the tail sentinel as cursor makes
          * emit(instruction *) append via exec_node::insert_before().
          */
         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
      }

      /**
       * Construct a builder specifying the default SIMD width and group of
       * channel enable signals, inheriting other code generation parameters
       * from this.
       *
       * \p n gives the default SIMD width, \p i gives the slot group used for
       * predication and control flow masking in multiples of \p n channels.
       */
      fs_builder
      group(unsigned n, unsigned i) const
      {
         fs_builder bld = *this;

         if (n <= dispatch_width() && i < dispatch_width() / n) {
            bld._group += i * n;
         } else {
            /* The requested channel group isn't a subset of the channel group
             * of this builder, which means that the resulting instructions
             * would use (potentially undefined) channel enable signals not
             * specified by the parent builder.  That's only valid if the
             * instruction doesn't have per-channel semantics, in which case
             * we should clear off the default group index in order to prevent
             * emitting instructions with channel group not aligned to their
             * own execution size.
             */
            assert(force_writemask_all);
            bld._group = 0;
         }

         bld._dispatch_width = n;
         return bld;
      }

      /**
       * Alias for group() with width equal to eight.
       */
      fs_builder
      half(unsigned i) const
      {
         return group(8, i);
      }

      /**
       * Construct a builder with per-channel control flow execution masking
       * disabled if \p b is true.  If control flow execution masking is
       * already disabled this has no effect.
       */
      fs_builder
      exec_all(bool b = true) const
      {
         fs_builder bld = *this;
         if (b)
            bld.force_writemask_all = true;
         return bld;
      }

      /**
       * Construct a builder with the given debug annotation info.
       */
      fs_builder
      annotate(const char *str, const void *ir = NULL) const
      {
         fs_builder bld = *this;
         bld.annotation.str = str;
         bld.annotation.ir = ir;
         return bld;
      }

      /**
       * Get the SIMD width in use.
       */
      unsigned
      dispatch_width() const
      {
         return _dispatch_width;
      }

      /**
       * Get the channel group in use.
       */
      unsigned
      group() const
      {
         return _group;
      }

      /**
       * Allocate a virtual register of natural vector size (one for this IR)
       * and SIMD width.  \p n gives the amount of space to allocate in
       * dispatch_width units (which is just enough space for one logical
       * component in this IR).
       *
       * For \p n equal to zero this degenerates to a null register of the
       * requested type.
       */
      dst_reg
      vgrf(enum brw_reg_type type, unsigned n = 1) const
      {
         assert(dispatch_width() <= 32);

         if (n > 0)
            return dst_reg(VGRF, shader->alloc.allocate(
                              DIV_ROUND_UP(n * type_sz(type) * dispatch_width(),
                                           REG_SIZE)),
                           type);
         else
            return retype(null_reg_ud(), type);
      }

      /**
       * Create a null register of floating type.
       */
      dst_reg
      null_reg_f() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
      }

      /**
       * Create a null register of double float type.
       */
      dst_reg
      null_reg_df() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
      }

      /**
       * Create a null register of signed integer type.
       */
      dst_reg
      null_reg_d() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      }

      /**
       * Create a null register of unsigned integer type.
       */
      dst_reg
      null_reg_ud() const
      {
         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
      }

      /**
       * Get the mask of SIMD channels enabled by dispatch and not yet
       * disabled by discard.
       */
      src_reg
      sample_mask_reg() const
      {
         if (shader->stage != MESA_SHADER_FRAGMENT) {
            /* Outside the fragment stage there is no discard: all channels
             * are considered enabled.
             */
            return brw_imm_d(0xffffffff);
         } else if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill) {
            return brw_flag_reg(0, 1);
         } else {
            /* NOTE(review): reads the dispatch mask from the thread payload
             * (g1.7 or g2.7 depending on the channel group) — confirm the
             * payload layout against the dispatch setup code.
             */
            assert(shader->devinfo->gen >= 6 && dispatch_width() <= 16);
            return retype(brw_vec1_grf((_group >= 16 ? 2 : 1), 7),
                          BRW_REGISTER_TYPE_UD);
         }
      }

      /**
       * Insert an instruction into the program.
       */
      instruction *
      emit(const instruction &inst) const
      {
         return emit(new(shader->mem_ctx) instruction(inst));
      }

      /**
       * Create and insert a nullary control instruction into the program.
       */
      instruction *
      emit(enum opcode opcode) const
      {
         return emit(instruction(opcode, dispatch_width()));
      }

      /**
       * Create and insert a nullary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst) const
      {
         return emit(instruction(opcode, dispatch_width(), dst));
      }

      /**
       * Create and insert a unary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
      {
         switch (opcode) {
         /* Math instructions have extra operand restrictions, see
          * fix_math_operand().
          */
         case SHADER_OPCODE_RCP:
         case SHADER_OPCODE_RSQ:
         case SHADER_OPCODE_SQRT:
         case SHADER_OPCODE_EXP2:
         case SHADER_OPCODE_LOG2:
         case SHADER_OPCODE_SIN:
         case SHADER_OPCODE_COS:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0)));

         default:
            return emit(instruction(opcode, dispatch_width(), dst, src0));
         }
      }

      /**
       * Create and insert a binary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1) const
      {
         switch (opcode) {
         case SHADER_OPCODE_POW:
         case SHADER_OPCODE_INT_QUOTIENT:
         case SHADER_OPCODE_INT_REMAINDER:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_math_operand(src0),
                                    fix_math_operand(fix_byte_src(src1))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, fix_byte_src(src1)));

         }
      }

      /**
       * Create and insert a ternary instruction into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
           const src_reg &src1, const src_reg &src2) const
      {
         switch (opcode) {
         /* 3-source ALU instructions only support a restricted set of
          * source regioning modes, see fix_3src_operand().
          */
         case BRW_OPCODE_BFE:
         case BRW_OPCODE_BFI2:
         case BRW_OPCODE_MAD:
         case BRW_OPCODE_LRP:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    fix_3src_operand(src0),
                                    fix_3src_operand(fix_byte_src(src1)),
                                    fix_3src_operand(fix_byte_src(src2))));

         default:
            return emit(instruction(opcode, dispatch_width(), dst,
                                    src0, fix_byte_src(src1), fix_byte_src(src2)));
         }
      }

      /**
       * Create and insert an instruction with a variable number of sources
       * into the program.
       */
      instruction *
      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
           unsigned n) const
      {
         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
      }

      /**
       * Insert a preallocated instruction into the program.
       *
       * The builder's execution controls (channel group, writemask override)
       * and debug annotation are stamped onto the instruction before it is
       * linked in at the cursor position.
       */
      instruction *
      emit(instruction *inst) const
      {
         assert(inst->exec_size <= 32);
         assert(inst->exec_size == dispatch_width() ||
                force_writemask_all);

         inst->group = _group;
         inst->force_writemask_all = force_writemask_all;
         inst->annotation = annotation.str;
         inst->ir = annotation.ir;

         if (block)
            static_cast<instruction *>(cursor)->insert_before(block, inst);
         else
            cursor->insert_before(inst);

         return inst;
      }

      /**
       * Select \p src0 if the comparison of both sources with the given
       * conditional mod evaluates to true, otherwise select \p src1.
       *
       * Generally useful to get the minimum or maximum of two values.
       */
      instruction *
      emit_minmax(const dst_reg &dst, const src_reg &src0,
                  const src_reg &src1, brw_conditional_mod mod) const
      {
         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);

         /* In some cases we can't have bytes as operand for src1, so use the
          * same type for both operand.
          */
         return set_condmod(mod, SEL(dst, fix_unsigned_negate(fix_byte_src(src0)),
                                     fix_unsigned_negate(fix_byte_src(src1))));
      }

      /**
       * Copy any live channel from \p src to the first channel of the result.
       */
      src_reg
      emit_uniformize(const src_reg &src) const
      {
         /* FIXME: We use a vector chan_index and dst to allow constant and
          * copy propagration to move result all the way into the consuming
          * instruction (typically a surface index or sampler index for a
          * send).  This uses 1 or 3 extra hw registers in 16 or 32 wide
          * dispatch.  Once we teach const/copy propagation about scalars we
          * should go back to scalar destinations here.
          */
         const fs_builder ubld = exec_all();
         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
         const dst_reg dst = vgrf(src.type);

         /* NOTE(review): flag_subreg 2 presumably selects a flag subregister
          * reserved for FIND_LIVE_CHANNEL — confirm against the generator.
          */
         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index)->flag_subreg = 2;
         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));

         return src_reg(component(dst, 0));
      }

      /**
       * Copy \p num_components consecutive components of \p src (each
       * dispatch_width() channels apart) into a freshly allocated contiguous
       * VGRF using a LOAD_PAYLOAD, and return the new register.
       */
      src_reg
      move_to_vgrf(const src_reg &src, unsigned num_components) const
      {
         src_reg *const src_comps = new src_reg[num_components];
         for (unsigned i = 0; i < num_components; i++)
            src_comps[i] = offset(src, dispatch_width(), i);

         const dst_reg dst = vgrf(src.type, num_components);
         LOAD_PAYLOAD(dst, src_comps, num_components, 0);

         delete[] src_comps;

         return src_reg(dst);
      }

      /**
       * Emit a scan of \p tmp in place: channel values are combined with
       * \p opcode (under conditional mod \p mod) within clusters of
       * \p cluster_size channels, by repeatedly doubling the combined
       * sub-cluster width.
       */
      void
      emit_scan(enum opcode opcode, const dst_reg &tmp,
                unsigned cluster_size, brw_conditional_mod mod) const
      {
         assert(dispatch_width() >= 8);

         /* The instruction splitting code isn't advanced enough to split
          * these so we need to handle that ourselves.
          */
         if (dispatch_width() * type_sz(tmp.type) > 2 * REG_SIZE) {
            const unsigned half_width = dispatch_width() / 2;
            const fs_builder ubld = exec_all().group(half_width, 0);
            dst_reg left = tmp;
            dst_reg right = horiz_offset(tmp, half_width);
            ubld.emit_scan(opcode, left, cluster_size, mod);
            ubld.emit_scan(opcode, right, cluster_size, mod);
            if (cluster_size > half_width) {
               /* Fold the last component of the left half into every
                * component of the right half.
                */
               src_reg left_comp = component(left, half_width - 1);
               set_condmod(mod, ubld.emit(opcode, right, left_comp, right));
            }
            return;
         }

         if (cluster_size > 1) {
            /* Combine each even channel into the following odd channel. */
            const fs_builder ubld = exec_all().group(dispatch_width() / 2, 0);
            const dst_reg left = horiz_stride(tmp, 2);
            const dst_reg right = horiz_stride(horiz_offset(tmp, 1), 2);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }

         if (cluster_size > 2) {
            if (type_sz(tmp.type) <= 4) {
               /* Combine channel 1 of each group of 4 into channels 2 and 3. */
               const fs_builder ubld =
                  exec_all().group(dispatch_width() / 4, 0);
               src_reg left = horiz_stride(horiz_offset(tmp, 1), 4);

               dst_reg right = horiz_stride(horiz_offset(tmp, 2), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));

               right = horiz_stride(horiz_offset(tmp, 3), 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            } else {
               /* For 64-bit types, we have to do things differently because
                * the code above would land us with destination strides that
                * the hardware can't handle.  Fortunately, we'll only be
                * 8-wide in that case and it's the same number of
                * instructions.
                */
               const fs_builder ubld = exec_all().group(2, 0);

               for (unsigned i = 0; i < dispatch_width(); i += 4) {
                  src_reg left = component(tmp, i + 1);
                  dst_reg right = horiz_offset(tmp, i + 2);
                  set_condmod(mod, ubld.emit(opcode, right, left, right));
               }
            }
         }

         if (cluster_size > 4) {
            /* Combine channel 3 of each group of 8 into channels 4..7. */
            const fs_builder ubld = exec_all().group(4, 0);
            src_reg left = component(tmp, 3);
            dst_reg right = horiz_offset(tmp, 4);
            set_condmod(mod, ubld.emit(opcode, right, left, right));

            if (dispatch_width() > 8) {
               left = component(tmp, 8 + 3);
               right = horiz_offset(tmp, 8 + 4);
               set_condmod(mod, ubld.emit(opcode, right, left, right));
            }
         }

         if (cluster_size > 8 && dispatch_width() > 8) {
            /* Combine channel 7 into channels 8..15. */
            const fs_builder ubld = exec_all().group(8, 0);
            src_reg left = component(tmp, 7);
            dst_reg right = horiz_offset(tmp, 8);
            set_condmod(mod, ubld.emit(opcode, right, left, right));
         }
      }

      /**
       * Assorted arithmetic ops.
       * @{
       */
#define ALU1(op)                                        \
      instruction *                                     \
      op(const dst_reg &dst, const src_reg &src0) const \
      {                                                 \
         return emit(BRW_OPCODE_##op, dst, src0);       \
      }

#define ALU2(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
      }

#define ALU2_ACC(op)                                                    \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
      {                                                                 \
         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
         inst->writes_accumulator = true;                               \
         return inst;                                                   \
      }

#define ALU3(op)                                                        \
      instruction *                                                     \
      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
         const src_reg &src2) const                                     \
      {                                                                 \
         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
      }

      ALU2(ADD)
      ALU2_ACC(ADDC)
      ALU2(AND)
      ALU2(ASR)
      ALU2(AVG)
      ALU3(BFE)
      ALU2(BFI1)
      ALU3(BFI2)
      ALU1(BFREV)
      ALU1(CBIT)
      ALU2(CMPN)
      ALU1(DIM)
      ALU2(DP2)
      ALU2(DP3)
      ALU2(DP4)
      ALU2(DPH)
      ALU1(F16TO32)
      ALU1(F32TO16)
      ALU1(FBH)
      ALU1(FBL)
      ALU1(FRC)
      ALU2(LINE)
      ALU1(LZD)
      ALU2(MAC)
      ALU2_ACC(MACH)
      ALU3(MAD)
      ALU1(MOV)
      ALU2(MUL)
      ALU1(NOT)
      ALU2(OR)
      ALU2(PLN)
      ALU1(RNDD)
      ALU1(RNDE)
      ALU1(RNDU)
      ALU1(RNDZ)
      ALU2(SAD2)
      ALU2_ACC(SADA2)
      ALU2(SEL)
      ALU2(SHL)
      ALU2(SHR)
      ALU2_ACC(SUBB)
      ALU2(XOR)

#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
      /** @} */

      /**
       * CMP: Sets the low bit of the destination channels with the result
       * of the comparison, while the upper bits are undefined, and updates
       * the flag register with the packed 16 bits of the result.
       */
      instruction *
      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
          brw_conditional_mod condition) const
      {
         /* Take the instruction:
          *
          * CMP null<d> src0<f> src1<f>
          *
          * Original gen4 does type conversion to the destination type
          * before comparison, producing garbage results for floating
          * point comparisons.
          *
          * The destination type doesn't matter on newer generations,
          * so we set the type to match src0 so we can compact the
          * instruction.
          */
         return set_condmod(condition,
                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
                                 fix_unsigned_negate(src0),
                                 fix_unsigned_negate(src1)));
      }

      /**
       * Gen4 predicated IF.
       */
      instruction *
      IF(brw_predicate predicate) const
      {
         return set_predicate(predicate, emit(BRW_OPCODE_IF));
      }

      /**
       * CSEL: dst = src2 <op> 0.0f ? src0 : src1
       */
      instruction *
      CSEL(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
           const src_reg &src2, brw_conditional_mod condition) const
      {
         /* CSEL only operates on floats, so we can't do integer </<=/>=/>
          * comparisons.  Zero/non-zero (== and !=) comparisons almost work.
          * 0x80000000 fails because it is -0.0, and -0.0 == 0.0.
          */
         assert(src2.type == BRW_REGISTER_TYPE_F);

         return set_condmod(condition,
                            emit(BRW_OPCODE_CSEL,
                                 retype(dst, BRW_REGISTER_TYPE_F),
                                 retype(src0, BRW_REGISTER_TYPE_F),
                                 retype(fix_byte_src(src1), BRW_REGISTER_TYPE_F),
                                 fix_byte_src(src2)));
      }

      /**
       * Emit a linear interpolation instruction.
       */
      instruction *
      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
          const src_reg &a) const
      {
         if (shader->devinfo->gen >= 6 && shader->devinfo->gen <= 10) {
            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
             * we need to reorder the operands.
             */
            return emit(BRW_OPCODE_LRP, dst, a, y, x);

         } else {
            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
            const dst_reg y_times_a = vgrf(dst.type);
            const dst_reg one_minus_a = vgrf(dst.type);
            const dst_reg x_times_one_minus_a = vgrf(dst.type);

            MUL(y_times_a, y, a);
            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
         }
      }

      /**
       * Collect a number of registers in a contiguous range of registers.
       */
      instruction *
      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
                   unsigned sources, unsigned header_size) const
      {
         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
         inst->header_size = header_size;
         inst->size_written = header_size * REG_SIZE;
         for (unsigned i = header_size; i < sources; i++) {
            inst->size_written +=
               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
                     REG_SIZE);
         }

         return inst;
      }

      /** Shader the builder inserts instructions into. */
      backend_shader *shader;

      /**
       * Byte sized operands are not supported for src1 on Gen11+.
       *
       * Works around the restriction by copying a byte-typed \p src into a
       * word-sized temporary of matching signedness; returns \p src
       * unchanged on hardware/types that don't need the workaround.
       */
      src_reg
      fix_byte_src(const src_reg &src) const
      {
         if ((shader->devinfo->gen < 11 && !shader->devinfo->is_geminilake) ||
             type_sz(src.type) != 1)
            return src;

         dst_reg temp = vgrf(src.type == BRW_REGISTER_TYPE_UB ?
                             BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D);
         MOV(temp, src);
         return src_reg(temp);
      }

   private:
      /**
       * Workaround for negation of UD registers.  See comment in
       * fs_generator::generate_code() for more details.
       */
      src_reg
      fix_unsigned_negate(const src_reg &src) const
      {
         if (src.type == BRW_REGISTER_TYPE_UD &&
             src.negate) {
            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
            MOV(temp, src);
            return src_reg(temp);
         } else {
            return src;
         }
      }

      /**
       * Workaround for source register modes not supported by the ternary
       * instruction encoding.
       */
      src_reg
      fix_3src_operand(const src_reg &src) const
      {
         switch (src.file) {
         case FIXED_GRF:
            /* FINISHME: Could handle scalar region, other stride=1 regions */
            if (src.vstride != BRW_VERTICAL_STRIDE_8 ||
                src.width != BRW_WIDTH_8 ||
                src.hstride != BRW_HORIZONTAL_STRIDE_1)
               break;
            /* fallthrough */
         case ATTR:
         case VGRF:
         case UNIFORM:
         case IMM:
            return src;
         default:
            break;
         }

         /* Unsupported regioning: flatten the source through a MOV into a
          * freshly allocated VGRF.
          */
         dst_reg expanded = vgrf(src.type);
         MOV(expanded, src);
         return expanded;
      }

      /**
       * Workaround for source register modes not supported by the math
       * instruction.
       */
      src_reg
      fix_math_operand(const src_reg &src) const
      {
         /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
          * might be able to do better by doing execsize = 1 math and then
          * expanding that result out, but we would need to be careful with
          * masking.
          *
          * Gen6 hardware ignores source modifiers (negate and abs) on math
          * instructions, so we also move to a temp to set those up.
          *
          * Gen7 relaxes most of the above restrictions, but still can't use IMM
          * operands to math
          */
         if ((shader->devinfo->gen == 6 &&
              (src.file == IMM || src.file == UNIFORM ||
               src.abs || src.negate)) ||
             (shader->devinfo->gen == 7 && src.file == IMM)) {
            const dst_reg tmp = vgrf(src.type);
            MOV(tmp, src);
            return tmp;
         } else {
            return src;
         }
      }

      /** Basic block the cursor belongs to, or NULL when appending. */
      bblock_t *block;
      /** Instructions are inserted immediately before this node. */
      exec_node *cursor;

      /** Default execution width of emitted instructions. */
      unsigned _dispatch_width;
      /** Default channel group (offset into the channel enable signals). */
      unsigned _group;
      /** When set, emitted instructions ignore per-channel masking. */
      bool force_writemask_all;

      /** Debug annotation info. */
      struct {
         const char *str;
         const void *ir;
      } annotation;
   };
}

#endif