1/* 2 * Copyright © 2011 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
 */

#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
#include "util/u_math.h"

namespace brw {

/* Construct a vec4 IR instruction.  All control fields (predication,
 * conditional mod, flags, message lengths, ...) start out at conservative
 * defaults; size_written is derived from the destination type and the
 * default exec_size of 8, and is 0 for a null (BAD_FILE) destination.
 */
vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->writes_accumulator = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->predicate = BRW_PREDICATE_NONE;
   this->predicate_inverse = false;
   this->target = 0;
   this->shadow_compare = false;
   this->eot = false;
   this->ir = NULL;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_size = 0;
   this->flag_subreg = 0;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->exec_size = 8;
   this->group = 0;
   this->size_written = (dst.file == BAD_FILE ?
                         0 : this->exec_size * type_sz(dst.type));
   this->annotation = NULL;
}

/* Append \p inst to the end of the instruction list, tagging it with the
 * visitor's current source IR pointer and annotation for debug dumps.
 */
vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   inst->ir = this->base_ir;
   inst->annotation = this->current_annotation;

   this->instructions.push_tail(inst);

   return inst;
}

/* Insert \p new_inst immediately before \p inst in \p block, copying the
 * debug ir/annotation from the insertion point.
 *
 * NOTE: returns the insertion point \p inst, not \p new_inst.
 */
vec4_instruction *
vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
                          vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(block, new_inst);

   return inst;
}

/* Convenience emit() overloads: allocate a vec4_instruction out of mem_ctx
 * and immediately append it via emit(vec4_instruction *).
 */
vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1, const src_reg &src2)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
}


vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
}

/* The ALU* macros generate builder methods (e.g. vec4_visitor::ADD) that
 * allocate an instruction but do NOT emit it -- the caller must pass the
 * result to emit().  ALU2_ACC additionally marks the instruction as an
 * implicit accumulator writer.
 */
#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
   }

#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1);                 \
   }

#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(           \
                       BRW_OPCODE_##op, dst, src0, src1);               \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(devinfo->ver >= 6);                                        \
      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst,        \
                                           src0, src1, src2);           \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)
ALU1(DIM)

/** Gfx4 predicated IF.  Returned instruction is not yet emitted. */
vec4_instruction *
vec4_visitor::IF(enum brw_predicate predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gfx6 IF with embedded comparison.  Returned instruction is not yet
 * emitted.
 */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum brw_conditional_mod condition)
{
   assert(devinfo->ver == 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
{
   vec4_instruction *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gfx4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/* Build (not emit) a gfx4-style scratch read; the 2-register message
 * payload starts one MRF past the spill base.
 */
vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver) + 1;
   inst->mlen = 2;

   return inst;
}

/* Build (not emit) a gfx4-style scratch write with a 3-register message. */
vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver);
   inst->mlen = 3;

   return inst;
}

src_reg
vec4_visitor::fix_3src_operand(const src_reg &src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
   return src_reg(expanded);
}

src_reg
vec4_visitor::fix_math_operand(const src_reg &src)
{
   if (devinfo->ver < 6 || src.file == BAD_FILE)
      return src;

   /* The gfx6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gfx6.
    *
    * For gfx7, keep the operand as-is, except if immediate, which gfx7 still
    * can't use.
    */

   if (devinfo->ver == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

/* Emit a math instruction, legalizing its operands per generation and
 * working around gfx6's align1-only MATH (no writemasks) by computing into
 * a temporary and MOVing the result.  Pre-gfx6 math is a send-like op with
 * an MRF payload, so set base_mrf/mlen accordingly.
 */
vec4_instruction *
vec4_visitor::emit_math(enum opcode opcode,
                        const dst_reg &dst,
                        const src_reg &src0, const src_reg &src1)
{
   vec4_instruction *math =
      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));

   if (devinfo->ver == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* MATH on Gfx6 must be align1, so we can't do writemasks. */
      math->dst = dst_reg(this, glsl_type::vec4_type);
      math->dst.type = dst.type;
      math = emit(MOV(dst, src_reg(math->dst)));
   } else if (devinfo->ver < 6) {
      math->base_mrf = 1;
      math->mlen = src1.file == BAD_FILE ? 1 : 2;
   }

   return math;
}

/* Lower GLSL packHalf2x16: two F32TO16 conversions plus shift/or to pack
 * the two half-floats into one dword per channel.
 */
void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->ver < 7) {
      unreachable("ir_unop_pack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride. We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests.  However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gfx7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z          y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
   emit(SHL(dst, tmp_src, brw_imm_ud(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
   emit(OR(dst, src_reg(dst), tmp_src));
}

/* Lower GLSL unpackHalf2x16: split the dword into two words, then F16TO32
 * each half into the X/Y channels of the float destination.
 */
void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (devinfo->ver < 7) {
      unreachable("ir_unop_unpack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gfx7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}

/* Lower GLSL unpackUnorm4x8: byte-extract via per-channel shifts, then
 * convert to float and scale by 1/255.
 */
void
vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   /* vf immediates 0x00, 0x60, 0x70, 0x78 encode 0, 8, 16, 24. */
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_UB;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
}

/* Lower GLSL unpackSnorm4x8: like the unorm case but with signed bytes,
 * a 1/127 scale, and a final clamp to [-1, 1].
 */
void
vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
{
   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
    * is not suitable to generate the shift values, but we can use the packed
    * vector float and a type-converting MOV.
    */
   dst_reg shift(this, glsl_type::uvec4_type);
   /* vf immediates 0x00, 0x60, 0x70, 0x78 encode 0, 8, 16, 24. */
   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));

   dst_reg shifted(this, glsl_type::uvec4_type);
   src0.swizzle = BRW_SWIZZLE_XXXX;
   emit(SHR(shifted, src0, src_reg(shift)));

   shifted.type = BRW_REGISTER_TYPE_B;
   dst_reg f(this, glsl_type::vec4_type);
   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));

   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
}

/* Lower GLSL packUnorm4x8: saturate, scale to [0, 255], round-to-even,
 * convert to uint and pack the low bytes of each channel.
 */
void
vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg saturated(this, glsl_type::vec4_type);
   vec4_instruction *inst = emit(MOV(saturated, src0));
   inst->saturate = true;

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg u(this, glsl_type::uvec4_type);
   emit(MOV(u, src_reg(rounded)));

   src_reg bytes(u);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

/* Lower GLSL packSnorm4x8: clamp to [-1, 1], scale by 127, round-to-even,
 * convert to int and pack the low bytes of each channel.
 */
void
vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
{
   dst_reg max(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));

   dst_reg min(this, glsl_type::vec4_type);
   emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));

   dst_reg scaled(this, glsl_type::vec4_type);
   emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));

   dst_reg rounded(this, glsl_type::vec4_type);
   emit(RNDE(rounded, src_reg(scaled)));

   dst_reg i(this, glsl_type::ivec4_type);
   emit(MOV(i, src_reg(rounded)));

   src_reg bytes(i);
   emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
}

/*
 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
 * false) elements needed to pack a type.
 */
static int
type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_FLOAT16:
   case GLSL_TYPE_BOOL:
   case GLSL_TYPE_DOUBLE:
   case GLSL_TYPE_UINT16:
   case GLSL_TYPE_INT16:
   case GLSL_TYPE_UINT8:
   case GLSL_TYPE_INT8:
   case GLSL_TYPE_UINT64:
   case GLSL_TYPE_INT64:
      if (type->is_matrix()) {
         /* One slot per column; 64-bit columns need two vec4 slots each. */
         const glsl_type *col_type = type->column_type();
         unsigned col_slots =
            (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
         return type->matrix_columns * col_slots;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess.  Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size_xvec4(type->fields.array, as_vec4, bindless) *
             type->length;
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_INTERFACE:
      /* Sum of the sizes of all members. */
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
                                 bindless);
      }
      return size;
   case GLSL_TYPE_SUBROUTINE:
      return 1;

   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return bindless ? 1 : 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
      return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_FUNCTION:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Returns the minimum number of vec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single vec4); for matrices, the
 * number of columns; for array and struct, the sum of the vec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 */
extern "C" int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
   return type_size_xvec4(type, true, bindless);
}

/**
 * Returns the minimum number of dvec4 elements needed to pack a type.
 *
 * For simple types, it will return 1 (a single dvec4); for matrices, the
 * number of columns; for array and struct, the sum of the dvec4_size of
 * each of its elements; and for sampler and atomic, zero.
 *
 * This method is useful to calculate how much register space is needed to
 * store a particular type.
 *
 * Measuring double-precision vertex inputs as dvec4 is required because
 * ARB_vertex_attrib_64bit states that these uses the same number of locations
 * than the single-precision version. That is, two consecutives dvec4 would be
 * located in location "x" and location "x+1", not "x+2".
 *
 * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
 * remap_vs_attrs() will take in account both the location and also if the
 * type fits in one or two vec4 slots.
 */
extern "C" int
type_size_dvec4(const struct glsl_type *type, bool bindless)
{
   return type_size_xvec4(type, false, bindless);
}

/* Allocate a fresh VGRF-backed src_reg sized for \p type; aggregates get a
 * NOOP swizzle, vectors a swizzle matching their component count.
 */
src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false));

   if (type->is_array() || type->is_struct()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = brw_swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}

/* Allocate a fresh VGRF-backed src_reg for an array of \p size elements of
 * \p type.
 */
src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
{
   assert(size > 0);

   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);

   this->swizzle = BRW_SWIZZLE_NOOP;

   this->type = brw_type_for_base_type(type);
}

/* Allocate a fresh VGRF-backed dst_reg sized for \p type; aggregates write
 * all channels, vectors only their live components.
 */
dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = VGRF;
   this->nr = v->alloc.allocate(type_size_vec4(type, false));

   if (type->is_array() || type->is_struct()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}

/* Emit a SEL with the given conditional mod -- used to build MIN/MAX and
 * clamping sequences.
 */
vec4_instruction *
vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
   inst->conditional_mod = conditionalmod;
   return inst;
}

/**
 * Emits the instructions needed to perform a pull constant load. before_block
 * and before_inst can be NULL in which case the instruction will be appended
 * to the end of the instruction list.
 */
void
vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
                                          src_reg surf_index,
                                          src_reg offset_reg,
                                          bblock_t *before_block,
                                          vec4_instruction *before_inst)
{
   assert((before_inst == NULL && before_block == NULL) ||
          (before_inst && before_block));

   vec4_instruction *pull;

   if (devinfo->ver >= 7) {
      /* Gfx7+ takes the offset in a GRF; stage it with a MOV first. */
      dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);

      grf_offset.type = offset_reg.type;

      pull = MOV(grf_offset, offset_reg);

      if (before_inst)
         emit_before(before_block, before_inst, pull);
      else
         emit(pull);

      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GFX7,
                                           dst,
                                           surf_index,
                                           src_reg(grf_offset));
      pull->mlen = 1;
   } else {
      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
                                           dst,
                                           surf_index,
                                           offset_reg);
      pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1;
      pull->mlen = 1;
   }

   if (before_inst)
      emit_before(before_block, before_inst, pull);
   else
      emit(pull);
}

/* Broadcast the value of \p src from one live channel to all channels of
 * the returned register (FIND_LIVE_CHANNEL + BROADCAST).
 */
src_reg
vec4_visitor::emit_uniformize(const src_reg &src)
{
   const src_reg chan_index(this, glsl_type::uint_type);
   const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
                              src.type);

   emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
      ->force_writemask_all = true;
   emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
      ->force_writemask_all = true;

   return src_reg(dst);
}

/* Emit a TXF_MCS message to fetch the MCS (multisample control surface)
 * value for the given coordinate; returns the register holding the result.
 */
src_reg
vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
                             src_reg coordinate, src_reg surface)
{
   vec4_instruction *inst =
      new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
                                    dst_reg(this, glsl_type::uvec4_type));
   inst->base_mrf = 2;
   inst->src[1] = surface;
   inst->src[2] = brw_imm_ud(0); /* sampler */
   inst->mlen = 1;

   const int param_base = inst->base_mrf;

   /* parameters are: u, v, r, lod; lod will always be zero due to api
    * restrictions
    */
   int coord_mask = (1 << coordinate_type->vector_elements) - 1;
   int zero_mask = 0xf & ~coord_mask;

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
            coordinate));

   emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
            brw_imm_d(0)));

   emit(inst);
   return src_reg(inst->dst);
}

/* Whether the sampler index needs the "high sampler" message-header path.
 * Only Haswell here: non-immediate indices, or immediates >= 16, qualify.
 */
bool
vec4_visitor::is_high_sampler(src_reg sampler)
{
   if (!devinfo->is_haswell)
      return false;

   return sampler.file != IMM || sampler.ud >= 16;
}

/* Lower an IR texture operation to a vec4 sampler message: pick the
 * hardware opcode, decide whether a message header is needed, then lay the
 * parameters (coordinate, shadow comparator, LOD/derivatives, sample
 * index/MCS, gather offset) out in consecutive MRFs, bumping mlen as each
 * payload register is filled.  Generation-specific register layouts are
 * handled inline.
 */
void
vec4_visitor::emit_texture(ir_texture_opcode op,
                           dst_reg dest,
                           int dest_components,
                           src_reg coordinate,
                           int coord_components,
                           src_reg shadow_comparator,
                           src_reg lod, src_reg lod2,
                           src_reg sample_index,
                           uint32_t constant_offset,
                           src_reg offset_value,
                           src_reg mcs,
                           uint32_t surface,
                           src_reg surface_reg,
                           src_reg sampler_reg)
{
   enum opcode opcode;
   switch (op) {
   /* ir_tex needs an explicit LOD; VS has no derivatives (see ir_txb). */
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = offset_value.file != BAD_FILE
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   case ir_samples_identical: {
      /* There are some challenges implementing this for vec4, and it seems
       * unlikely to be used anyway.  For now, just always return false.
       */
      emit(MOV(dest, brw_imm_ud(0u)));
      return;
   }
   default:
      unreachable("Unrecognized tex op");
   }

   vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);

   inst->offset = constant_offset;

   /* The message header is necessary for:
    * - Gfx4 (always)
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
    */
   inst->header_size =
      (devinfo->ver < 5 ||
       inst->offset != 0 || op == ir_tg4 ||
       op == ir_texture_samples ||
       is_high_sampler(sampler_reg)) ? 1 : 0;
   inst->base_mrf = 2;
   inst->mlen = inst->header_size;
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = shadow_comparator.file != BAD_FILE;

   inst->src[1] = surface_reg;
   inst->src[2] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_size;

   if (op == ir_txs || op == ir_query_levels) {
      int writemask = devinfo->ver == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
      inst->mlen++;
   } else if (op == ir_texture_samples) {
      inst->dst.writemask = WRITEMASK_X;
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << coord_components) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
               coordinate));
      inst->mlen++;

      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
                  brw_imm_d(0)));
      }
      /* Load the shadow comparator */
      if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
         emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
                          WRITEMASK_X),
                  shadow_comparator));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (op == ir_tex || op == ir_txl) {
         int mrf, writemask;
         if (devinfo->ver >= 5) {
            mrf = param_base + 1;
            if (shadow_comparator.file != BAD_FILE) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* devinfo->ver == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
      } else if (op == ir_txf) {
         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
      } else if (op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
                  sample_index));
         if (devinfo->ver >= 7) {
            /* MCS data is in the first channel of `mcs`, but we need to get it into
             * the .y channel of the second vec4 of params, so replicate .x across
             * the whole vec4 and then mask off everything except .y
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (op == ir_txd) {
         const brw_reg_type type = lod.type;

         if (devinfo->ver >= 5) {
            /* Derivatives are interleaved: dudx/dvdx in .xz, dudy/dvdy in
             * .yw of one MRF, with the r-derivatives (if needed) in a
             * second MRF.
             */
            lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
            inst->mlen++;

            if (dest_components == 3 || shadow_comparator.file != BAD_FILE) {
               lod.swizzle = BRW_SWIZZLE_ZZZZ;
               lod2.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
               inst->mlen++;

               if (shadow_comparator.file != BAD_FILE) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   shadow_comparator.type, WRITEMASK_Z),
                           shadow_comparator));
               }
            }
         } else /* devinfo->ver == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
            inst->mlen += 2;
         }
      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
         if (shadow_comparator.file != BAD_FILE) {
            emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
                     shadow_comparator));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (op == ir_txs && devinfo->ver < 7) {
      /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
      emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
                  src_reg(inst->dst), brw_imm_d(1));
   }

   if (devinfo->ver == 6 && op == ir_tg4) {
      emit_gfx6_gather_wa(key_tex->gfx6_gather_wa[surface], inst->dst);
   }

   if (op == ir_query_levels) {
      /* # levels is in .w */
      src_reg swizzled(dest);
      swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
                                      SWIZZLE_W, SWIZZLE_W);
      emit(MOV(dest, swizzled));
   }
}

/**
 * Apply workarounds for Gfx6 gather with UINT/SINT
 */
void
vec4_visitor::emit_gfx6_gather_wa(uint8_t wa, dst_reg dst)
{
   if (!wa)
      return;

   int width = (wa & WA_8BIT) ? 8 : 16;
   dst_reg dst_f = dst;
   dst_f.type = BRW_REGISTER_TYPE_F;

   /* Convert from UNORM to UINT */
   emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
   emit(MOV(dst, src_reg(dst_f)));

   if (wa & WA_SIGN) {
      /* Reinterpret the UINT value as a signed INT value by
       * shifting the sign bit into place, then shifting back
       * preserving sign.
1042 */ 1043 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width))); 1044 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width))); 1045 } 1046} 1047 1048void 1049vec4_visitor::gs_emit_vertex(int /* stream_id */) 1050{ 1051 unreachable("not reached"); 1052} 1053 1054void 1055vec4_visitor::gs_end_primitive() 1056{ 1057 unreachable("not reached"); 1058} 1059 1060void 1061vec4_visitor::emit_ndc_computation() 1062{ 1063 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE) 1064 return; 1065 1066 /* Get the position */ 1067 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]); 1068 1069 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */ 1070 dst_reg ndc = dst_reg(this, glsl_type::vec4_type); 1071 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc; 1072 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4; 1073 1074 current_annotation = "NDC"; 1075 dst_reg ndc_w = ndc; 1076 ndc_w.writemask = WRITEMASK_W; 1077 src_reg pos_w = pos; 1078 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); 1079 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w); 1080 1081 dst_reg ndc_xyz = ndc; 1082 ndc_xyz.writemask = WRITEMASK_XYZ; 1083 1084 emit(MUL(ndc_xyz, pos, src_reg(ndc_w))); 1085} 1086 1087void 1088vec4_visitor::emit_psiz_and_flags(dst_reg reg) 1089{ 1090 if (devinfo->ver < 6 && 1091 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) || 1092 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE || 1093 devinfo->has_negative_rhw_bug)) { 1094 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type); 1095 dst_reg header1_w = header1; 1096 header1_w.writemask = WRITEMASK_W; 1097 1098 emit(MOV(header1, brw_imm_ud(0u))); 1099 1100 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) { 1101 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); 1102 1103 current_annotation = "Point size"; 1104 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11)))); 1105 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8))); 1106 } 1107 1108 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file 
!= BAD_FILE) { 1109 current_annotation = "Clipping flags"; 1110 dst_reg flags0 = dst_reg(this, glsl_type::uint_type); 1111 1112 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L)); 1113 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0)); 1114 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0))); 1115 } 1116 1117 if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) { 1118 dst_reg flags1 = dst_reg(this, glsl_type::uint_type); 1119 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L)); 1120 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0)); 1121 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4))); 1122 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1))); 1123 } 1124 1125 /* i965 clipping workaround: 1126 * 1) Test for -ve rhw 1127 * 2) If set, 1128 * set ndc = (0,0,0,0) 1129 * set ucp[6] = 1 1130 * 1131 * Later, clipping will detect ucp[6] and ensure the primitive is 1132 * clipped against all fixed planes. 
1133 */ 1134 if (devinfo->has_negative_rhw_bug && 1135 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) { 1136 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]); 1137 ndc_w.swizzle = BRW_SWIZZLE_WWWW; 1138 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L)); 1139 vec4_instruction *inst; 1140 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6))); 1141 inst->predicate = BRW_PREDICATE_NORMAL; 1142 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F; 1143 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f))); 1144 inst->predicate = BRW_PREDICATE_NORMAL; 1145 } 1146 1147 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1))); 1148 } else if (devinfo->ver < 6) { 1149 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u))); 1150 } else { 1151 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0))); 1152 if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) { 1153 dst_reg reg_w = reg; 1154 reg_w.writemask = WRITEMASK_W; 1155 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); 1156 reg_as_src.type = reg_w.type; 1157 reg_as_src.swizzle = brw_swizzle_for_size(1); 1158 emit(MOV(reg_w, reg_as_src)); 1159 } 1160 if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) { 1161 dst_reg reg_y = reg; 1162 reg_y.writemask = WRITEMASK_Y; 1163 reg_y.type = BRW_REGISTER_TYPE_D; 1164 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type; 1165 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0]))); 1166 } 1167 if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) { 1168 dst_reg reg_z = reg; 1169 reg_z.writemask = WRITEMASK_Z; 1170 reg_z.type = BRW_REGISTER_TYPE_D; 1171 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type; 1172 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0]))); 1173 } 1174 } 1175} 1176 1177vec4_instruction * 1178vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component) 1179{ 1180 assert(varying < VARYING_SLOT_MAX); 1181 1182 
   unsigned num_comps = output_num_components[varying][component];
   if (num_comps == 0)
      return NULL;

   assert(output_reg[varying][component].type == reg.type);
   current_annotation = output_reg_annotation[varying];
   if (output_reg[varying][component].file != BAD_FILE) {
      src_reg src = src_reg(output_reg[varying][component]);
      src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
      /* Restrict the write to the channels this packed component group
       * actually occupies.
       */
      reg.writemask =
         brw_writemask_for_component_packing(num_comps, component);
      return emit(MOV(reg, src));
   }
   return NULL;
}

/**
 * Emit the MOV(s) that fill the single VUE slot \p reg for \p varying.
 *
 * The PSIZ/NDC/POS/PAD slots get special handling; everything else goes
 * through emit_generic_urb_slot() once per packed component group.
 */
void
vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
{
   reg.type = BRW_REGISTER_TYPE_F;
   output_reg[varying][0].type = reg.type;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
   {
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(reg);
      break;
   }
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      for (int i = 0; i < 4; i++) {
         emit_generic_urb_slot(reg, varying, i);
      }
      break;
   }
}

/**
 * Round a URB write message length up so the data payload (the message
 * minus its one header register) is an even number of registers, as
 * required from Gfx6 onwards.  Returns \p mlen unchanged pre-Gfx6.
 */
static unsigned
align_interleaved_urb_mlen(const struct intel_device_info *devinfo,
                           unsigned mlen)
{
   if (devinfo->ver >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}


/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver);

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gfx6's
    * requirements for length alignment.
    */
   assert ((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   /* Pre-Gfx6 the fixed function consumes an explicit NDC slot. */
   if (devinfo->ver < 6) {
      emit_ndc_computation();
   }

   /* We may need to split this up into several URB writes, so do them in a
    * loop.
    */
   int slot = 0;
   bool complete = false;
   do {
      /* URB offset is in URB row increments, and each of our MRFs is half of
       * one of those, since we're doing interleaved writes.
       */
      int offset = slot / 2;

      mrf = base_mrf + 1;
      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         emit_urb_slot(dst_reg(MRF, mrf++),
                       prog_data->vue_map.slot_to_varying[slot]);

         /* If this was max_usable_mrf, we can't fit anything more into this
          * URB WRITE. Same thing if we reached the maximum length available.
          */
         if (mrf > max_usable_mrf ||
             align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
            slot++;
            break;
         }
      }

      complete = slot >= prog_data->vue_map.num_slots;
      current_annotation = "URB write";
      vec4_instruction *inst = emit_urb_write_opcode(complete);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
      inst->offset += offset;
   } while(!complete);
}


/**
 * Compute the offset (in message-header units) used to address scratch
 * space for the vec4 register index \p reg_offset, folding in the
 * indirect \p reladdr when present.
 *
 * \return an immediate for direct accesses, otherwise a freshly computed
 *         temporary register.
 */
src_reg
vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gfx6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (devinfo->ver < 6)
      message_header_scale *= 16;

   if (reladdr) {
      /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
       * to multiply the reladdr by 2. Notice that the reg_offset part
       * is in units of 16 bytes and is used to select the low/high 16-byte
       * chunk of a full dvec4, so we don't want to multiply that part.
       */
      src_reg index = src_reg(this, glsl_type::int_type);
      if (type_sz(inst->dst.type) < 8) {
         emit_before(block, inst, ADD(dst_reg(index), *reladdr,
                                      brw_imm_d(reg_offset)));
         emit_before(block, inst, MUL(dst_reg(index), index,
                                      brw_imm_d(message_header_scale)));
      } else {
         emit_before(block, inst, MUL(dst_reg(index), *reladdr,
                                      brw_imm_d(message_header_scale * 2)));
         emit_before(block, inst, ADD(dst_reg(index), index,
                                      brw_imm_d(reg_offset * message_header_scale)));
      }
      return index;
   } else {
      return brw_imm_d(reg_offset * message_header_scale);
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   assert(orig_src.offset % REG_SIZE == 0);
   int reg_offset = base_offset + orig_src.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
                                      reg_offset);

   if (type_sz(orig_src.type) < 8) {
      emit_before(block, inst, SCRATCH_READ(temp, index));
   } else {
      /* 64-bit data spans two scratch registers: read both halves and
       * shuffle the 32-bit pairs back into 64-bit layout.
       */
      dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
      dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
      emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
      index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
      vec4_instruction *last_read =
         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
      emit_before(block, inst, last_read);
      shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read);
   }
}

/**
 * Emits instruction(s) after @inst to store its result to scratch space
 * at @base_offset, and redirects @inst to write into a temporary that
 * the scratch write(s) read from.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                                 int base_offset)
{
   assert(inst->dst.offset % REG_SIZE == 0);
   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                      reg_offset);

   /* Create a temporary register to store *inst's result in.
    *
    * We have to be careful in MOVing from our temporary result register in
    * the scratch write.  If we swizzle from channels of the temporary that
    * weren't initialized, it will confuse live interval analysis, which will
    * make spilling fail to make progress.
    */
   bool is_64bit = type_sz(inst->dst.type) == 8;
   const glsl_type *alloc_type =
      is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
                                       inst->dst.type),
                                brw_swizzle_for_mask(inst->dst.writemask));

   if (!is_64bit) {
      dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                          inst->dst.writemask));
      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
      /* Don't copy a SEL's predicate onto the write: it selects between
       * the SEL's sources rather than gating the destination write.
       */
      if (inst->opcode != BRW_OPCODE_SEL)
         write->predicate = inst->predicate;
      write->ir = inst->ir;
      write->annotation = inst->annotation;
      inst->insert_after(block, write);
   } else {
      /* 64-bit case: shuffle into 32-bit layout first, then write out each
       * half; each scratch register holds two logical 64-bit components.
       */
      dst_reg shuffled = dst_reg(this, alloc_type);
      vec4_instruction *last =
         shuffle_64bit_data(shuffled, temp, true, true, block, inst);
      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));

      /* X/Y of the original destination map to channel pairs XY/ZW of the
       * first scratch register.
       */
      uint8_t mask = 0;
      if (inst->dst.writemask & WRITEMASK_X)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_Y)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
         /* As above: a SEL's predicate must not gate the scratch write. */
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }

      /* Z/W of the original destination live in the next scratch
       * register, again as channel pairs XY/ZW.
       */
      mask = 0;
      if (inst->dst.writemask & WRITEMASK_Z)
         mask |= WRITEMASK_XY;
      if (inst->dst.writemask & WRITEMASK_W)
         mask |= WRITEMASK_ZW;
      if (mask) {
         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));

         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
                                            reg_offset + 1);
         vec4_instruction *write =
            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
         if (inst->opcode != BRW_OPCODE_SEL)
            write->predicate = inst->predicate;
         write->ir = inst->ir;
         write->annotation = inst->annotation;
         last->insert_after(block, write);
      }
   }

   /* Redirect *inst to write into the temporary; the scratch write(s)
    * emitted above carry the value out to memory.
    */
   inst->dst.file = temp.file;
   inst->dst.nr = temp.nr;
   inst->dst.offset %= REG_SIZE;
   inst->dst.reladdr = NULL;
}

/**
 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
 * adds the scratch read(s) before \p inst. The function also checks for
 * recursive reladdr scratch accesses, issuing the corresponding scratch
 * loads and rewriting reladdr references accordingly.
 *
 * \return \p src if it did not require a scratch load, otherwise, the
 * register holding the result of the scratch load that the caller should
 * use to rewrite src.
 */
src_reg
vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                   vec4_instruction *inst, src_reg src)
{
   /* Resolve recursive reladdr scratch access by calling ourselves
    * with src.reladdr
    */
   if (src.reladdr)
      *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                          *src.reladdr);

   /* Now handle scratch access on src */
   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
      dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
                             glsl_type::dvec4_type : glsl_type::vec4_type);
      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
      src.nr = temp.nr;
      src.offset %= REG_SIZE;
      src.reladdr = NULL;
   }

   return src;
}

/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers. So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   /* scratch_loc[vgrf] is the assigned scratch base (in registers), or
    * -1 when the VGRF doesn't need to live in scratch.
    */
   int scratch_loc[this->alloc.count];
   memset(scratch_loc, -1, sizeof(scratch_loc));

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      if (inst->dst.file == VGRF && inst->dst.reladdr) {
         if (scratch_loc[inst->dst.nr] == -1) {
            scratch_loc[inst->dst.nr] = last_scratch;
            last_scratch += this->alloc.sizes[inst->dst.nr];
         }

         /* Also allocate scratch for any VGRF in the (possibly chained)
          * reladdr expression itself.
          */
         for (src_reg *iter = inst->dst.reladdr;
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }

      for (int i = 0 ; i < 3; i++) {
         for (src_reg *iter = &inst->src[i];
              iter->reladdr;
              iter = iter->reladdr) {
            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
               scratch_loc[iter->nr] = last_scratch;
               last_scratch += this->alloc.sizes[iter->nr];
            }
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      /* First handle scratch access on the dst. Notice we have to handle
       * the case where the dst's reladdr also points to scratch space.
       */
      if (inst->dst.reladdr)
         *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
                                                   *inst->dst.reladdr);

      /* Now that we have handled any (possibly recursive) reladdr scratch
       * accesses for dst we can safely do the scratch write for dst itself
       */
      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);

      /* Now handle scratch access on any src. In this case, since
       * inst->src[i] already is a src_reg, we can just call
       * emit_resolve_reladdr with inst->src[i] and it will take care of
       * handling scratch loads for both src and src.reladdr (recursively).
       */
      for (int i = 0 ; i < 3; i++) {
         inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
                                             inst->src[i]);
      }
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
                                      dst_reg temp, src_reg orig_src,
                                      int base_offset, src_reg indirect)
{
   assert(orig_src.offset % 16 == 0);
   const unsigned index = prog_data->base.binding_table.pull_constants_start;

   /* For 64bit loads we need to emit two 32-bit load messages and we also
    * need to shuffle the 32-bit data result into proper 64-bit data.  To do
    * that we emit the 32-bit loads into a temporary and we shuffle the result
    * into the original destination.
    */
   dst_reg orig_temp = temp;
   bool is_64bit = type_sz(orig_src.type) == 8;
   if (is_64bit) {
      assert(type_sz(temp.type) == 8);
      dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
      temp = retype(temp_df, BRW_REGISTER_TYPE_F);
   }

   src_reg src = orig_src;
   for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
      int reg_offset = base_offset + src.offset / 16;

      src_reg offset;
      if (indirect.file != BAD_FILE) {
         /* Indirect access: fold the runtime offset into the constant
          * byte offset.
          */
         offset = src_reg(this, glsl_type::uint_type);
         emit_before(block, inst, ADD(dst_reg(offset), indirect,
                                      brw_imm_ud(reg_offset * 16)));
      } else {
         offset = brw_imm_d(reg_offset * 16);
      }

      emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
                                  brw_imm_ud(index),
                                  offset,
                                  block, inst);

      src = byte_offset(src, 16);
   }

   if (is_64bit) {
      temp = retype(temp, BRW_REGISTER_TYPE_DF);
      shuffle_64bit_data(orig_temp, src_reg(temp), false, false, block, inst);
   }
}

/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   /* The vulkan driver doesn't support pull constants other than UBOs so
    * everything has to be pushed regardless.
    */
   if (!compiler->supports_pull_constants) {
      split_uniform_registers();
      return;
   }

   /* Allocate the pull_params array */
   assert(stage_prog_data->nr_pull_params == 0);
   stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
                                              this->uniforms * 4);

   int pull_constant_loc[this->uniforms];
   memset(pull_constant_loc, -1, sizeof(pull_constant_loc));

   /* First, walk through the instructions and determine which things need to
    * be pulled.  We mark something as needing to be pulled by setting
    * pull_constant_loc to 0.
    */
   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* We only care about MOV_INDIRECT of a uniform */
      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
          inst->src[0].file != UNIFORM)
         continue;

      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

      /* src[2].ud is the indirect read's length in bytes; mark every
       * vec4 the access can touch.
       */
      for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
         pull_constant_loc[uniform_nr + j] = 0;
   }

   /* Next, we walk the list of uniforms and assign real pull constant
    * locations and set their corresponding entries in pull_param.
    */
   for (int j = 0; j < this->uniforms; j++) {
      if (pull_constant_loc[j] < 0)
         continue;

      pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;

      for (int i = 0; i < 4; i++) {
         stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
            = stage_prog_data->param[j * 4 + i];
      }
   }

   /* Finally, we can walk through the instructions and lower MOV_INDIRECT
    * instructions to actual uniform pulls.
    */
   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
      /* We only care about MOV_INDIRECT of a uniform */
      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
          inst->src[0].file != UNIFORM)
         continue;

      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;

      assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);

      emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
                              pull_constant_loc[uniform_nr], inst->src[1]);
      inst->remove(block);
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects.  This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}

/**
 * If \p reg is an unsigned (UD) source with its negate flag set, resolve
 * the negation through an explicit MOV into a temporary and rewrite
 * \p reg to the (modifier-free) temporary.
 */
void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   /* The MOV consumes the negate modifier; the caller then sources the
    * plain temporary.
    */
   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}

vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                           void *log_data,
                           const struct brw_sampler_prog_key_data *key_tex,
                           struct brw_vue_prog_data *prog_data,
                           const nir_shader *shader,
                           void *mem_ctx,
                           bool no_spills,
                           int shader_time_index,
                           bool debug_enabled)
   : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base,
                    debug_enabled),
     key_tex(key_tex),
     prog_data(prog_data),
     fail_msg(NULL),
     first_non_payload_grf(0),
     ubo_push_start(),
     push_length(0),
     live_analysis(this), performance_analysis(this),
     need_all_constants_in_pull_buffer(false),
     no_spills(no_spills),
     shader_time_index(shader_time_index),
     last_scratch(0)
{
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   memset(this->output_num_components, 0, sizeof(this->output_num_components));

   /* On Gfx7+ the top of GRF space is reserved (see GFX7_MRF_HACK_START). */
   this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;

   this->nir_locals = NULL;
   this->nir_ssa_values = NULL;
}


/**
 * Mark the compile as failed, recording a printf-formatted reason.
 *
 * Only the first failure is recorded; subsequent calls are ignored.
 * When debugging is enabled the message is also echoed to stderr.
 */
void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s", msg);
   }
}

} /* namespace brw */