1/* 2 * Copyright © 2011 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "brw_vec4.h" 25#include "brw_cfg.h" 26#include "brw_eu.h" 27#include "util/u_math.h" 28 29namespace brw { 30 31vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst, 32 const src_reg &src0, const src_reg &src1, 33 const src_reg &src2) 34{ 35 this->opcode = opcode; 36 this->dst = dst; 37 this->src[0] = src0; 38 this->src[1] = src1; 39 this->src[2] = src2; 40 this->saturate = false; 41 this->force_writemask_all = false; 42 this->no_dd_clear = false; 43 this->no_dd_check = false; 44 this->writes_accumulator = false; 45 this->conditional_mod = BRW_CONDITIONAL_NONE; 46 this->predicate = BRW_PREDICATE_NONE; 47 this->predicate_inverse = false; 48 this->target = 0; 49 this->shadow_compare = false; 50 this->eot = false; 51 this->ir = NULL; 52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS; 53 this->header_size = 0; 54 this->flag_subreg = 0; 55 this->mlen = 0; 56 this->base_mrf = 0; 57 this->offset = 0; 58 this->exec_size = 8; 59 this->group = 0; 60 this->size_written = (dst.file == BAD_FILE ? 61 0 : this->exec_size * type_sz(dst.type)); 62 this->annotation = NULL; 63} 64 65vec4_instruction * 66vec4_visitor::emit(vec4_instruction *inst) 67{ 68 inst->ir = this->base_ir; 69 inst->annotation = this->current_annotation; 70 71 this->instructions.push_tail(inst); 72 73 return inst; 74} 75 76vec4_instruction * 77vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst, 78 vec4_instruction *new_inst) 79{ 80 new_inst->ir = inst->ir; 81 new_inst->annotation = inst->annotation; 82 83 inst->insert_before(block, new_inst); 84 85 return inst; 86} 87 88vec4_instruction * 89vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, 90 const src_reg &src1, const src_reg &src2) 91{ 92 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2)); 93} 94 95 96vec4_instruction * 97vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, 98 const src_reg &src1) 99{ 100 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1)); 101} 102 103vec4_instruction * 104vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) 105{ 106 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0)); 107} 108 109vec4_instruction * 110vec4_visitor::emit(enum opcode opcode, const dst_reg &dst) 111{ 112 return emit(new(mem_ctx) vec4_instruction(opcode, dst)); 113} 114 115vec4_instruction * 116vec4_visitor::emit(enum opcode opcode) 117{ 118 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg())); 119} 120 121#define ALU1(op) \ 122 vec4_instruction * \ 123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \ 124 { \ 125 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \ 126 } 127 128#define ALU2(op) \ 129 vec4_instruction * \ 130 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ 131 const src_reg &src1) \ 132 { \ 133 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \ 134 src0, src1); \ 135 } 136 137#define ALU2_ACC(op) \ 138 vec4_instruction * \ 139 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ 140 const src_reg &src1) \ 141 { \ 142 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \ 143 BRW_OPCODE_##op, dst, src0, src1); \ 144 inst->writes_accumulator = true; \ 145 return inst; \ 146 } 147 148#define ALU3(op) \ 149 vec4_instruction * \ 150 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ 151 const src_reg &src1, const src_reg &src2) \ 152 { \ 153 assert(devinfo->gen >= 6); \ 154 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \ 155 src0, src1, src2); \ 156 } 157 158ALU1(NOT) 159ALU1(MOV) 160ALU1(FRC) 161ALU1(RNDD) 162ALU1(RNDE) 163ALU1(RNDZ) 164ALU1(F32TO16) 165ALU1(F16TO32) 166ALU2(ADD) 167ALU2(MUL) 168ALU2_ACC(MACH) 169ALU2(AND) 170ALU2(OR) 171ALU2(XOR) 172ALU2(DP3) 173ALU2(DP4) 174ALU2(DPH) 175ALU2(SHL) 176ALU2(SHR) 177ALU2(ASR) 178ALU3(LRP) 179ALU1(BFREV) 180ALU3(BFE) 181ALU2(BFI1) 182ALU3(BFI2) 183ALU1(FBH) 184ALU1(FBL) 185ALU1(CBIT) 186ALU3(MAD) 187ALU2_ACC(ADDC) 188ALU2_ACC(SUBB) 189ALU2(MAC) 190ALU1(DIM) 191 192/** Gen4 predicated IF. */ 193vec4_instruction * 194vec4_visitor::IF(enum brw_predicate predicate) 195{ 196 vec4_instruction *inst; 197 198 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF); 199 inst->predicate = predicate; 200 201 return inst; 202} 203 204/** Gen6 IF with embedded comparison. */ 205vec4_instruction * 206vec4_visitor::IF(src_reg src0, src_reg src1, 207 enum brw_conditional_mod condition) 208{ 209 assert(devinfo->gen == 6); 210 211 vec4_instruction *inst; 212 213 resolve_ud_negate(&src0); 214 resolve_ud_negate(&src1); 215 216 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(), 217 src0, src1); 218 inst->conditional_mod = condition; 219 220 return inst; 221} 222 223/** 224 * CMP: Sets the low bit of the destination channels with the result 225 * of the comparison, while the upper bits are undefined, and updates 226 * the flag register with the packed 16 bits of the result. 227 */ 228vec4_instruction * 229vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, 230 enum brw_conditional_mod condition) 231{ 232 vec4_instruction *inst; 233 234 /* Take the instruction: 235 * 236 * CMP null<d> src0<f> src1<f> 237 * 238 * Original gen4 does type conversion to the destination type before 239 * comparison, producing garbage results for floating point comparisons. 240 * 241 * The destination type doesn't matter on newer generations, so we set the 242 * type to match src0 so we can compact the instruction. 243 */ 244 dst.type = src0.type; 245 246 resolve_ud_negate(&src0); 247 resolve_ud_negate(&src1); 248 249 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1); 250 inst->conditional_mod = condition; 251 252 return inst; 253} 254 255vec4_instruction * 256vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index) 257{ 258 vec4_instruction *inst; 259 260 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ, 261 dst, index); 262 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1; 263 inst->mlen = 2; 264 265 return inst; 266} 267 268vec4_instruction * 269vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src, 270 const src_reg &index) 271{ 272 vec4_instruction *inst; 273 274 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE, 275 dst, src, index); 276 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen); 277 inst->mlen = 3; 278 279 return inst; 280} 281 282src_reg 283vec4_visitor::fix_3src_operand(const src_reg &src) 284{ 285 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be 286 * able to use vertical stride of zero to replicate the vec4 uniform, like 287 * 288 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7] 289 * 290 * But you can't, since vertical stride is always four in three-source 291 * instructions. Instead, insert a MOV instruction to do the replication so 292 * that the three-source instruction can consume it. 293 */ 294 295 /* The MOV is only needed if the source is a uniform or immediate. */ 296 if (src.file != UNIFORM && src.file != IMM) 297 return src; 298 299 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle)) 300 return src; 301 302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type); 303 expanded.type = src.type; 304 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src); 305 return src_reg(expanded); 306} 307 308src_reg 309vec4_visitor::resolve_source_modifiers(const src_reg &src) 310{ 311 if (!src.abs && !src.negate) 312 return src; 313 314 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type); 315 resolved.type = src.type; 316 emit(MOV(resolved, src)); 317 318 return src_reg(resolved); 319} 320 321src_reg 322vec4_visitor::fix_math_operand(const src_reg &src) 323{ 324 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE) 325 return src; 326 327 /* The gen6 math instruction ignores the source modifiers -- 328 * swizzle, abs, negate, and at least some parts of the register 329 * region description. 330 * 331 * Rather than trying to enumerate all these cases, *always* expand the 332 * operand to a temp GRF for gen6. 333 * 334 * For gen7, keep the operand as-is, except if immediate, which gen7 still 335 * can't use. 336 */ 337 338 if (devinfo->gen == 7 && src.file != IMM) 339 return src; 340 341 dst_reg expanded = dst_reg(this, glsl_type::vec4_type); 342 expanded.type = src.type; 343 emit(MOV(expanded, src)); 344 return src_reg(expanded); 345} 346 347vec4_instruction * 348vec4_visitor::emit_math(enum opcode opcode, 349 const dst_reg &dst, 350 const src_reg &src0, const src_reg &src1) 351{ 352 vec4_instruction *math = 353 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1)); 354 355 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) { 356 /* MATH on Gen6 must be align1, so we can't do writemasks. */ 357 math->dst = dst_reg(this, glsl_type::vec4_type); 358 math->dst.type = dst.type; 359 math = emit(MOV(dst, src_reg(math->dst))); 360 } else if (devinfo->gen < 6) { 361 math->base_mrf = 1; 362 math->mlen = src1.file == BAD_FILE ? 1 : 2; 363 } 364 365 return math; 366} 367 368void 369vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0) 370{ 371 if (devinfo->gen < 7) { 372 unreachable("ir_unop_pack_half_2x16 should be lowered"); 373 } 374 375 assert(dst.type == BRW_REGISTER_TYPE_UD); 376 assert(src0.type == BRW_REGISTER_TYPE_F); 377 378 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16: 379 * 380 * Because this instruction does not have a 16-bit floating-point type, 381 * the destination data type must be Word (W). 382 * 383 * The destination must be DWord-aligned and specify a horizontal stride 384 * (HorzStride) of 2. The 16-bit result is stored in the lower word of 385 * each destination channel and the upper word is not modified. 386 * 387 * The above restriction implies that the f32to16 instruction must use 388 * align1 mode, because only in align1 mode is it possible to specify 389 * horizontal stride. We choose here to defy the hardware docs and emit 390 * align16 instructions. 391 * 392 * (I [chadv] did attempt to emit align1 instructions for VS f32to16 393 * instructions. I was partially successful in that the code passed all 394 * tests. However, the code was dubiously correct and fragile, and the 395 * tests were not harsh enough to probe that frailty. Not trusting the 396 * code, I chose instead to remain in align16 mode in defiance of the hw 397 * docs). 398 * 399 * I've [chadv] experimentally confirmed that, on gen7 hardware and the 400 * simulator, emitting a f32to16 in align16 mode with UD as destination 401 * data type is safe. The behavior differs from that specified in the PRM 402 * in that the upper word of each destination channel is cleared to 0. 403 */ 404 405 dst_reg tmp_dst(this, glsl_type::uvec2_type); 406 src_reg tmp_src(tmp_dst); 407 408#if 0 409 /* Verify the undocumented behavior on which the following instructions 410 * rely. If f32to16 fails to clear the upper word of the X and Y channels, 411 * then the result of the bit-or instruction below will be incorrect. 412 * 413 * You should inspect the disasm output in order to verify that the MOV is 414 * not optimized away. 415 */ 416 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u))); 417#endif 418 419 /* Give tmp the form below, where "." means untouched. 420 * 421 * w z y x w z y x 422 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll| 423 * 424 * That the upper word of each write-channel be 0 is required for the 425 * following bit-shift and bit-or instructions to work. Note that this 426 * relies on the undocumented hardware behavior mentioned above. 427 */ 428 tmp_dst.writemask = WRITEMASK_XY; 429 emit(F32TO16(tmp_dst, src0)); 430 431 /* Give the write-channels of dst the form: 432 * 0xhhhh0000 433 */ 434 tmp_src.swizzle = BRW_SWIZZLE_YYYY; 435 emit(SHL(dst, tmp_src, brw_imm_ud(16u))); 436 437 /* Finally, give the write-channels of dst the form of packHalf2x16's 438 * output: 439 * 0xhhhhllll 440 */ 441 tmp_src.swizzle = BRW_SWIZZLE_XXXX; 442 emit(OR(dst, src_reg(dst), tmp_src)); 443} 444 445void 446vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0) 447{ 448 if (devinfo->gen < 7) { 449 unreachable("ir_unop_unpack_half_2x16 should be lowered"); 450 } 451 452 assert(dst.type == BRW_REGISTER_TYPE_F); 453 assert(src0.type == BRW_REGISTER_TYPE_UD); 454 455 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: 456 * 457 * Because this instruction does not have a 16-bit floating-point type, 458 * the source data type must be Word (W). The destination type must be 459 * F (Float). 460 * 461 * To use W as the source data type, we must adjust horizontal strides, 462 * which is only possible in align1 mode. All my [chadv] attempts at 463 * emitting align1 instructions for unpackHalf2x16 failed to pass the 464 * Piglit tests, so I gave up. 465 * 466 * I've verified that, on gen7 hardware and the simulator, it is safe to 467 * emit f16to32 in align16 mode with UD as source data type. 468 */ 469 470 dst_reg tmp_dst(this, glsl_type::uvec2_type); 471 src_reg tmp_src(tmp_dst); 472 473 tmp_dst.writemask = WRITEMASK_X; 474 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu))); 475 476 tmp_dst.writemask = WRITEMASK_Y; 477 emit(SHR(tmp_dst, src0, brw_imm_ud(16u))); 478 479 dst.writemask = WRITEMASK_XY; 480 emit(F16TO32(dst, tmp_src)); 481} 482 483void 484vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0) 485{ 486 /* Instead of splitting the 32-bit integer, shifting, and ORing it back 487 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate 488 * is not suitable to generate the shift values, but we can use the packed 489 * vector float and a type-converting MOV. 490 */ 491 dst_reg shift(this, glsl_type::uvec4_type); 492 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78))); 493 494 dst_reg shifted(this, glsl_type::uvec4_type); 495 src0.swizzle = BRW_SWIZZLE_XXXX; 496 emit(SHR(shifted, src0, src_reg(shift))); 497 498 shifted.type = BRW_REGISTER_TYPE_UB; 499 dst_reg f(this, glsl_type::vec4_type); 500 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted)); 501 502 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f))); 503} 504 505void 506vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0) 507{ 508 /* Instead of splitting the 32-bit integer, shifting, and ORing it back 509 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate 510 * is not suitable to generate the shift values, but we can use the packed 511 * vector float and a type-converting MOV. 512 */ 513 dst_reg shift(this, glsl_type::uvec4_type); 514 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78))); 515 516 dst_reg shifted(this, glsl_type::uvec4_type); 517 src0.swizzle = BRW_SWIZZLE_XXXX; 518 emit(SHR(shifted, src0, src_reg(shift))); 519 520 shifted.type = BRW_REGISTER_TYPE_B; 521 dst_reg f(this, glsl_type::vec4_type); 522 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted)); 523 524 dst_reg scaled(this, glsl_type::vec4_type); 525 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f))); 526 527 dst_reg max(this, glsl_type::vec4_type); 528 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f)); 529 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f)); 530} 531 532void 533vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0) 534{ 535 dst_reg saturated(this, glsl_type::vec4_type); 536 vec4_instruction *inst = emit(MOV(saturated, src0)); 537 inst->saturate = true; 538 539 dst_reg scaled(this, glsl_type::vec4_type); 540 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f))); 541 542 dst_reg rounded(this, glsl_type::vec4_type); 543 emit(RNDE(rounded, src_reg(scaled))); 544 545 dst_reg u(this, glsl_type::uvec4_type); 546 emit(MOV(u, src_reg(rounded))); 547 548 src_reg bytes(u); 549 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes); 550} 551 552void 553vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0) 554{ 555 dst_reg max(this, glsl_type::vec4_type); 556 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f)); 557 558 dst_reg min(this, glsl_type::vec4_type); 559 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f)); 560 561 dst_reg scaled(this, glsl_type::vec4_type); 562 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f))); 563 564 dst_reg rounded(this, glsl_type::vec4_type); 565 emit(RNDE(rounded, src_reg(scaled))); 566 567 dst_reg i(this, glsl_type::ivec4_type); 568 emit(MOV(i, src_reg(rounded))); 569 570 src_reg bytes(i); 571 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes); 572} 573 574/* 575 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 == 576 * false) elements needed to pack a type. 577 */ 578static int 579type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless) 580{ 581 unsigned int i; 582 int size; 583 584 switch (type->base_type) { 585 case GLSL_TYPE_UINT: 586 case GLSL_TYPE_INT: 587 case GLSL_TYPE_FLOAT: 588 case GLSL_TYPE_FLOAT16: 589 case GLSL_TYPE_BOOL: 590 case GLSL_TYPE_DOUBLE: 591 case GLSL_TYPE_UINT16: 592 case GLSL_TYPE_INT16: 593 case GLSL_TYPE_UINT8: 594 case GLSL_TYPE_INT8: 595 case GLSL_TYPE_UINT64: 596 case GLSL_TYPE_INT64: 597 if (type->is_matrix()) { 598 const glsl_type *col_type = type->column_type(); 599 unsigned col_slots = 600 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1; 601 return type->matrix_columns * col_slots; 602 } else { 603 /* Regardless of size of vector, it gets a vec4. This is bad 604 * packing for things like floats, but otherwise arrays become a 605 * mess. Hopefully a later pass over the code can pack scalars 606 * down if appropriate. 607 */ 608 return (as_vec4 && type->is_dual_slot()) ? 2 : 1; 609 } 610 case GLSL_TYPE_ARRAY: 611 assert(type->length > 0); 612 return type_size_xvec4(type->fields.array, as_vec4, bindless) * 613 type->length; 614 case GLSL_TYPE_STRUCT: 615 case GLSL_TYPE_INTERFACE: 616 size = 0; 617 for (i = 0; i < type->length; i++) { 618 size += type_size_xvec4(type->fields.structure[i].type, as_vec4, 619 bindless); 620 } 621 return size; 622 case GLSL_TYPE_SUBROUTINE: 623 return 1; 624 625 case GLSL_TYPE_SAMPLER: 626 /* Samplers take up no register space, since they're baked in at 627 * link time. 628 */ 629 return bindless ? 1 : 0; 630 case GLSL_TYPE_ATOMIC_UINT: 631 return 0; 632 case GLSL_TYPE_IMAGE: 633 return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4); 634 case GLSL_TYPE_VOID: 635 case GLSL_TYPE_ERROR: 636 case GLSL_TYPE_FUNCTION: 637 unreachable("not reached"); 638 } 639 640 return 0; 641} 642 643/** 644 * Returns the minimum number of vec4 elements needed to pack a type. 645 * 646 * For simple types, it will return 1 (a single vec4); for matrices, the 647 * number of columns; for array and struct, the sum of the vec4_size of 648 * each of its elements; and for sampler and atomic, zero. 649 * 650 * This method is useful to calculate how much register space is needed to 651 * store a particular type. 652 */ 653extern "C" int 654type_size_vec4(const struct glsl_type *type, bool bindless) 655{ 656 return type_size_xvec4(type, true, bindless); 657} 658 659/** 660 * Returns the minimum number of dvec4 elements needed to pack a type. 661 * 662 * For simple types, it will return 1 (a single dvec4); for matrices, the 663 * number of columns; for array and struct, the sum of the dvec4_size of 664 * each of its elements; and for sampler and atomic, zero. 665 * 666 * This method is useful to calculate how much register space is needed to 667 * store a particular type. 668 * 669 * Measuring double-precision vertex inputs as dvec4 is required because 670 * ARB_vertex_attrib_64bit states that these uses the same number of locations 671 * than the single-precision version. That is, two consecutives dvec4 would be 672 * located in location "x" and location "x+1", not "x+2". 673 * 674 * In order to map vec4/dvec4 vertex inputs in the proper ATTRs, 675 * remap_vs_attrs() will take in account both the location and also if the 676 * type fits in one or two vec4 slots. 677 */ 678extern "C" int 679type_size_dvec4(const struct glsl_type *type, bool bindless) 680{ 681 return type_size_xvec4(type, false, bindless); 682} 683 684src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type) 685{ 686 init(); 687 688 this->file = VGRF; 689 this->nr = v->alloc.allocate(type_size_vec4(type, false)); 690 691 if (type->is_array() || type->is_struct()) { 692 this->swizzle = BRW_SWIZZLE_NOOP; 693 } else { 694 this->swizzle = brw_swizzle_for_size(type->vector_elements); 695 } 696 697 this->type = brw_type_for_base_type(type); 698} 699 700src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size) 701{ 702 assert(size > 0); 703 704 init(); 705 706 this->file = VGRF; 707 this->nr = v->alloc.allocate(type_size_vec4(type, false) * size); 708 709 this->swizzle = BRW_SWIZZLE_NOOP; 710 711 this->type = brw_type_for_base_type(type); 712} 713 714dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type) 715{ 716 init(); 717 718 this->file = VGRF; 719 this->nr = v->alloc.allocate(type_size_vec4(type, false)); 720 721 if (type->is_array() || type->is_struct()) { 722 this->writemask = WRITEMASK_XYZW; 723 } else { 724 this->writemask = (1 << type->vector_elements) - 1; 725 } 726 727 this->type = brw_type_for_base_type(type); 728} 729 730vec4_instruction * 731vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst, 732 src_reg src0, src_reg src1) 733{ 734 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1); 735 inst->conditional_mod = conditionalmod; 736 return inst; 737} 738 739vec4_instruction * 740vec4_visitor::emit_lrp(const dst_reg &dst, 741 const src_reg &x, const src_reg &y, const src_reg &a) 742{ 743 if (devinfo->gen >= 6 && devinfo->gen <= 10) { 744 /* Note that the instruction's argument order is reversed from GLSL 745 * and the IR. 746 */ 747 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y), 748 fix_3src_operand(x))); 749 } else { 750 /* Earlier generations don't support three source operations, so we 751 * need to emit x*(1-a) + y*a. 752 */ 753 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type); 754 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type); 755 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type); 756 y_times_a.writemask = dst.writemask; 757 one_minus_a.writemask = dst.writemask; 758 x_times_one_minus_a.writemask = dst.writemask; 759 760 emit(MUL(y_times_a, y, a)); 761 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f))); 762 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a))); 763 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a))); 764 } 765} 766 767/** 768 * Emits the instructions needed to perform a pull constant load. before_block 769 * and before_inst can be NULL in which case the instruction will be appended 770 * to the end of the instruction list. 771 */ 772void 773vec4_visitor::emit_pull_constant_load_reg(dst_reg dst, 774 src_reg surf_index, 775 src_reg offset_reg, 776 bblock_t *before_block, 777 vec4_instruction *before_inst) 778{ 779 assert((before_inst == NULL && before_block == NULL) || 780 (before_inst && before_block)); 781 782 vec4_instruction *pull; 783 784 if (devinfo->gen >= 9) { 785 /* Gen9+ needs a message header in order to use SIMD4x2 mode */ 786 src_reg header(this, glsl_type::uvec4_type, 2); 787 788 pull = new(mem_ctx) 789 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9, 790 dst_reg(header)); 791 792 if (before_inst) 793 emit_before(before_block, before_inst, pull); 794 else 795 emit(pull); 796 797 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE), 798 offset_reg.type); 799 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg); 800 801 if (before_inst) 802 emit_before(before_block, before_inst, pull); 803 else 804 emit(pull); 805 806 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, 807 dst, 808 surf_index, 809 header); 810 pull->mlen = 2; 811 pull->header_size = 1; 812 } else if (devinfo->gen >= 7) { 813 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type); 814 815 grf_offset.type = offset_reg.type; 816 817 pull = MOV(grf_offset, offset_reg); 818 819 if (before_inst) 820 emit_before(before_block, before_inst, pull); 821 else 822 emit(pull); 823 824 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7, 825 dst, 826 surf_index, 827 src_reg(grf_offset)); 828 pull->mlen = 1; 829 } else { 830 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD, 831 dst, 832 surf_index, 833 offset_reg); 834 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1; 835 pull->mlen = 1; 836 } 837 838 if (before_inst) 839 emit_before(before_block, before_inst, pull); 840 else 841 emit(pull); 842} 843 844src_reg 845vec4_visitor::emit_uniformize(const src_reg &src) 846{ 847 const src_reg chan_index(this, glsl_type::uint_type); 848 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type), 849 src.type); 850 851 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index)) 852 ->force_writemask_all = true; 853 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index) 854 ->force_writemask_all = true; 855 856 return src_reg(dst); 857} 858 859src_reg 860vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type, 861 src_reg coordinate, src_reg surface) 862{ 863 vec4_instruction *inst = 864 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS, 865 dst_reg(this, glsl_type::uvec4_type)); 866 inst->base_mrf = 2; 867 inst->src[1] = surface; 868 inst->src[2] = brw_imm_ud(0); /* sampler */ 869 870 int param_base; 871 872 if (devinfo->gen >= 9) { 873 /* Gen9+ needs a message header in order to use SIMD4x2 mode */ 874 vec4_instruction *header_inst = new(mem_ctx) 875 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9, 876 dst_reg(MRF, inst->base_mrf)); 877 878 emit(header_inst); 879 880 inst->mlen = 2; 881 inst->header_size = 1; 882 param_base = inst->base_mrf + 1; 883 } else { 884 inst->mlen = 1; 885 param_base = inst->base_mrf; 886 } 887 888 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */ 889 int coord_mask = (1 << coordinate_type->vector_elements) - 1; 890 int zero_mask = 0xf & ~coord_mask; 891 892 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask), 893 coordinate)); 894 895 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask), 896 brw_imm_d(0))); 897 898 emit(inst); 899 return src_reg(inst->dst); 900} 901 902bool 903vec4_visitor::is_high_sampler(src_reg sampler) 904{ 905 if (devinfo->gen < 8 && !devinfo->is_haswell) 906 return false; 907 908 return sampler.file != IMM || sampler.ud >= 16; 909} 910 911void 912vec4_visitor::emit_texture(ir_texture_opcode op, 913 dst_reg dest, 914 const glsl_type *dest_type, 915 src_reg coordinate, 916 int coord_components, 917 src_reg shadow_comparator, 918 src_reg lod, src_reg lod2, 919 src_reg sample_index, 920 uint32_t constant_offset, 921 src_reg offset_value, 922 src_reg mcs, 923 uint32_t surface, 924 src_reg surface_reg, 925 src_reg sampler_reg) 926{ 927 enum opcode opcode; 928 switch (op) { 929 case ir_tex: opcode = SHADER_OPCODE_TXL; break; 930 case ir_txl: opcode = SHADER_OPCODE_TXL; break; 931 case ir_txd: opcode = SHADER_OPCODE_TXD; break; 932 case ir_txf: opcode = SHADER_OPCODE_TXF; break; 933 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W : 934 SHADER_OPCODE_TXF_CMS); break; 935 case ir_txs: opcode = SHADER_OPCODE_TXS; break; 936 case ir_tg4: opcode = offset_value.file != BAD_FILE 937 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break; 938 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break; 939 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break; 940 case ir_txb: 941 unreachable("TXB is not valid for vertex shaders."); 942 case ir_lod: 943 unreachable("LOD is not valid for vertex shaders."); 944 case ir_samples_identical: { 945 /* There are some challenges implementing this for vec4, and it seems 946 * unlikely to be used anyway. For now, just return false ways. 947 */ 948 emit(MOV(dest, brw_imm_ud(0u))); 949 return; 950 } 951 default: 952 unreachable("Unrecognized tex op"); 953 } 954 955 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest); 956 957 inst->offset = constant_offset; 958 959 /* The message header is necessary for: 960 * - Gen4 (always) 961 * - Gen9+ for selecting SIMD4x2 962 * - Texel offsets 963 * - Gather channel selection 964 * - Sampler indices too large to fit in a 4-bit value. 965 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal 966 */ 967 inst->header_size = 968 (devinfo->gen < 5 || devinfo->gen >= 9 || 969 inst->offset != 0 || op == ir_tg4 || 970 op == ir_texture_samples || 971 is_high_sampler(sampler_reg)) ? 1 : 0; 972 inst->base_mrf = 2; 973 inst->mlen = inst->header_size; 974 inst->dst.writemask = WRITEMASK_XYZW; 975 inst->shadow_compare = shadow_comparator.file != BAD_FILE; 976 977 inst->src[1] = surface_reg; 978 inst->src[2] = sampler_reg; 979 980 /* MRF for the first parameter */ 981 int param_base = inst->base_mrf + inst->header_size; 982 983 if (op == ir_txs || op == ir_query_levels) { 984 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X; 985 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod)); 986 inst->mlen++; 987 } else if (op == ir_texture_samples) { 988 inst->dst.writemask = WRITEMASK_X; 989 } else { 990 /* Load the coordinate */ 991 /* FINISHME: gl_clamp_mask and saturate */ 992 int coord_mask = (1 << coord_components) - 1; 993 int zero_mask = 0xf & ~coord_mask; 994 995 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask), 996 coordinate)); 997 inst->mlen++; 998 999 if (zero_mask != 0) { 1000 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask), 1001 brw_imm_d(0))); 1002 } 1003 /* Load the shadow comparator */ 1004 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) { 1005 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type, 1006 WRITEMASK_X), 1007 shadow_comparator)); 1008 inst->mlen++; 1009 } 1010 1011 /* Load the LOD info */ 1012 if (op == ir_tex || op == ir_txl) { 1013 int mrf, writemask; 1014 if (devinfo->gen >= 5) { 1015 mrf = param_base + 1; 1016 if (shadow_comparator.file != BAD_FILE) { 1017 writemask = WRITEMASK_Y; 1018 /* mlen already incremented */ 1019 } else { 1020 writemask = WRITEMASK_X; 1021 inst->mlen++; 1022 } 1023 } else /* devinfo->gen == 4 */ { 1024 mrf = param_base; 1025 writemask = WRITEMASK_W; 1026 } 1027 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod)); 1028 } else if (op == ir_txf) { 1029 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod)); 1030 } else if (op == ir_txf_ms) { 1031 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X), 1032 sample_index)); 1033 if (opcode == SHADER_OPCODE_TXF_CMS_W) { 1034 /* MCS data is stored in the first two channels of ‘mcs’, but we 1035 * need to get it into the .y and .z channels of the second vec4 1036 * of params. 1037 */ 1038 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1); 1039 emit(MOV(dst_reg(MRF, param_base + 1, 1040 glsl_type::uint_type, WRITEMASK_YZ), 1041 mcs)); 1042 } else if (devinfo->gen >= 7) { 1043 /* MCS data is in the first channel of `mcs`, but we need to get it into 1044 * the .y channel of the second vec4 of params, so replicate .x across 1045 * the whole vec4 and then mask off everything except .y 1046 */ 1047 mcs.swizzle = BRW_SWIZZLE_XXXX; 1048 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y), 1049 mcs)); 1050 } 1051 inst->mlen++; 1052 } else if (op == ir_txd) { 1053 const brw_reg_type type = lod.type; 1054 1055 if (devinfo->gen >= 5) { 1056 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); 1057 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y); 1058 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod)); 1059 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2)); 1060 inst->mlen++; 1061 1062 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) { 1063 lod.swizzle = BRW_SWIZZLE_ZZZZ; 1064 lod2.swizzle = BRW_SWIZZLE_ZZZZ; 1065 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod)); 1066 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2)); 1067 inst->mlen++; 1068 1069 if (shadow_comparator.file != BAD_FILE) { 1070 emit(MOV(dst_reg(MRF, param_base + 2, 1071 shadow_comparator.type, WRITEMASK_Z), 1072 shadow_comparator)); 1073 } 1074 } 1075 } else /* devinfo->gen == 4 */ { 1076 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod)); 1077 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2)); 1078 inst->mlen += 2; 1079 } 1080 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) { 1081 if (shadow_comparator.file != BAD_FILE) { 1082 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W), 1083 shadow_comparator)); 1084 } 1085 1086 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY), 1087 offset_value)); 1088 inst->mlen++; 1089 } 1090 } 1091 1092 emit(inst); 1093 1094 /* fixup num layers (z) for cube arrays: hardware returns faces * layers; 1095 * spec requires layers. 1096 */ 1097 if (op == ir_txs && devinfo->gen < 7) { 1098 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */ 1099 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z), 1100 src_reg(inst->dst), brw_imm_d(1)); 1101 } 1102 1103 if (devinfo->gen == 6 && op == ir_tg4) { 1104 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst); 1105 } 1106 1107 if (op == ir_query_levels) { 1108 /* # levels is in .w */ 1109 src_reg swizzled(dest); 1110 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, 1111 SWIZZLE_W, SWIZZLE_W); 1112 emit(MOV(dest, swizzled)); 1113 } 1114} 1115 1116/** 1117 * Apply workarounds for Gen6 gather with UINT/SINT 1118 */ 1119void 1120vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst) 1121{ 1122 if (!wa) 1123 return; 1124 1125 int width = (wa & WA_8BIT) ? 8 : 16; 1126 dst_reg dst_f = dst; 1127 dst_f.type = BRW_REGISTER_TYPE_F; 1128 1129 /* Convert from UNORM to UINT */ 1130 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1)))); 1131 emit(MOV(dst, src_reg(dst_f))); 1132 1133 if (wa & WA_SIGN) { 1134 /* Reinterpret the UINT value as a signed INT value by 1135 * shifting the sign bit into place, then shifting back 1136 * preserving sign. 1137 */ 1138 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width))); 1139 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width))); 1140 } 1141} 1142 1143void 1144vec4_visitor::gs_emit_vertex(int /* stream_id */) 1145{ 1146 unreachable("not reached"); 1147} 1148 1149void 1150vec4_visitor::gs_end_primitive() 1151{ 1152 unreachable("not reached"); 1153} 1154 1155void 1156vec4_visitor::emit_ndc_computation() 1157{ 1158 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE) 1159 return; 1160 1161 /* Get the position */ 1162 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]); 1163 1164 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */ 1165 dst_reg ndc = dst_reg(this, glsl_type::vec4_type); 1166 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc; 1167 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4; 1168 1169 current_annotation = "NDC"; 1170 dst_reg ndc_w = ndc; 1171 ndc_w.writemask = WRITEMASK_W; 1172 src_reg pos_w = pos; 1173 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W); 1174 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w); 1175 1176 dst_reg ndc_xyz = ndc; 1177 ndc_xyz.writemask = WRITEMASK_XYZ; 1178 1179 emit(MUL(ndc_xyz, pos, src_reg(ndc_w))); 1180} 1181 1182void 1183vec4_visitor::emit_psiz_and_flags(dst_reg reg) 1184{ 1185 if (devinfo->gen < 6 && 1186 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) || 1187 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE || 1188 devinfo->has_negative_rhw_bug)) { 1189 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type); 1190 dst_reg header1_w = header1; 1191 header1_w.writemask = WRITEMASK_W; 1192 1193 emit(MOV(header1, brw_imm_ud(0u))); 1194 1195 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) { 1196 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); 1197 1198 current_annotation = "Point size"; 1199 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11)))); 1200 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8))); 1201 } 1202 1203 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) { 1204 current_annotation = "Clipping flags"; 1205 dst_reg flags0 = dst_reg(this, glsl_type::uint_type); 1206 1207 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L)); 1208 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0)); 1209 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0))); 1210 } 1211 1212 if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) { 1213 dst_reg flags1 = dst_reg(this, glsl_type::uint_type); 1214 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L)); 1215 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0)); 1216 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4))); 1217 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1))); 1218 } 1219 1220 /* i965 clipping workaround: 1221 * 1) Test for -ve rhw 1222 * 2) If set, 1223 * set ndc = (0,0,0,0) 1224 * set ucp[6] = 1 1225 * 1226 * Later, clipping will detect ucp[6] and ensure the primitive is 1227 * clipped against all fixed planes. 1228 */ 1229 if (devinfo->has_negative_rhw_bug && 1230 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) { 1231 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]); 1232 ndc_w.swizzle = BRW_SWIZZLE_WWWW; 1233 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L)); 1234 vec4_instruction *inst; 1235 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6))); 1236 inst->predicate = BRW_PREDICATE_NORMAL; 1237 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F; 1238 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f))); 1239 inst->predicate = BRW_PREDICATE_NORMAL; 1240 } 1241 1242 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1))); 1243 } else if (devinfo->gen < 6) { 1244 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u))); 1245 } else { 1246 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0))); 1247 if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) { 1248 dst_reg reg_w = reg; 1249 reg_w.writemask = WRITEMASK_W; 1250 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); 1251 reg_as_src.type = reg_w.type; 1252 reg_as_src.swizzle = brw_swizzle_for_size(1); 1253 emit(MOV(reg_w, reg_as_src)); 1254 } 1255 if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) { 1256 dst_reg reg_y = reg; 1257 reg_y.writemask = WRITEMASK_Y; 1258 reg_y.type = BRW_REGISTER_TYPE_D; 1259 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type; 1260 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0]))); 1261 } 1262 if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) { 1263 dst_reg reg_z = reg; 1264 reg_z.writemask = WRITEMASK_Z; 1265 reg_z.type = BRW_REGISTER_TYPE_D; 1266 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type; 1267 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0]))); 1268 } 1269 } 1270} 1271 1272vec4_instruction * 1273vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component) 1274{ 1275 assert(varying < VARYING_SLOT_MAX); 1276 1277 unsigned num_comps = output_num_components[varying][component]; 1278 if (num_comps == 0) 1279 return NULL; 1280 1281 assert(output_reg[varying][component].type == reg.type); 1282 current_annotation = output_reg_annotation[varying]; 1283 if (output_reg[varying][component].file != BAD_FILE) { 1284 src_reg src = src_reg(output_reg[varying][component]); 1285 src.swizzle = BRW_SWZ_COMP_OUTPUT(component); 1286 reg.writemask = 1287 brw_writemask_for_component_packing(num_comps, component); 1288 return emit(MOV(reg, src)); 1289 } 1290 return NULL; 1291} 1292 1293void 1294vec4_visitor::emit_urb_slot(dst_reg reg, int varying) 1295{ 1296 reg.type = BRW_REGISTER_TYPE_F; 1297 output_reg[varying][0].type = reg.type; 1298 1299 switch (varying) { 1300 case VARYING_SLOT_PSIZ: 1301 { 1302 /* PSIZ is always in slot 0, and is coupled with other flags. */ 1303 current_annotation = "indices, point width, clip flags"; 1304 emit_psiz_and_flags(reg); 1305 break; 1306 } 1307 case BRW_VARYING_SLOT_NDC: 1308 current_annotation = "NDC"; 1309 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) 1310 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]))); 1311 break; 1312 case VARYING_SLOT_POS: 1313 current_annotation = "gl_Position"; 1314 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE) 1315 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0]))); 1316 break; 1317 case VARYING_SLOT_EDGE: { 1318 /* This is present when doing unfilled polygons. We're supposed to copy 1319 * the edge flag from the user-provided vertex array 1320 * (glEdgeFlagPointer), or otherwise we'll copy from the current value 1321 * of that attribute (starts as 1.0f). This is then used in clipping to 1322 * determine which edges should be drawn as wireframe. 1323 */ 1324 current_annotation = "edge flag"; 1325 int edge_attr = util_bitcount64(nir->info.inputs_read & 1326 BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG)); 1327 emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr, 1328 glsl_type::float_type, WRITEMASK_XYZW)))); 1329 break; 1330 } 1331 case BRW_VARYING_SLOT_PAD: 1332 /* No need to write to this slot */ 1333 break; 1334 default: 1335 for (int i = 0; i < 4; i++) { 1336 emit_generic_urb_slot(reg, varying, i); 1337 } 1338 break; 1339 } 1340} 1341 1342static unsigned 1343align_interleaved_urb_mlen(const struct gen_device_info *devinfo, unsigned mlen) 1344{ 1345 if (devinfo->gen >= 6) { 1346 /* URB data written (does not include the message header reg) must 1347 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5, 1348 * section 5.4.3.2.2: URB_INTERLEAVED. 1349 * 1350 * URB entries are allocated on a multiple of 1024 bits, so an 1351 * extra 128 bits written here to make the end align to 256 is 1352 * no problem. 1353 */ 1354 if ((mlen % 2) != 1) 1355 mlen++; 1356 } 1357 1358 return mlen; 1359} 1360 1361 1362/** 1363 * Generates the VUE payload plus the necessary URB write instructions to 1364 * output it. 1365 * 1366 * The VUE layout is documented in Volume 2a. 1367 */ 1368void 1369vec4_visitor::emit_vertex() 1370{ 1371 /* MRF 0 is reserved for the debugger, so start with message header 1372 * in MRF 1. 1373 */ 1374 int base_mrf = 1; 1375 int mrf = base_mrf; 1376 /* In the process of generating our URB write message contents, we 1377 * may need to unspill a register or load from an array. Those 1378 * reads would use MRFs 14-15. 1379 */ 1380 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen); 1381 1382 /* The following assertion verifies that max_usable_mrf causes an 1383 * even-numbered amount of URB write data, which will meet gen6's 1384 * requirements for length alignment. 1385 */ 1386 assert ((max_usable_mrf - base_mrf) % 2 == 0); 1387 1388 /* First mrf is the g0-based message header containing URB handles and 1389 * such. 1390 */ 1391 emit_urb_write_header(mrf++); 1392 1393 if (devinfo->gen < 6) { 1394 emit_ndc_computation(); 1395 } 1396 1397 /* We may need to split this up into several URB writes, so do them in a 1398 * loop. 1399 */ 1400 int slot = 0; 1401 bool complete = false; 1402 do { 1403 /* URB offset is in URB row increments, and each of our MRFs is half of 1404 * one of those, since we're doing interleaved writes. 1405 */ 1406 int offset = slot / 2; 1407 1408 mrf = base_mrf + 1; 1409 for (; slot < prog_data->vue_map.num_slots; ++slot) { 1410 emit_urb_slot(dst_reg(MRF, mrf++), 1411 prog_data->vue_map.slot_to_varying[slot]); 1412 1413 /* If this was max_usable_mrf, we can't fit anything more into this 1414 * URB WRITE. Same thing if we reached the maximum length available. 1415 */ 1416 if (mrf > max_usable_mrf || 1417 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) { 1418 slot++; 1419 break; 1420 } 1421 } 1422 1423 complete = slot >= prog_data->vue_map.num_slots; 1424 current_annotation = "URB write"; 1425 vec4_instruction *inst = emit_urb_write_opcode(complete); 1426 inst->base_mrf = base_mrf; 1427 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf); 1428 inst->offset += offset; 1429 } while(!complete); 1430} 1431 1432 1433src_reg 1434vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst, 1435 src_reg *reladdr, int reg_offset) 1436{ 1437 /* Because we store the values to scratch interleaved like our 1438 * vertex data, we need to scale the vec4 index by 2. 1439 */ 1440 int message_header_scale = 2; 1441 1442 /* Pre-gen6, the message header uses byte offsets instead of vec4 1443 * (16-byte) offset units. 1444 */ 1445 if (devinfo->gen < 6) 1446 message_header_scale *= 16; 1447 1448 if (reladdr) { 1449 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have 1450 * to multiply the reladdr by 2. Notice that the reg_offset part 1451 * is in units of 16 bytes and is used to select the low/high 16-byte 1452 * chunk of a full dvec4, so we don't want to multiply that part. 1453 */ 1454 src_reg index = src_reg(this, glsl_type::int_type); 1455 if (type_sz(inst->dst.type) < 8) { 1456 emit_before(block, inst, ADD(dst_reg(index), *reladdr, 1457 brw_imm_d(reg_offset))); 1458 emit_before(block, inst, MUL(dst_reg(index), index, 1459 brw_imm_d(message_header_scale))); 1460 } else { 1461 emit_before(block, inst, MUL(dst_reg(index), *reladdr, 1462 brw_imm_d(message_header_scale * 2))); 1463 emit_before(block, inst, ADD(dst_reg(index), index, 1464 brw_imm_d(reg_offset * message_header_scale))); 1465 } 1466 return index; 1467 } else { 1468 return brw_imm_d(reg_offset * message_header_scale); 1469 } 1470} 1471 1472/** 1473 * Emits an instruction before @inst to load the value named by @orig_src 1474 * from scratch space at @base_offset to @temp. 1475 * 1476 * @base_offset is measured in 32-byte units (the size of a register). 1477 */ 1478void 1479vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst, 1480 dst_reg temp, src_reg orig_src, 1481 int base_offset) 1482{ 1483 assert(orig_src.offset % REG_SIZE == 0); 1484 int reg_offset = base_offset + orig_src.offset / REG_SIZE; 1485 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr, 1486 reg_offset); 1487 1488 if (type_sz(orig_src.type) < 8) { 1489 emit_before(block, inst, SCRATCH_READ(temp, index)); 1490 } else { 1491 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type); 1492 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F); 1493 emit_before(block, inst, SCRATCH_READ(shuffled_float, index)); 1494 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1); 1495 vec4_instruction *last_read = 1496 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index); 1497 emit_before(block, inst, last_read); 1498 shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read); 1499 } 1500} 1501 1502/** 1503 * Emits an instruction after @inst to store the value to be written 1504 * to @orig_dst to scratch space at @base_offset, from @temp. 1505 * 1506 * @base_offset is measured in 32-byte units (the size of a register). 1507 */ 1508void 1509vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst, 1510 int base_offset) 1511{ 1512 assert(inst->dst.offset % REG_SIZE == 0); 1513 int reg_offset = base_offset + inst->dst.offset / REG_SIZE; 1514 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr, 1515 reg_offset); 1516 1517 /* Create a temporary register to store *inst's result in. 1518 * 1519 * We have to be careful in MOVing from our temporary result register in 1520 * the scratch write. If we swizzle from channels of the temporary that 1521 * weren't initialized, it will confuse live interval analysis, which will 1522 * make spilling fail to make progress. 1523 */ 1524 bool is_64bit = type_sz(inst->dst.type) == 8; 1525 const glsl_type *alloc_type = 1526 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type; 1527 const src_reg temp = swizzle(retype(src_reg(this, alloc_type), 1528 inst->dst.type), 1529 brw_swizzle_for_mask(inst->dst.writemask)); 1530 1531 if (!is_64bit) { 1532 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), 1533 inst->dst.writemask)); 1534 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index); 1535 if (inst->opcode != BRW_OPCODE_SEL) 1536 write->predicate = inst->predicate; 1537 write->ir = inst->ir; 1538 write->annotation = inst->annotation; 1539 inst->insert_after(block, write); 1540 } else { 1541 dst_reg shuffled = dst_reg(this, alloc_type); 1542 vec4_instruction *last = 1543 shuffle_64bit_data(shuffled, temp, true, block, inst); 1544 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F)); 1545 1546 uint8_t mask = 0; 1547 if (inst->dst.writemask & WRITEMASK_X) 1548 mask |= WRITEMASK_XY; 1549 if (inst->dst.writemask & WRITEMASK_Y) 1550 mask |= WRITEMASK_ZW; 1551 if (mask) { 1552 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask)); 1553 1554 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index); 1555 if (inst->opcode != BRW_OPCODE_SEL) 1556 write->predicate = inst->predicate; 1557 write->ir = inst->ir; 1558 write->annotation = inst->annotation; 1559 last->insert_after(block, write); 1560 } 1561 1562 mask = 0; 1563 if (inst->dst.writemask & WRITEMASK_Z) 1564 mask |= WRITEMASK_XY; 1565 if (inst->dst.writemask & WRITEMASK_W) 1566 mask |= WRITEMASK_ZW; 1567 if (mask) { 1568 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask)); 1569 1570 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr, 1571 reg_offset + 1); 1572 vec4_instruction *write = 1573 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index); 1574 if (inst->opcode != BRW_OPCODE_SEL) 1575 write->predicate = inst->predicate; 1576 write->ir = inst->ir; 1577 write->annotation = inst->annotation; 1578 last->insert_after(block, write); 1579 } 1580 } 1581 1582 inst->dst.file = temp.file; 1583 inst->dst.nr = temp.nr; 1584 inst->dst.offset %= REG_SIZE; 1585 inst->dst.reladdr = NULL; 1586} 1587 1588/** 1589 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so, 1590 * adds the scratch read(s) before \p inst. The function also checks for 1591 * recursive reladdr scratch accesses, issuing the corresponding scratch 1592 * loads and rewriting reladdr references accordingly. 1593 * 1594 * \return \p src if it did not require a scratch load, otherwise, the 1595 * register holding the result of the scratch load that the caller should 1596 * use to rewrite src. 1597 */ 1598src_reg 1599vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block, 1600 vec4_instruction *inst, src_reg src) 1601{ 1602 /* Resolve recursive reladdr scratch access by calling ourselves 1603 * with src.reladdr 1604 */ 1605 if (src.reladdr) 1606 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst, 1607 *src.reladdr); 1608 1609 /* Now handle scratch access on src */ 1610 if (src.file == VGRF && scratch_loc[src.nr] != -1) { 1611 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ? 1612 glsl_type::dvec4_type : glsl_type::vec4_type); 1613 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]); 1614 src.nr = temp.nr; 1615 src.offset %= REG_SIZE; 1616 src.reladdr = NULL; 1617 } 1618 1619 return src; 1620} 1621 1622/** 1623 * We can't generally support array access in GRF space, because a 1624 * single instruction's destination can only span 2 contiguous 1625 * registers. So, we send all GRF arrays that get variable index 1626 * access to scratch space. 1627 */ 1628void 1629vec4_visitor::move_grf_array_access_to_scratch() 1630{ 1631 int scratch_loc[this->alloc.count]; 1632 memset(scratch_loc, -1, sizeof(scratch_loc)); 1633 1634 /* First, calculate the set of virtual GRFs that need to be punted 1635 * to scratch due to having any array access on them, and where in 1636 * scratch. 1637 */ 1638 foreach_block_and_inst(block, vec4_instruction, inst, cfg) { 1639 if (inst->dst.file == VGRF && inst->dst.reladdr) { 1640 if (scratch_loc[inst->dst.nr] == -1) { 1641 scratch_loc[inst->dst.nr] = last_scratch; 1642 last_scratch += this->alloc.sizes[inst->dst.nr]; 1643 } 1644 1645 for (src_reg *iter = inst->dst.reladdr; 1646 iter->reladdr; 1647 iter = iter->reladdr) { 1648 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { 1649 scratch_loc[iter->nr] = last_scratch; 1650 last_scratch += this->alloc.sizes[iter->nr]; 1651 } 1652 } 1653 } 1654 1655 for (int i = 0 ; i < 3; i++) { 1656 for (src_reg *iter = &inst->src[i]; 1657 iter->reladdr; 1658 iter = iter->reladdr) { 1659 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { 1660 scratch_loc[iter->nr] = last_scratch; 1661 last_scratch += this->alloc.sizes[iter->nr]; 1662 } 1663 } 1664 } 1665 } 1666 1667 /* Now, for anything that will be accessed through scratch, rewrite 1668 * it to load/store. Note that this is a _safe list walk, because 1669 * we may generate a new scratch_write instruction after the one 1670 * we're processing. 1671 */ 1672 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { 1673 /* Set up the annotation tracking for new generated instructions. */ 1674 base_ir = inst->ir; 1675 current_annotation = inst->annotation; 1676 1677 /* First handle scratch access on the dst. Notice we have to handle 1678 * the case where the dst's reladdr also points to scratch space. 1679 */ 1680 if (inst->dst.reladdr) 1681 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst, 1682 *inst->dst.reladdr); 1683 1684 /* Now that we have handled any (possibly recursive) reladdr scratch 1685 * accesses for dst we can safely do the scratch write for dst itself 1686 */ 1687 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1) 1688 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]); 1689 1690 /* Now handle scratch access on any src. In this case, since inst->src[i] 1691 * already is a src_reg, we can just call emit_resolve_reladdr with 1692 * inst->src[i] and it will take care of handling scratch loads for 1693 * both src and src.reladdr (recursively). 1694 */ 1695 for (int i = 0 ; i < 3; i++) { 1696 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst, 1697 inst->src[i]); 1698 } 1699 } 1700} 1701 1702/** 1703 * Emits an instruction before @inst to load the value named by @orig_src 1704 * from the pull constant buffer (surface) at @base_offset to @temp. 1705 */ 1706void 1707vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst, 1708 dst_reg temp, src_reg orig_src, 1709 int base_offset, src_reg indirect) 1710{ 1711 assert(orig_src.offset % 16 == 0); 1712 const unsigned index = prog_data->base.binding_table.pull_constants_start; 1713 1714 /* For 64bit loads we need to emit two 32-bit load messages and we also 1715 * we need to shuffle the 32-bit data result into proper 64-bit data. To do 1716 * that we emit the 32-bit loads into a temporary and we shuffle the result 1717 * into the original destination. 1718 */ 1719 dst_reg orig_temp = temp; 1720 bool is_64bit = type_sz(orig_src.type) == 8; 1721 if (is_64bit) { 1722 assert(type_sz(temp.type) == 8); 1723 dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type); 1724 temp = retype(temp_df, BRW_REGISTER_TYPE_F); 1725 } 1726 1727 src_reg src = orig_src; 1728 for (int i = 0; i < (is_64bit ? 2 : 1); i++) { 1729 int reg_offset = base_offset + src.offset / 16; 1730 1731 src_reg offset; 1732 if (indirect.file != BAD_FILE) { 1733 offset = src_reg(this, glsl_type::uint_type); 1734 emit_before(block, inst, ADD(dst_reg(offset), indirect, 1735 brw_imm_ud(reg_offset * 16))); 1736 } else if (devinfo->gen >= 8) { 1737 /* Store the offset in a GRF so we can send-from-GRF. */ 1738 offset = src_reg(this, glsl_type::uint_type); 1739 emit_before(block, inst, MOV(dst_reg(offset), 1740 brw_imm_ud(reg_offset * 16))); 1741 } else { 1742 offset = brw_imm_d(reg_offset * 16); 1743 } 1744 1745 emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE), 1746 brw_imm_ud(index), 1747 offset, 1748 block, inst); 1749 1750 src = byte_offset(src, 16); 1751 } 1752 1753 if (is_64bit) { 1754 temp = retype(temp, BRW_REGISTER_TYPE_DF); 1755 shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst); 1756 } 1757} 1758 1759/** 1760 * Implements array access of uniforms by inserting a 1761 * PULL_CONSTANT_LOAD instruction. 1762 * 1763 * Unlike temporary GRF array access (where we don't support it due to 1764 * the difficulty of doing relative addressing on instruction 1765 * destinations), we could potentially do array access of uniforms 1766 * that were loaded in GRF space as push constants. In real-world 1767 * usage we've seen, though, the arrays being used are always larger 1768 * than we could load as push constants, so just always move all 1769 * uniform array access out to a pull constant buffer. 1770 */ 1771void 1772vec4_visitor::move_uniform_array_access_to_pull_constants() 1773{ 1774 /* The vulkan dirver doesn't support pull constants other than UBOs so 1775 * everything has to be pushed regardless. 1776 */ 1777 if (!compiler->supports_pull_constants) { 1778 split_uniform_registers(); 1779 return; 1780 } 1781 1782 /* Allocate the pull_params array */ 1783 assert(stage_prog_data->nr_pull_params == 0); 1784 stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t, 1785 this->uniforms * 4); 1786 1787 int pull_constant_loc[this->uniforms]; 1788 memset(pull_constant_loc, -1, sizeof(pull_constant_loc)); 1789 1790 /* First, walk through the instructions and determine which things need to 1791 * be pulled. We mark something as needing to be pulled by setting 1792 * pull_constant_loc to 0. 1793 */ 1794 foreach_block_and_inst(block, vec4_instruction, inst, cfg) { 1795 /* We only care about MOV_INDIRECT of a uniform */ 1796 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT || 1797 inst->src[0].file != UNIFORM) 1798 continue; 1799 1800 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16; 1801 1802 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++) 1803 pull_constant_loc[uniform_nr + j] = 0; 1804 } 1805 1806 /* Next, we walk the list of uniforms and assign real pull constant 1807 * locations and set their corresponding entries in pull_param. 1808 */ 1809 for (int j = 0; j < this->uniforms; j++) { 1810 if (pull_constant_loc[j] < 0) 1811 continue; 1812 1813 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4; 1814 1815 for (int i = 0; i < 4; i++) { 1816 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] 1817 = stage_prog_data->param[j * 4 + i]; 1818 } 1819 } 1820 1821 /* Finally, we can walk through the instructions and lower MOV_INDIRECT 1822 * instructions to actual uniform pulls. 1823 */ 1824 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { 1825 /* We only care about MOV_INDIRECT of a uniform */ 1826 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT || 1827 inst->src[0].file != UNIFORM) 1828 continue; 1829 1830 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16; 1831 1832 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP); 1833 1834 emit_pull_constant_load(block, inst, inst->dst, inst->src[0], 1835 pull_constant_loc[uniform_nr], inst->src[1]); 1836 inst->remove(block); 1837 } 1838 1839 /* Now there are no accesses of the UNIFORM file with a reladdr, so 1840 * no need to track them as larger-than-vec4 objects. This will be 1841 * relied on in cutting out unused uniform vectors from push 1842 * constants. 1843 */ 1844 split_uniform_registers(); 1845} 1846 1847void 1848vec4_visitor::resolve_ud_negate(src_reg *reg) 1849{ 1850 if (reg->type != BRW_REGISTER_TYPE_UD || 1851 !reg->negate) 1852 return; 1853 1854 src_reg temp = src_reg(this, glsl_type::uvec4_type); 1855 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg); 1856 *reg = temp; 1857} 1858 1859vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, 1860 void *log_data, 1861 const struct brw_sampler_prog_key_data *key_tex, 1862 struct brw_vue_prog_data *prog_data, 1863 const nir_shader *shader, 1864 void *mem_ctx, 1865 bool no_spills, 1866 int shader_time_index) 1867 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base), 1868 key_tex(key_tex), 1869 prog_data(prog_data), 1870 fail_msg(NULL), 1871 first_non_payload_grf(0), 1872 need_all_constants_in_pull_buffer(false), 1873 no_spills(no_spills), 1874 shader_time_index(shader_time_index), 1875 last_scratch(0) 1876{ 1877 this->failed = false; 1878 1879 this->base_ir = NULL; 1880 this->current_annotation = NULL; 1881 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation)); 1882 1883 memset(this->output_num_components, 0, sizeof(this->output_num_components)); 1884 1885 this->virtual_grf_start = NULL; 1886 this->virtual_grf_end = NULL; 1887 this->live_intervals = NULL; 1888 1889 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF; 1890 1891 this->uniforms = 0; 1892 1893 this->nir_locals = NULL; 1894 this->nir_ssa_values = NULL; 1895} 1896 1897 1898void 1899vec4_visitor::fail(const char *format, ...) 1900{ 1901 va_list va; 1902 char *msg; 1903 1904 if (failed) 1905 return; 1906 1907 failed = true; 1908 1909 va_start(va, format); 1910 msg = ralloc_vasprintf(mem_ctx, format, va); 1911 va_end(va); 1912 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg); 1913 1914 this->fail_msg = msg; 1915 1916 if (debug_enabled) { 1917 fprintf(stderr, "%s", msg); 1918 } 1919} 1920 1921} /* namespace brw */ 1922