1/* Copyright © 2011 Intel Corporation 2 * 3 * Permission is hereby granted, free of charge, to any person obtaining a 4 * copy of this software and associated documentation files (the "Software"), 5 * to deal in the Software without restriction, including without limitation 6 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 7 * and/or sell copies of the Software, and to permit persons to whom the 8 * Software is furnished to do so, subject to the following conditions: 9 * 10 * The above copyright notice and this permission notice (including the next 11 * paragraph) shall be included in all copies or substantial portions of the 12 * Software. 13 * 14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 20 * IN THE SOFTWARE. 21 */ 22 23#include "brw_vec4.h" 24#include "brw_cfg.h" 25#include "brw_eu.h" 26#include "dev/gen_debug.h" 27 28using namespace brw; 29 30static void 31generate_math1_gen4(struct brw_codegen *p, 32 vec4_instruction *inst, 33 struct brw_reg dst, 34 struct brw_reg src) 35{ 36 gen4_math(p, 37 dst, 38 brw_math_function(inst->opcode), 39 inst->base_mrf, 40 src, 41 BRW_MATH_PRECISION_FULL); 42} 43 44static void 45check_gen6_math_src_arg(struct brw_reg src) 46{ 47 /* Source swizzles are ignored. */ 48 assert(!src.abs); 49 assert(!src.negate); 50 assert(src.swizzle == BRW_SWIZZLE_XYZW); 51} 52 53static void 54generate_math_gen6(struct brw_codegen *p, 55 vec4_instruction *inst, 56 struct brw_reg dst, 57 struct brw_reg src0, 58 struct brw_reg src1) 59{ 60 /* Can't do writemask because math can't be align16. 
*/ 61 assert(dst.writemask == WRITEMASK_XYZW); 62 /* Source swizzles are ignored. */ 63 check_gen6_math_src_arg(src0); 64 if (src1.file == BRW_GENERAL_REGISTER_FILE) 65 check_gen6_math_src_arg(src1); 66 67 brw_set_default_access_mode(p, BRW_ALIGN_1); 68 gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1); 69 brw_set_default_access_mode(p, BRW_ALIGN_16); 70} 71 72static void 73generate_math2_gen4(struct brw_codegen *p, 74 vec4_instruction *inst, 75 struct brw_reg dst, 76 struct brw_reg src0, 77 struct brw_reg src1) 78{ 79 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 80 * "Message Payload": 81 * 82 * "Operand0[7]. For the INT DIV functions, this operand is the 83 * denominator." 84 * ... 85 * "Operand1[7]. For the INT DIV functions, this operand is the 86 * numerator." 87 */ 88 bool is_int_div = inst->opcode != SHADER_OPCODE_POW; 89 struct brw_reg &op0 = is_int_div ? src1 : src0; 90 struct brw_reg &op1 = is_int_div ? src0 : src1; 91 92 brw_push_insn_state(p); 93 brw_set_default_saturate(p, false); 94 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 95 brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1); 96 brw_pop_insn_state(p); 97 98 gen4_math(p, 99 dst, 100 brw_math_function(inst->opcode), 101 inst->base_mrf, 102 op0, 103 BRW_MATH_PRECISION_FULL); 104} 105 106static void 107generate_tex(struct brw_codegen *p, 108 struct brw_vue_prog_data *prog_data, 109 gl_shader_stage stage, 110 vec4_instruction *inst, 111 struct brw_reg dst, 112 struct brw_reg src, 113 struct brw_reg surface_index, 114 struct brw_reg sampler_index) 115{ 116 const struct gen_device_info *devinfo = p->devinfo; 117 int msg_type = -1; 118 119 if (devinfo->gen >= 5) { 120 switch (inst->opcode) { 121 case SHADER_OPCODE_TEX: 122 case SHADER_OPCODE_TXL: 123 if (inst->shadow_compare) { 124 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; 125 } else { 126 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD; 127 } 128 break; 129 case SHADER_OPCODE_TXD: 130 if 
(inst->shadow_compare) { 131 /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). */ 132 assert(devinfo->gen >= 8 || devinfo->is_haswell); 133 msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE; 134 } else { 135 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS; 136 } 137 break; 138 case SHADER_OPCODE_TXF: 139 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 140 break; 141 case SHADER_OPCODE_TXF_CMS_W: 142 assert(devinfo->gen >= 9); 143 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W; 144 break; 145 case SHADER_OPCODE_TXF_CMS: 146 if (devinfo->gen >= 7) 147 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS; 148 else 149 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD; 150 break; 151 case SHADER_OPCODE_TXF_MCS: 152 assert(devinfo->gen >= 7); 153 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; 154 break; 155 case SHADER_OPCODE_TXS: 156 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO; 157 break; 158 case SHADER_OPCODE_TG4: 159 if (inst->shadow_compare) { 160 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C; 161 } else { 162 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4; 163 } 164 break; 165 case SHADER_OPCODE_TG4_OFFSET: 166 if (inst->shadow_compare) { 167 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C; 168 } else { 169 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; 170 } 171 break; 172 case SHADER_OPCODE_SAMPLEINFO: 173 msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; 174 break; 175 default: 176 unreachable("should not get here: invalid vec4 texture opcode"); 177 } 178 } else { 179 switch (inst->opcode) { 180 case SHADER_OPCODE_TEX: 181 case SHADER_OPCODE_TXL: 182 if (inst->shadow_compare) { 183 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE; 184 assert(inst->mlen == 3); 185 } else { 186 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD; 187 assert(inst->mlen == 2); 188 } 189 break; 190 case SHADER_OPCODE_TXD: 191 /* There is no sample_d_c message; comparisons are done manually. 
*/ 192 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS; 193 assert(inst->mlen == 4); 194 break; 195 case SHADER_OPCODE_TXF: 196 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD; 197 assert(inst->mlen == 2); 198 break; 199 case SHADER_OPCODE_TXS: 200 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO; 201 assert(inst->mlen == 2); 202 break; 203 default: 204 unreachable("should not get here: invalid vec4 texture opcode"); 205 } 206 } 207 208 assert(msg_type != -1); 209 210 assert(sampler_index.type == BRW_REGISTER_TYPE_UD); 211 212 /* Load the message header if present. If there's a texture offset, we need 213 * to set it up explicitly and load the offset bitfield. Otherwise, we can 214 * use an implied move from g0 to the first message register. 215 */ 216 if (inst->header_size != 0) { 217 if (devinfo->gen < 6 && !inst->offset) { 218 /* Set up an implied move from g0 to the MRF. */ 219 src = brw_vec8_grf(0, 0); 220 } else { 221 struct brw_reg header = 222 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD); 223 uint32_t dw2 = 0; 224 225 /* Explicitly set up the message header by copying g0 to the MRF. */ 226 brw_push_insn_state(p); 227 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 228 brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 229 230 brw_set_default_access_mode(p, BRW_ALIGN_1); 231 232 if (inst->offset) 233 /* Set the texel offset bits in DWord 2. */ 234 dw2 = inst->offset; 235 236 if (devinfo->gen >= 9) 237 /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D, 238 * based on bit 22 in the header. 239 */ 240 dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2; 241 242 /* The VS, DS, and FS stages have the g0.2 payload delivered as 0, 243 * so header0.2 is 0 when g0 is copied. The HS and GS stages do 244 * not, so we must set to to 0 to avoid setting undesirable bits 245 * in the message header. 
246 */ 247 if (dw2 || 248 stage == MESA_SHADER_TESS_CTRL || 249 stage == MESA_SHADER_GEOMETRY) { 250 brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2)); 251 } 252 253 brw_adjust_sampler_state_pointer(p, header, sampler_index); 254 brw_pop_insn_state(p); 255 } 256 } 257 258 uint32_t return_format; 259 260 switch (dst.type) { 261 case BRW_REGISTER_TYPE_D: 262 return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32; 263 break; 264 case BRW_REGISTER_TYPE_UD: 265 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; 266 break; 267 default: 268 return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; 269 break; 270 } 271 272 uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 || 273 inst->opcode == SHADER_OPCODE_TG4_OFFSET) 274 ? prog_data->base.binding_table.gather_texture_start 275 : prog_data->base.binding_table.texture_start; 276 277 if (surface_index.file == BRW_IMMEDIATE_VALUE && 278 sampler_index.file == BRW_IMMEDIATE_VALUE) { 279 uint32_t surface = surface_index.ud; 280 uint32_t sampler = sampler_index.ud; 281 282 brw_SAMPLE(p, 283 dst, 284 inst->base_mrf, 285 src, 286 surface + base_binding_table_index, 287 sampler % 16, 288 msg_type, 289 1, /* response length */ 290 inst->mlen, 291 inst->header_size != 0, 292 BRW_SAMPLER_SIMD_MODE_SIMD4X2, 293 return_format); 294 } else { 295 /* Non-constant sampler index. 
*/ 296 297 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); 298 struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD)); 299 struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD)); 300 301 brw_push_insn_state(p); 302 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 303 brw_set_default_access_mode(p, BRW_ALIGN_1); 304 305 if (brw_regs_equal(&surface_reg, &sampler_reg)) { 306 brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101)); 307 } else { 308 if (sampler_reg.file == BRW_IMMEDIATE_VALUE) { 309 brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8)); 310 } else { 311 brw_SHL(p, addr, sampler_reg, brw_imm_ud(8)); 312 brw_OR(p, addr, addr, surface_reg); 313 } 314 } 315 if (base_binding_table_index) 316 brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index)); 317 brw_AND(p, addr, addr, brw_imm_ud(0xfff)); 318 319 brw_pop_insn_state(p); 320 321 if (inst->base_mrf != -1) 322 gen6_resolve_implied_move(p, &src, inst->base_mrf); 323 324 /* dst = send(offset, a0.0 | <descriptor>) */ 325 brw_send_indirect_message( 326 p, BRW_SFID_SAMPLER, dst, src, addr, 327 brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) | 328 brw_sampler_desc(devinfo, 329 0 /* surface */, 330 0 /* sampler */, 331 msg_type, 332 BRW_SAMPLER_SIMD_MODE_SIMD4X2, 333 return_format), 334 false /* EOT */); 335 336 /* visitor knows more than we do about the surface limit required, 337 * so has already done marking. 
338 */ 339 } 340} 341 342static void 343generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst) 344{ 345 brw_urb_WRITE(p, 346 brw_null_reg(), /* dest */ 347 inst->base_mrf, /* starting mrf reg nr */ 348 brw_vec8_grf(0, 0), /* src */ 349 inst->urb_write_flags, 350 inst->mlen, 351 0, /* response len */ 352 inst->offset, /* urb destination offset */ 353 BRW_URB_SWIZZLE_INTERLEAVE); 354} 355 356static void 357generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst) 358{ 359 struct brw_reg src = brw_message_reg(inst->base_mrf); 360 brw_urb_WRITE(p, 361 brw_null_reg(), /* dest */ 362 inst->base_mrf, /* starting mrf reg nr */ 363 src, 364 inst->urb_write_flags, 365 inst->mlen, 366 0, /* response len */ 367 inst->offset, /* urb destination offset */ 368 BRW_URB_SWIZZLE_INTERLEAVE); 369} 370 371static void 372generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst) 373{ 374 struct brw_reg src = brw_message_reg(inst->base_mrf); 375 376 /* We pass the temporary passed in src0 as the writeback register */ 377 brw_urb_WRITE(p, 378 inst->src[0].as_brw_reg(), /* dest */ 379 inst->base_mrf, /* starting mrf reg nr */ 380 src, 381 BRW_URB_WRITE_ALLOCATE_COMPLETE, 382 inst->mlen, 383 1, /* response len */ 384 inst->offset, /* urb destination offset */ 385 BRW_URB_SWIZZLE_INTERLEAVE); 386 387 /* Now put allocated urb handle in dst.0 */ 388 brw_push_insn_state(p); 389 brw_set_default_access_mode(p, BRW_ALIGN_1); 390 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 391 brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0), 392 get_element_ud(inst->src[0].as_brw_reg(), 0)); 393 brw_pop_insn_state(p); 394} 395 396static void 397generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst) 398{ 399 struct brw_reg src = brw_message_reg(inst->base_mrf); 400 brw_urb_WRITE(p, 401 brw_null_reg(), /* dest */ 402 inst->base_mrf, /* starting mrf reg nr */ 403 src, 404 BRW_URB_WRITE_EOT | inst->urb_write_flags, 405 inst->mlen, 406 0, /* 
response len */ 407 0, /* urb destination offset */ 408 BRW_URB_SWIZZLE_INTERLEAVE); 409} 410 411static void 412generate_gs_set_write_offset(struct brw_codegen *p, 413 struct brw_reg dst, 414 struct brw_reg src0, 415 struct brw_reg src1) 416{ 417 /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message 418 * Header: M0.3): 419 * 420 * Slot 0 Offset. This field, after adding to the Global Offset field 421 * in the message descriptor, specifies the offset (in 256-bit units) 422 * from the start of the URB entry, as referenced by URB Handle 0, at 423 * which the data will be accessed. 424 * 425 * Similar text describes DWORD M0.4, which is slot 1 offset. 426 * 427 * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components 428 * of the register for geometry shader invocations 0 and 1) by the 429 * immediate value in src1, and store the result in DWORDs 3 and 4 of dst. 430 * 431 * We can do this with the following EU instruction: 432 * 433 * mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW { Align1 WE_all } 434 */ 435 brw_push_insn_state(p); 436 brw_set_default_access_mode(p, BRW_ALIGN_1); 437 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 438 assert(p->devinfo->gen >= 7 && 439 src1.file == BRW_IMMEDIATE_VALUE && 440 src1.type == BRW_REGISTER_TYPE_UD && 441 src1.ud <= USHRT_MAX); 442 if (src0.file == BRW_IMMEDIATE_VALUE) { 443 brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3), 444 brw_imm_ud(src0.ud * src1.ud)); 445 } else { 446 brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4), 447 retype(src1, BRW_REGISTER_TYPE_UW)); 448 } 449 brw_pop_insn_state(p); 450} 451 452static void 453generate_gs_set_vertex_count(struct brw_codegen *p, 454 struct brw_reg dst, 455 struct brw_reg src) 456{ 457 brw_push_insn_state(p); 458 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 459 460 if (p->devinfo->gen >= 8) { 461 /* Move the vertex count into the second MRF for the EOT write. 
*/ 462 brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD), 463 src); 464 } else { 465 /* If we think of the src and dst registers as composed of 8 DWORDs each, 466 * we want to pick up the contents of DWORDs 0 and 4 from src, truncate 467 * them to WORDs, and then pack them into DWORD 2 of dst. 468 * 469 * It's easier to get the EU to do this if we think of the src and dst 470 * registers as composed of 16 WORDS each; then, we want to pick up the 471 * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5 472 * of dst. 473 * 474 * We can do that by the following EU instruction: 475 * 476 * mov (2) dst.4<1>:uw src<8;1,0>:uw { Align1, Q1, NoMask } 477 */ 478 brw_set_default_access_mode(p, BRW_ALIGN_1); 479 brw_MOV(p, 480 suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4), 481 stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0)); 482 } 483 brw_pop_insn_state(p); 484} 485 486static void 487generate_gs_svb_write(struct brw_codegen *p, 488 struct brw_vue_prog_data *prog_data, 489 vec4_instruction *inst, 490 struct brw_reg dst, 491 struct brw_reg src0, 492 struct brw_reg src1) 493{ 494 int binding = inst->sol_binding; 495 bool final_write = inst->sol_final_write; 496 497 brw_push_insn_state(p); 498 brw_set_default_exec_size(p, BRW_EXECUTE_4); 499 /* Copy Vertex data into M0.x */ 500 brw_MOV(p, stride(dst, 4, 4, 1), 501 stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1)); 502 brw_pop_insn_state(p); 503 504 brw_push_insn_state(p); 505 /* Send SVB Write */ 506 brw_svb_write(p, 507 final_write ? src1 : brw_null_reg(), /* dest == src1 */ 508 1, /* msg_reg_nr */ 509 dst, /* src0 == previous dst */ 510 BRW_GEN6_SOL_BINDING_START + binding, /* binding_table_index */ 511 final_write); /* send_commit_msg */ 512 513 /* Finally, wait for the write commit to occur so that we can proceed to 514 * other things safely. 
515 * 516 * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3: 517 * 518 * The write commit does not modify the destination register, but 519 * merely clears the dependency associated with the destination 520 * register. Thus, a simple “mov” instruction using the register as a 521 * source is sufficient to wait for the write commit to occur. 522 */ 523 if (final_write) { 524 brw_MOV(p, src1, src1); 525 } 526 brw_pop_insn_state(p); 527} 528 529static void 530generate_gs_svb_set_destination_index(struct brw_codegen *p, 531 vec4_instruction *inst, 532 struct brw_reg dst, 533 struct brw_reg src) 534{ 535 int vertex = inst->sol_vertex; 536 brw_push_insn_state(p); 537 brw_set_default_access_mode(p, BRW_ALIGN_1); 538 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 539 brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex)); 540 brw_pop_insn_state(p); 541} 542 543static void 544generate_gs_set_dword_2(struct brw_codegen *p, 545 struct brw_reg dst, 546 struct brw_reg src) 547{ 548 brw_push_insn_state(p); 549 brw_set_default_access_mode(p, BRW_ALIGN_1); 550 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 551 brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0)); 552 brw_pop_insn_state(p); 553} 554 555static void 556generate_gs_prepare_channel_masks(struct brw_codegen *p, 557 struct brw_reg dst) 558{ 559 /* We want to left shift just DWORD 4 (the x component belonging to the 560 * second geometry shader invocation) by 4 bits. 
So generate the 561 * instruction: 562 * 563 * shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all } 564 */ 565 dst = suboffset(vec1(dst), 4); 566 brw_push_insn_state(p); 567 brw_set_default_access_mode(p, BRW_ALIGN_1); 568 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 569 brw_SHL(p, dst, dst, brw_imm_ud(4)); 570 brw_pop_insn_state(p); 571} 572 573static void 574generate_gs_set_channel_masks(struct brw_codegen *p, 575 struct brw_reg dst, 576 struct brw_reg src) 577{ 578 /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message 579 * Header: M0.5): 580 * 581 * 15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask 582 * 583 * When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1 584 * DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls 585 * Vertex 0 DATA[7]. This bit is ANDed with the corresponding 586 * channel enable to determine the final channel enable. For the 587 * URB_READ_OWORD & URB_READ_HWORD messages, when final channel 588 * enable is 1 it indicates that Vertex 1 DATA [3] will be included 589 * in the writeback message. For the URB_WRITE_OWORD & 590 * URB_WRITE_HWORD messages, when final channel enable is 1 it 591 * indicates that Vertex 1 DATA [3] will be written to the surface. 592 * 593 * 0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included 594 * 1: Vertex DATA [3] / Vertex 0 DATA[7] channel included 595 * 596 * 14 Vertex 1 DATA [2] Channel Mask 597 * 13 Vertex 1 DATA [1] Channel Mask 598 * 12 Vertex 1 DATA [0] Channel Mask 599 * 11 Vertex 0 DATA [3] Channel Mask 600 * 10 Vertex 0 DATA [2] Channel Mask 601 * 9 Vertex 0 DATA [1] Channel Mask 602 * 8 Vertex 0 DATA [0] Channel Mask 603 * 604 * (This is from a section of the PRM that is agnostic to the particular 605 * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to 606 * geometry shader invocations 0 and 1, respectively). 
Since we have the 607 * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0, 608 * and the enable flags for geometry shader invocation 1 in bits 7:0 of 609 * DWORD 4, we just need to OR them together and store the result in bits 610 * 15:8 of DWORD 5. 611 * 612 * It's easier to get the EU to do this if we think of the src and dst 613 * registers as composed of 32 bytes each; then, we want to pick up the 614 * contents of bytes 0 and 16 from src, OR them together, and store them in 615 * byte 21. 616 * 617 * We can do that by the following EU instruction: 618 * 619 * or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all } 620 * 621 * Note: this relies on the source register having zeros in (a) bits 7:4 of 622 * DWORD 0 and (b) bits 3:0 of DWORD 4. We can rely on (b) because the 623 * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which 624 * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to 625 * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to 626 * contain valid channel mask values (which are in the range 0x0-0xf). 627 */ 628 dst = retype(dst, BRW_REGISTER_TYPE_UB); 629 src = retype(src, BRW_REGISTER_TYPE_UB); 630 brw_push_insn_state(p); 631 brw_set_default_access_mode(p, BRW_ALIGN_1); 632 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 633 brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16)); 634 brw_pop_insn_state(p); 635} 636 637static void 638generate_gs_get_instance_id(struct brw_codegen *p, 639 struct brw_reg dst) 640{ 641 /* We want to right shift R0.0 & R0.1 by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT 642 * and store into dst.0 & dst.4. 
So generate the instruction: 643 * 644 * shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q } 645 */ 646 brw_push_insn_state(p); 647 brw_set_default_access_mode(p, BRW_ALIGN_1); 648 dst = retype(dst, BRW_REGISTER_TYPE_UD); 649 struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 650 brw_SHR(p, dst, stride(r0, 1, 4, 0), 651 brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT)); 652 brw_pop_insn_state(p); 653} 654 655static void 656generate_gs_ff_sync_set_primitives(struct brw_codegen *p, 657 struct brw_reg dst, 658 struct brw_reg src0, 659 struct brw_reg src1, 660 struct brw_reg src2) 661{ 662 brw_push_insn_state(p); 663 brw_set_default_access_mode(p, BRW_ALIGN_1); 664 /* Save src0 data in 16:31 bits of dst.0 */ 665 brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0), 666 brw_imm_ud(0xffffu)); 667 brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16)); 668 /* Save src1 data in 0:15 bits of dst.0 */ 669 brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0), 670 brw_imm_ud(0xffffu)); 671 brw_OR(p, suboffset(vec1(dst), 0), 672 suboffset(vec1(dst), 0), 673 suboffset(vec1(src2), 0)); 674 brw_pop_insn_state(p); 675} 676 677static void 678generate_gs_ff_sync(struct brw_codegen *p, 679 vec4_instruction *inst, 680 struct brw_reg dst, 681 struct brw_reg src0, 682 struct brw_reg src1) 683{ 684 /* This opcode uses an implied MRF register for: 685 * - the header of the ff_sync message. And as such it is expected to be 686 * initialized to r0 before calling here. 687 * - the destination where we will write the allocated URB handle. 688 */ 689 struct brw_reg header = 690 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD); 691 692 /* Overwrite dword 0 of the header (SO vertices to write) and 693 * dword 1 (number of primitives written). 
694 */ 695 brw_push_insn_state(p); 696 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 697 brw_set_default_access_mode(p, BRW_ALIGN_1); 698 brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0)); 699 brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0)); 700 brw_pop_insn_state(p); 701 702 /* Allocate URB handle in dst */ 703 brw_ff_sync(p, 704 dst, 705 0, 706 header, 707 1, /* allocate */ 708 1, /* response length */ 709 0 /* eot */); 710 711 /* Now put allocated urb handle in header.0 */ 712 brw_push_insn_state(p); 713 brw_set_default_access_mode(p, BRW_ALIGN_1); 714 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 715 brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0)); 716 717 /* src1 is not an immediate when we use transform feedback */ 718 if (src1.file != BRW_IMMEDIATE_VALUE) { 719 brw_set_default_exec_size(p, BRW_EXECUTE_4); 720 brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1)); 721 } 722 723 brw_pop_insn_state(p); 724} 725 726static void 727generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst) 728{ 729 /* In gen6, PrimitiveID is delivered in R0.1 of the payload */ 730 struct brw_reg src = brw_vec8_grf(0, 0); 731 brw_push_insn_state(p); 732 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 733 brw_set_default_access_mode(p, BRW_ALIGN_1); 734 brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1)); 735 brw_pop_insn_state(p); 736} 737 738static void 739generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst) 740{ 741 const struct gen_device_info *devinfo = p->devinfo; 742 const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail; 743 744 /* "Instance Count" comes as part of the payload in r0.2 bits 23:17. 745 * 746 * Since we operate in SIMD4x2 mode, we need run half as many threads 747 * as necessary. So we assign (2i + 1, 2i) as the thread counts. We 748 * shift right by one less to accomplish the multiplication by two. 
749 */ 750 dst = retype(dst, BRW_REGISTER_TYPE_UD); 751 struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); 752 753 brw_push_insn_state(p); 754 brw_set_default_access_mode(p, BRW_ALIGN_1); 755 756 const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17); 757 const int shift = ivb ? 16 : 17; 758 759 brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask)); 760 brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0), 761 brw_imm_ud(shift - 1)); 762 brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1)); 763 764 brw_pop_insn_state(p); 765} 766 767static void 768generate_tcs_urb_write(struct brw_codegen *p, 769 vec4_instruction *inst, 770 struct brw_reg urb_header) 771{ 772 const struct gen_device_info *devinfo = p->devinfo; 773 774 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); 775 brw_set_dest(p, send, brw_null_reg()); 776 brw_set_src0(p, send, urb_header); 777 brw_set_desc(p, send, brw_message_desc(devinfo, inst->mlen, 0, true)); 778 779 brw_inst_set_sfid(devinfo, send, BRW_SFID_URB); 780 brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD); 781 brw_inst_set_urb_global_offset(devinfo, send, inst->offset); 782 if (inst->urb_write_flags & BRW_URB_WRITE_EOT) { 783 brw_inst_set_eot(devinfo, send, 1); 784 } else { 785 brw_inst_set_urb_per_slot_offset(devinfo, send, 1); 786 brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE); 787 } 788 789 /* what happens to swizzles? */ 790} 791 792 793static void 794generate_tcs_input_urb_offsets(struct brw_codegen *p, 795 struct brw_reg dst, 796 struct brw_reg vertex, 797 struct brw_reg offset) 798{ 799 /* Generates an URB read/write message header for HS/DS operation. 800 * Inputs are a vertex index, and a byte offset from the beginning of 801 * the vertex. 
*/ 802 803 /* If `vertex` is not an immediate, we clobber a0.0 */ 804 805 assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE); 806 assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D); 807 808 assert(dst.file == BRW_GENERAL_REGISTER_FILE); 809 810 brw_push_insn_state(p); 811 brw_set_default_access_mode(p, BRW_ALIGN_1); 812 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 813 brw_MOV(p, dst, brw_imm_ud(0)); 814 815 /* m0.5 bits 8-15 are channel enables */ 816 brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00)); 817 818 /* m0.0-0.1: URB handles */ 819 if (vertex.file == BRW_IMMEDIATE_VALUE) { 820 uint32_t vertex_index = vertex.ud; 821 struct brw_reg index_reg = brw_vec1_grf( 822 1 + (vertex_index >> 3), vertex_index & 7); 823 824 brw_MOV(p, vec2(get_element_ud(dst, 0)), 825 retype(index_reg, BRW_REGISTER_TYPE_UD)); 826 } else { 827 /* Use indirect addressing. ICP Handles are DWords (single channels 828 * of a register) and start at g1.0. 829 * 830 * In order to start our region at g1.0, we add 8 to the vertex index, 831 * effectively skipping over the 8 channels in g0.0. This gives us a 832 * DWord offset to the ICP Handle. 833 * 834 * Indirect addressing works in terms of bytes, so we then multiply 835 * the DWord offset by 4 (by shifting left by 2). 
836 */ 837 struct brw_reg addr = brw_address_reg(0); 838 839 /* bottom half: m0.0 = g[1.0 + vertex.0]UD */ 840 brw_ADD(p, addr, retype(get_element_ud(vertex, 0), BRW_REGISTER_TYPE_UW), 841 brw_imm_uw(0x8)); 842 brw_SHL(p, addr, addr, brw_imm_uw(2)); 843 brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0)); 844 845 /* top half: m0.1 = g[1.0 + vertex.4]UD */ 846 brw_ADD(p, addr, retype(get_element_ud(vertex, 4), BRW_REGISTER_TYPE_UW), 847 brw_imm_uw(0x8)); 848 brw_SHL(p, addr, addr, brw_imm_uw(2)); 849 brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0)); 850 } 851 852 /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ 853 if (offset.file != ARF) 854 brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); 855 856 brw_pop_insn_state(p); 857} 858 859 860static void 861generate_tcs_output_urb_offsets(struct brw_codegen *p, 862 struct brw_reg dst, 863 struct brw_reg write_mask, 864 struct brw_reg offset) 865{ 866 /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. 
*/ 867 assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE); 868 869 assert(write_mask.file == BRW_IMMEDIATE_VALUE); 870 assert(write_mask.type == BRW_REGISTER_TYPE_UD); 871 872 brw_push_insn_state(p); 873 874 brw_set_default_access_mode(p, BRW_ALIGN_1); 875 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 876 brw_MOV(p, dst, brw_imm_ud(0)); 877 878 unsigned mask = write_mask.ud; 879 880 /* m0.5 bits 15:12 and 11:8 are channel enables */ 881 brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12))); 882 883 /* HS patch URB handle is delivered in r0.0 */ 884 struct brw_reg urb_handle = brw_vec1_grf(0, 0); 885 886 /* m0.0-0.1: URB handles */ 887 brw_MOV(p, vec2(get_element_ud(dst, 0)), 888 retype(urb_handle, BRW_REGISTER_TYPE_UD)); 889 890 /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ 891 if (offset.file != ARF) 892 brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); 893 894 brw_pop_insn_state(p); 895} 896 897static void 898generate_tes_create_input_read_header(struct brw_codegen *p, 899 struct brw_reg dst) 900{ 901 brw_push_insn_state(p); 902 brw_set_default_access_mode(p, BRW_ALIGN_1); 903 brw_set_default_mask_control(p, BRW_MASK_DISABLE); 904 905 /* Initialize the register to 0 */ 906 brw_MOV(p, dst, brw_imm_ud(0)); 907 908 /* Enable all the channels in m0.5 bits 15:8 */ 909 brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00)); 910 911 /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1. For safety, 912 * mask out irrelevant "Reserved" bits, as they're not marked MBZ. 
    */
   brw_AND(p, vec2(get_element_ud(dst, 0)),
           retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(0x1fff));
   brw_pop_insn_state(p);
}

/* Build a TES URB-read header in \p dst: copy the base \p header, then
 * store the indirect per-vertex \p offset (restrided to a scalar-compatible
 * region) into m0.3-0.4.
 */
static void
generate_tes_add_indirect_urb_offset(struct brw_codegen *p,
                                     struct brw_reg dst,
                                     struct brw_reg header,
                                     struct brw_reg offset)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, dst, header);

   /* Uniforms will have a stride <0;4,1>, and we need to convert to <0;1,0>.
    * Other values get <4;1,0>.
    */
   struct brw_reg restrided_offset;
   if (offset.vstride == BRW_VERTICAL_STRIDE_0 &&
       offset.width == BRW_WIDTH_4 &&
       offset.hstride == BRW_HORIZONTAL_STRIDE_1) {
      restrided_offset = stride(offset, 0, 1, 0);
   } else {
      restrided_offset = stride(offset, 4, 1, 0);
   }

   /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */
   brw_MOV(p, vec2(get_element_ud(dst, 3)), restrided_offset);

   brw_pop_insn_state(p);
}

/* Emit a URB read SEND: one OWord-pair read through the URB shared function,
 * using \p header (a UD GRF containing the URB handles) as the message
 * payload.  inst->offset supplies the constant (global) URB offset; the
 * per-slot offset in the header is enabled.
 */
static void
generate_vec4_urb_read(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg dst,
                       struct brw_reg header)
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);

   /* mlen 1, rlen 1, header present */
   brw_set_desc(p, send, brw_message_desc(devinfo, 1, 1, true));

   brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
   brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE);
   brw_inst_set_urb_per_slot_offset(devinfo, send, 1);

   brw_inst_set_urb_global_offset(devinfo, send, inst->offset);
}

/* Release a TCS input vertex's URB handle once we are done reading it.
 * \p vertex is an immediate UD selecting which of the incoming handles
 * (packed 8 per GRF starting at g1) to release; the release is done with a
 * complete=1 URB read.  \p is_unpaired selects the swizzle for odd vertex
 * counts.
 */
static void
generate_tcs_release_input(struct brw_codegen *p,
                           struct brw_reg header,
                           struct brw_reg vertex,
                           struct brw_reg is_unpaired)
{
   const struct gen_device_info *devinfo = p->devinfo;

   assert(vertex.file == BRW_IMMEDIATE_VALUE);
   assert(vertex.type == BRW_REGISTER_TYPE_UD);

   /* m0.0-0.1: URB handles */
   struct brw_reg urb_handles =
      retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7),
             BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, header, brw_imm_ud(0));
   brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles);
   brw_pop_insn_state(p);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, brw_null_reg());
   brw_set_src0(p, send, header);
   /* mlen 1, no response, header present */
   brw_set_desc(p, send, brw_message_desc(devinfo, 1, 0, true));

   brw_inst_set_sfid(devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD);
   /* complete=1 is what actually releases the handle */
   brw_inst_set_urb_complete(devinfo, send, 1);
   brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ?
                                    BRW_URB_SWIZZLE_NONE :
                                    BRW_URB_SWIZZLE_INTERLEAVE);
}

/* Terminate a TCS thread: write zero to the patch URB slot with channel
 * masks enabling only the X channel, then EOT.  The header is built from
 * r0.0 (thread payload) with the channel-mask dword in m0.5.
 */
static void
generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
{
   struct brw_reg header = brw_message_reg(inst->base_mrf);

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, header, brw_imm_ud(0));
   /* m0.5 bit 8: enable only the X channel mask */
   brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8));
   brw_MOV(p, get_element_ud(header, 0),
           retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
   brw_MOV(p, brw_message_reg(inst->base_mrf + 1), brw_imm_ud(0u));
   brw_pop_insn_state(p);

   brw_urb_WRITE(p,
                 brw_null_reg(), /* dest */
                 inst->base_mrf, /* starting mrf reg nr */
                 header,
                 BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD |
                 BRW_URB_WRITE_USE_CHANNEL_MASKS,
                 inst->mlen,
                 0, /* response len */
                 0, /* urb destination offset */
                 0);
}

/* Read the TES primitive ID from the thread payload (r1.7, as a signed int). */
static void
generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D));
   brw_pop_insn_state(p);
}

/* Read the TCS primitive ID from the thread payload (r0.1). */
static void
generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
   brw_pop_insn_state(p);
}

/* Build the message header for a TCS BARRIER in \p dst: zero the header,
 * extract the Barrier ID from r0.2, move it to bits 27:24, and OR in the
 * barrier count (thread instances) and the enable bit.
 */
static void
generate_tcs_create_barrier_header(struct brw_codegen *p,
                                   struct brw_vue_prog_data *prog_data,
                                   struct brw_reg dst)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const bool ivb = devinfo->is_ivybridge || devinfo->is_baytrail;
   struct brw_reg m0_2 = get_element_ud(dst, 2);
   unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances;

   brw_push_insn_state(p);
   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   /* Zero the message header */
   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));

   /* Copy "Barrier ID" from r0.2, bits 16:13 (Gen7.5+) or 15:12 (Gen7) */
   brw_AND(p, m0_2,
           retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13)));

   /* Shift it up to bits 27:24. */
   brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(ivb ? 12 : 11));

   /* Set the Barrier Count and the enable bit */
   brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15)));

   brw_pop_insn_state(p);
}

/* Fill M1.0 / M1.4 with the two per-vertex block offsets for an OWord dual
 * block scratch message.  \p index may be an immediate or a register.
 * NOTE(review): the second-vertex offset is 1 on gen6+ but 16 before —
 * presumably a units difference (OWords vs. bytes) in the message encoding;
 * confirm against the dataport docs.
 */
static void
generate_oword_dual_block_offsets(struct brw_codegen *p,
                                  struct brw_reg m1,
                                  struct brw_reg index)
{
   int second_vertex_offset;

   if (p->devinfo->gen >= 6)
      second_vertex_offset = 1;
   else
      second_vertex_offset = 16;

   m1 = retype(m1, BRW_REGISTER_TYPE_D);

   /* Set up M1 (message payload).  Only the block offsets in M1.0 and
    * M1.4 are used, and the rest are ignored.
    */
   struct brw_reg m1_0 = suboffset(vec1(m1), 0);
   struct brw_reg m1_4 = suboffset(vec1(m1), 4);
   struct brw_reg index_0 = suboffset(vec1(index), 0);
   struct brw_reg index_4 = suboffset(vec1(index), 4);

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   brw_MOV(p, m1_0, index_0);

   if (index.file == BRW_IMMEDIATE_VALUE) {
      /* Fold the offset into the immediate rather than emitting an ADD. */
      index_4.ud += second_vertex_offset;
      brw_MOV(p, m1_4, index_4);
   } else {
      brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset));
   }

   brw_pop_insn_state(p);
}

/* Unpack f0.0 into \p dst: the low nibble of the flag register goes to
 * dst.0 and the high nibble (shifted down) to dst.4, giving each SIMD4x2
 * half its own 4-bit mask.
 */
static void
generate_unpack_flags(struct brw_codegen *p,
                      struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_set_default_access_mode(p, BRW_ALIGN_1);

   struct brw_reg flags = brw_flag_reg(0, 0);
   struct brw_reg dst_0 = suboffset(vec1(dst), 0);
   struct brw_reg dst_4 = suboffset(vec1(dst), 4);

   brw_AND(p, dst_0, flags, brw_imm_ud(0x0f));
   brw_AND(p, dst_4, flags, brw_imm_ud(0xf0));
   brw_SHR(p, dst_4, dst_4, brw_imm_ud(4));

   brw_pop_insn_state(p);
}

/* Emit an OWord dual block read from the scratch buffer into \p dst.
 * \p index is the per-channel scratch offset; the two block offsets are
 * placed in M1 by generate_oword_dual_block_offsets().
 */
static void
generate_scratch_read(struct brw_codegen *p,
                      vec4_instruction *inst,
                      struct brw_reg dst,
                      struct brw_reg index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
                                     index);

   uint32_t msg_type;

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   const unsigned target_cache =
      devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
      devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
      BRW_SFID_DATAPORT_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, send, target_cache);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      /* NOTE(review): pre-gen6 the cond-modifier field of a SEND carries the
       * message register number — confirm against the gen4/5 ISA docs.
       */
      brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf);
   brw_set_desc(p, send,
                brw_message_desc(devinfo, 2, 1, true) |
                brw_dp_read_desc(devinfo,
                                 brw_scratch_surface_idx(p),
                                 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                                 msg_type, BRW_DATAPORT_READ_TARGET_RENDER_CACHE));
}

/* Emit an OWord dual block write of \p src to the scratch buffer.
 * Pre-gen6, a write-commit response is requested to preserve read/write
 * ordering within the thread (see comment below).
 */
static void
generate_scratch_write(struct brw_codegen *p,
                       vec4_instruction *inst,
                       struct brw_reg dst,
                       struct brw_reg src,
                       struct brw_reg index)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
       BRW_SFID_DATAPORT_WRITE);
   struct brw_reg header = brw_vec8_grf(0, 0);
   bool write_commit;

   /* If the instruction is predicated, we'll predicate the send, not
    * the header setup.
    */
   brw_set_default_predicate_control(p, false);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
                                     index);

   brw_MOV(p,
           retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
           retype(src, BRW_REGISTER_TYPE_D));

   uint32_t msg_type;

   if (devinfo->gen >= 7)
      msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
   else if (devinfo->gen == 6)
      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
   else
      msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;

   /* Restore the instruction's own predicate for the SEND itself. */
   brw_set_default_predicate_control(p, inst->predicate);

   /* Pre-gen6, we have to specify write commits to ensure ordering
    * between reads and writes within a thread.  Afterwards, that's
    * guaranteed and write commits only matter for inter-thread
    * synchronization.
    */
   if (devinfo->gen >= 6) {
      write_commit = false;
   } else {
      /* The visitor set up our destination register to be g0.  This
       * means that when the next read comes along, we will end up
       * reading from g0 and causing a block on the write commit.  For
       * write-after-read, we are relying on the value of the previous
       * read being used (and thus blocking on completion) before our
       * write is executed.  This means we have to be careful in
       * instruction scheduling to not violate this assumption.
       */
      write_commit = true;
   }

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(p->devinfo, send, target_cache);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_desc(p, send,
                brw_message_desc(devinfo, 3, write_commit, true) |
                brw_dp_write_desc(devinfo,
                                  brw_scratch_surface_idx(p),
                                  BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                                  msg_type,
                                  false, /* not a render target write */
                                  write_commit));
}

/* Load a vec4 from the pull constant buffer (surface \p index, an immediate)
 * at byte offset \p offset, via an OWord dual block read.  On gen6+ the
 * offset is converted to 16-byte units (>> 4) for the message payload.
 */
static void
generate_pull_constant_load(struct brw_codegen *p,
                            struct brw_vue_prog_data *prog_data,
                            vec4_instruction *inst,
                            struct brw_reg dst,
                            struct brw_reg index,
                            struct brw_reg offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const unsigned target_cache =
      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_SAMPLER_CACHE :
       BRW_SFID_DATAPORT_READ);
   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   struct brw_reg header = brw_vec8_grf(0, 0);

   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   if (devinfo->gen >= 6) {
      if (offset.file == BRW_IMMEDIATE_VALUE) {
         brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
                           BRW_REGISTER_TYPE_D),
                 brw_imm_d(offset.ud >> 4));
      } else {
         brw_SHR(p, retype(brw_message_reg(inst->base_mrf + 1),
                           BRW_REGISTER_TYPE_D),
                 offset, brw_imm_d(4));
      }
   } else {
      brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1),
                        BRW_REGISTER_TYPE_D),
              offset);
   }

   uint32_t msg_type;

   if (devinfo->gen >= 6)
      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else if (devinfo->gen == 5 || devinfo->is_g4x)
      msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
   else
      msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;

   /* Each of the 8 channel enables is considered for whether each
    * dword is written.
    */
   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_sfid(devinfo, send, target_cache);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf);
   brw_set_desc(p, send,
                brw_message_desc(devinfo, 2, 1, true) |
                brw_dp_read_desc(devinfo, surf_index,
                                 BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
                                 msg_type,
                                 BRW_DATAPORT_READ_TARGET_DATA_CACHE));
}

/* Query the size of buffer surface \p surf_index (an immediate) by sending
 * a sampler RESINFO message; the result lands in \p dst as SINT32.
 * Gen7+ only.
 */
static void
generate_get_buffer_size(struct brw_codegen *p,
                         struct brw_vue_prog_data *prog_data,
                         vec4_instruction *inst,
                         struct brw_reg dst,
                         struct brw_reg src,
                         struct brw_reg surf_index)
{
   assert(p->devinfo->gen >= 7);
   assert(surf_index.type == BRW_REGISTER_TYPE_UD &&
          surf_index.file == BRW_IMMEDIATE_VALUE);

   brw_SAMPLE(p,
              dst,
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              1, /* response length */
              inst->mlen,
              inst->header_size > 0,
              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);
}

/* Gen7+ pull constant load via a sampler LD message.  When the surface
 * index is an immediate we encode it directly in the descriptor; otherwise
 * we compute the descriptor into a0.0 and use an indirect send.
 */
static void
generate_pull_constant_load_gen7(struct brw_codegen *p,
                                 struct brw_vue_prog_data *prog_data,
                                 vec4_instruction *inst,
                                 struct brw_reg dst,
                                 struct brw_reg surf_index,
                                 struct brw_reg offset)
{
   const struct gen_device_info *devinfo = p->devinfo;
   assert(surf_index.type == BRW_REGISTER_TYPE_UD);

   if (surf_index.file == BRW_IMMEDIATE_VALUE) {

      brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER);
      brw_set_dest(p, insn, dst);
      brw_set_src0(p, insn, offset);
      brw_set_desc(p, insn,
                   brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
                   brw_sampler_desc(devinfo, surf_index.ud,
                                    0, /* LD message ignores sampler unit */
                                    GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                                    BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0));
   } else {

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, offset, addr,
         brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) |
         brw_sampler_desc(devinfo,
                          0 /* surface */,
                          0 /* sampler */,
                          GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                          BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                          0),
         false /* EOT */);
   }
}

/* Build a Gen9 SIMD4x2 message header in \p dst: copy g0, then set the
 * SIMD mode extension dword (m0.2) to SIMD4x2.
 */
static void
generate_set_simd4x2_header_gen9(struct brw_codegen *p,
                                 vec4_instruction *,
                                 struct brw_reg dst)
{
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_MOV(p, vec8(dst), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   brw_set_default_access_mode(p, BRW_ALIGN_1);
   brw_MOV(p, get_element_ud(dst, 2),
           brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));

   brw_pop_insn_state(p);
}

/* MOV with a (possibly register-)indirect source: read from \p reg plus a
 * byte offset in \p indirect.  Immediate offsets are folded into the region
 * directly; register offsets go through a0 with a VxH indirect source.
 */
static void
generate_mov_indirect(struct brw_codegen *p,
                      vec4_instruction *,
                      struct brw_reg dst, struct brw_reg reg,
                      struct brw_reg indirect)
{
   assert(indirect.type == BRW_REGISTER_TYPE_UD);
   assert(p->devinfo->gen >= 6);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);

   /* This instruction acts in align1 mode */
   assert(dst.writemask == WRITEMASK_XYZW);

   if (indirect.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect.ud;

      /* Split the byte offset back into reg / half-reg / swizzle shift. */
      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2;
      unsigned shift = (imm_byte_offset / 4) % 4;
      reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);

      brw_MOV(p, dst, reg);
   } else {
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);

      struct brw_reg addr = vec8(brw_address_reg(0));

      /* We need to move the indirect value into the address register.  In
       * order to make things make some sense, we want to respect at least the
       * X component of the swizzle.  In order to do that, we need to convert
       * the subnr (probably 0) to an align1 subnr and add in the swizzle.
       */
      assert(brw_is_single_value_swizzle(indirect.swizzle));
      indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0));

      /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of
       * the indirect and splat it out to all four channels of the given half
       * of a0.
       */
      indirect.subnr *= 2;
      indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
      brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));

      /* Now we need to incorporate the swizzle from the source register */
      if (reg.swizzle != BRW_SWIZZLE_XXXX) {
         uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 |
                            BRW_GET_SWZ(reg.swizzle, 1) << 6 |
                            BRW_GET_SWZ(reg.swizzle, 2) << 10 |
                            BRW_GET_SWZ(reg.swizzle, 3) << 14;
         uv_swiz |= uv_swiz << 16;

         brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz));
      }

      brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type));

      brw_pop_insn_state(p);
   }
}

static void
generate_code(struct brw_codegen *p,
              const struct brw_compiler *compiler,
              void *log_data,
              const nir_shader *nir,
              struct brw_vue_prog_data *prog_data,
              const struct cfg_t *cfg)
{
   const struct gen_device_info *devinfo = p->devinfo;
   const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->info.stage);
   bool debug_flag = INTEL_DEBUG &
      intel_debug_flag_for_shader_stage(nir->info.stage);
   struct disasm_info *disasm_info = disasm_initialize(devinfo, cfg);
   int spill_count = 0, fill_count = 0;
   int loop_count = 0;

   foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
      struct brw_reg src[3], dst;

      if (unlikely(debug_flag))
         disasm_annotate(disasm_info, inst, p->next_insn_offset);

      for (unsigned int i = 0; i < 3; i++) {
         src[i] = inst->src[i].as_brw_reg();
      }
      dst = inst->dst.as_brw_reg();

      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      brw_set_default_flag_reg(p, inst->flag_subreg / 2, inst->flag_subreg % 2);
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
      brw_set_default_acc_write_control(p,
inst->writes_accumulator); 1523 1524 assert(inst->group % inst->exec_size == 0); 1525 assert(inst->group % 4 == 0); 1526 1527 /* There are some instructions where the destination is 64-bit 1528 * but we retype it to a smaller type. In that case, we cannot 1529 * double the exec_size. 1530 */ 1531 const bool is_df = (get_exec_type_size(inst) == 8 || 1532 inst->dst.type == BRW_REGISTER_TYPE_DF) && 1533 inst->opcode != VEC4_OPCODE_PICK_LOW_32BIT && 1534 inst->opcode != VEC4_OPCODE_PICK_HIGH_32BIT && 1535 inst->opcode != VEC4_OPCODE_SET_LOW_32BIT && 1536 inst->opcode != VEC4_OPCODE_SET_HIGH_32BIT; 1537 1538 unsigned exec_size = inst->exec_size; 1539 if (devinfo->gen == 7 && !devinfo->is_haswell && is_df) 1540 exec_size *= 2; 1541 1542 brw_set_default_exec_size(p, cvt(exec_size) - 1); 1543 1544 if (!inst->force_writemask_all) 1545 brw_set_default_group(p, inst->group); 1546 1547 assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen)); 1548 assert(inst->mlen <= BRW_MAX_MSG_LENGTH); 1549 1550 unsigned pre_emit_nr_insn = p->nr_insn; 1551 1552 switch (inst->opcode) { 1553 case VEC4_OPCODE_UNPACK_UNIFORM: 1554 case BRW_OPCODE_MOV: 1555 brw_MOV(p, dst, src[0]); 1556 break; 1557 case BRW_OPCODE_ADD: 1558 brw_ADD(p, dst, src[0], src[1]); 1559 break; 1560 case BRW_OPCODE_MUL: 1561 brw_MUL(p, dst, src[0], src[1]); 1562 break; 1563 case BRW_OPCODE_MACH: 1564 brw_MACH(p, dst, src[0], src[1]); 1565 break; 1566 1567 case BRW_OPCODE_MAD: 1568 assert(devinfo->gen >= 6); 1569 brw_MAD(p, dst, src[0], src[1], src[2]); 1570 break; 1571 1572 case BRW_OPCODE_FRC: 1573 brw_FRC(p, dst, src[0]); 1574 break; 1575 case BRW_OPCODE_RNDD: 1576 brw_RNDD(p, dst, src[0]); 1577 break; 1578 case BRW_OPCODE_RNDE: 1579 brw_RNDE(p, dst, src[0]); 1580 break; 1581 case BRW_OPCODE_RNDZ: 1582 brw_RNDZ(p, dst, src[0]); 1583 break; 1584 1585 case BRW_OPCODE_AND: 1586 brw_AND(p, dst, src[0], src[1]); 1587 break; 1588 case BRW_OPCODE_OR: 1589 brw_OR(p, dst, src[0], src[1]); 1590 break; 1591 case 
BRW_OPCODE_XOR: 1592 brw_XOR(p, dst, src[0], src[1]); 1593 break; 1594 case BRW_OPCODE_NOT: 1595 brw_NOT(p, dst, src[0]); 1596 break; 1597 case BRW_OPCODE_ASR: 1598 brw_ASR(p, dst, src[0], src[1]); 1599 break; 1600 case BRW_OPCODE_SHR: 1601 brw_SHR(p, dst, src[0], src[1]); 1602 break; 1603 case BRW_OPCODE_SHL: 1604 brw_SHL(p, dst, src[0], src[1]); 1605 break; 1606 1607 case BRW_OPCODE_CMP: 1608 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); 1609 break; 1610 case BRW_OPCODE_SEL: 1611 brw_SEL(p, dst, src[0], src[1]); 1612 break; 1613 1614 case BRW_OPCODE_DPH: 1615 brw_DPH(p, dst, src[0], src[1]); 1616 break; 1617 1618 case BRW_OPCODE_DP4: 1619 brw_DP4(p, dst, src[0], src[1]); 1620 break; 1621 1622 case BRW_OPCODE_DP3: 1623 brw_DP3(p, dst, src[0], src[1]); 1624 break; 1625 1626 case BRW_OPCODE_DP2: 1627 brw_DP2(p, dst, src[0], src[1]); 1628 break; 1629 1630 case BRW_OPCODE_F32TO16: 1631 assert(devinfo->gen >= 7); 1632 brw_F32TO16(p, dst, src[0]); 1633 break; 1634 1635 case BRW_OPCODE_F16TO32: 1636 assert(devinfo->gen >= 7); 1637 brw_F16TO32(p, dst, src[0]); 1638 break; 1639 1640 case BRW_OPCODE_LRP: 1641 assert(devinfo->gen >= 6); 1642 brw_LRP(p, dst, src[0], src[1], src[2]); 1643 break; 1644 1645 case BRW_OPCODE_BFREV: 1646 assert(devinfo->gen >= 7); 1647 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD), 1648 retype(src[0], BRW_REGISTER_TYPE_UD)); 1649 break; 1650 case BRW_OPCODE_FBH: 1651 assert(devinfo->gen >= 7); 1652 brw_FBH(p, retype(dst, src[0].type), src[0]); 1653 break; 1654 case BRW_OPCODE_FBL: 1655 assert(devinfo->gen >= 7); 1656 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), 1657 retype(src[0], BRW_REGISTER_TYPE_UD)); 1658 break; 1659 case BRW_OPCODE_LZD: 1660 brw_LZD(p, dst, src[0]); 1661 break; 1662 case BRW_OPCODE_CBIT: 1663 assert(devinfo->gen >= 7); 1664 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), 1665 retype(src[0], BRW_REGISTER_TYPE_UD)); 1666 break; 1667 case BRW_OPCODE_ADDC: 1668 assert(devinfo->gen >= 7); 1669 brw_ADDC(p, dst, src[0], 
src[1]); 1670 break; 1671 case BRW_OPCODE_SUBB: 1672 assert(devinfo->gen >= 7); 1673 brw_SUBB(p, dst, src[0], src[1]); 1674 break; 1675 case BRW_OPCODE_MAC: 1676 brw_MAC(p, dst, src[0], src[1]); 1677 break; 1678 1679 case BRW_OPCODE_BFE: 1680 assert(devinfo->gen >= 7); 1681 brw_BFE(p, dst, src[0], src[1], src[2]); 1682 break; 1683 1684 case BRW_OPCODE_BFI1: 1685 assert(devinfo->gen >= 7); 1686 brw_BFI1(p, dst, src[0], src[1]); 1687 break; 1688 case BRW_OPCODE_BFI2: 1689 assert(devinfo->gen >= 7); 1690 brw_BFI2(p, dst, src[0], src[1], src[2]); 1691 break; 1692 1693 case BRW_OPCODE_IF: 1694 if (!inst->src[0].is_null()) { 1695 /* The instruction has an embedded compare (only allowed on gen6) */ 1696 assert(devinfo->gen == 6); 1697 gen6_IF(p, inst->conditional_mod, src[0], src[1]); 1698 } else { 1699 brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8); 1700 brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate); 1701 } 1702 break; 1703 1704 case BRW_OPCODE_ELSE: 1705 brw_ELSE(p); 1706 break; 1707 case BRW_OPCODE_ENDIF: 1708 brw_ENDIF(p); 1709 break; 1710 1711 case BRW_OPCODE_DO: 1712 brw_DO(p, BRW_EXECUTE_8); 1713 break; 1714 1715 case BRW_OPCODE_BREAK: 1716 brw_BREAK(p); 1717 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 1718 break; 1719 case BRW_OPCODE_CONTINUE: 1720 brw_CONT(p); 1721 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); 1722 break; 1723 1724 case BRW_OPCODE_WHILE: 1725 brw_WHILE(p); 1726 loop_count++; 1727 break; 1728 1729 case SHADER_OPCODE_RCP: 1730 case SHADER_OPCODE_RSQ: 1731 case SHADER_OPCODE_SQRT: 1732 case SHADER_OPCODE_EXP2: 1733 case SHADER_OPCODE_LOG2: 1734 case SHADER_OPCODE_SIN: 1735 case SHADER_OPCODE_COS: 1736 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 1737 if (devinfo->gen >= 7) { 1738 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], 1739 brw_null_reg()); 1740 } else if (devinfo->gen == 6) { 1741 generate_math_gen6(p, inst, dst, src[0], brw_null_reg()); 1742 } else { 1743 generate_math1_gen4(p, 
inst, dst, src[0]); 1744 } 1745 break; 1746 1747 case SHADER_OPCODE_POW: 1748 case SHADER_OPCODE_INT_QUOTIENT: 1749 case SHADER_OPCODE_INT_REMAINDER: 1750 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); 1751 if (devinfo->gen >= 7) { 1752 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); 1753 } else if (devinfo->gen == 6) { 1754 generate_math_gen6(p, inst, dst, src[0], src[1]); 1755 } else { 1756 generate_math2_gen4(p, inst, dst, src[0], src[1]); 1757 } 1758 break; 1759 1760 case SHADER_OPCODE_TEX: 1761 case SHADER_OPCODE_TXD: 1762 case SHADER_OPCODE_TXF: 1763 case SHADER_OPCODE_TXF_CMS: 1764 case SHADER_OPCODE_TXF_CMS_W: 1765 case SHADER_OPCODE_TXF_MCS: 1766 case SHADER_OPCODE_TXL: 1767 case SHADER_OPCODE_TXS: 1768 case SHADER_OPCODE_TG4: 1769 case SHADER_OPCODE_TG4_OFFSET: 1770 case SHADER_OPCODE_SAMPLEINFO: 1771 generate_tex(p, prog_data, nir->info.stage, 1772 inst, dst, src[0], src[1], src[2]); 1773 break; 1774 1775 case SHADER_OPCODE_GET_BUFFER_SIZE: 1776 generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]); 1777 break; 1778 1779 case VS_OPCODE_URB_WRITE: 1780 generate_vs_urb_write(p, inst); 1781 break; 1782 1783 case SHADER_OPCODE_GEN4_SCRATCH_READ: 1784 generate_scratch_read(p, inst, dst, src[0]); 1785 fill_count++; 1786 break; 1787 1788 case SHADER_OPCODE_GEN4_SCRATCH_WRITE: 1789 generate_scratch_write(p, inst, dst, src[0], src[1]); 1790 spill_count++; 1791 break; 1792 1793 case VS_OPCODE_PULL_CONSTANT_LOAD: 1794 generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]); 1795 break; 1796 1797 case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7: 1798 generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]); 1799 break; 1800 1801 case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9: 1802 generate_set_simd4x2_header_gen9(p, inst, dst); 1803 break; 1804 1805 case GS_OPCODE_URB_WRITE: 1806 generate_gs_urb_write(p, inst); 1807 break; 1808 1809 case GS_OPCODE_URB_WRITE_ALLOCATE: 1810 generate_gs_urb_write_allocate(p, inst); 
1811 break; 1812 1813 case GS_OPCODE_SVB_WRITE: 1814 generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]); 1815 break; 1816 1817 case GS_OPCODE_SVB_SET_DST_INDEX: 1818 generate_gs_svb_set_destination_index(p, inst, dst, src[0]); 1819 break; 1820 1821 case GS_OPCODE_THREAD_END: 1822 generate_gs_thread_end(p, inst); 1823 break; 1824 1825 case GS_OPCODE_SET_WRITE_OFFSET: 1826 generate_gs_set_write_offset(p, dst, src[0], src[1]); 1827 break; 1828 1829 case GS_OPCODE_SET_VERTEX_COUNT: 1830 generate_gs_set_vertex_count(p, dst, src[0]); 1831 break; 1832 1833 case GS_OPCODE_FF_SYNC: 1834 generate_gs_ff_sync(p, inst, dst, src[0], src[1]); 1835 break; 1836 1837 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES: 1838 generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]); 1839 break; 1840 1841 case GS_OPCODE_SET_PRIMITIVE_ID: 1842 generate_gs_set_primitive_id(p, dst); 1843 break; 1844 1845 case GS_OPCODE_SET_DWORD_2: 1846 generate_gs_set_dword_2(p, dst, src[0]); 1847 break; 1848 1849 case GS_OPCODE_PREPARE_CHANNEL_MASKS: 1850 generate_gs_prepare_channel_masks(p, dst); 1851 break; 1852 1853 case GS_OPCODE_SET_CHANNEL_MASKS: 1854 generate_gs_set_channel_masks(p, dst, src[0]); 1855 break; 1856 1857 case GS_OPCODE_GET_INSTANCE_ID: 1858 generate_gs_get_instance_id(p, dst); 1859 break; 1860 1861 case SHADER_OPCODE_SHADER_TIME_ADD: 1862 brw_shader_time_add(p, src[0], 1863 prog_data->base.binding_table.shader_time_start); 1864 break; 1865 1866 case VEC4_OPCODE_UNTYPED_ATOMIC: 1867 assert(src[2].file == BRW_IMMEDIATE_VALUE); 1868 brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen, 1869 !inst->dst.is_null(), inst->header_size); 1870 break; 1871 1872 case VEC4_OPCODE_UNTYPED_SURFACE_READ: 1873 assert(!inst->header_size); 1874 assert(src[2].file == BRW_IMMEDIATE_VALUE); 1875 brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen, 1876 src[2].ud); 1877 break; 1878 1879 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: 1880 assert(src[2].file == BRW_IMMEDIATE_VALUE); 
1881 brw_untyped_surface_write(p, src[0], src[1], inst->mlen, 1882 src[2].ud, inst->header_size); 1883 break; 1884 1885 case SHADER_OPCODE_MEMORY_FENCE: 1886 brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false); 1887 break; 1888 1889 case SHADER_OPCODE_FIND_LIVE_CHANNEL: { 1890 const struct brw_reg mask = 1891 brw_stage_has_packed_dispatch(devinfo, nir->info.stage, 1892 &prog_data->base) ? brw_imm_ud(~0u) : 1893 brw_dmask_reg(); 1894 brw_find_live_channel(p, dst, mask); 1895 break; 1896 } 1897 1898 case SHADER_OPCODE_BROADCAST: 1899 assert(inst->force_writemask_all); 1900 brw_broadcast(p, dst, src[0], src[1]); 1901 break; 1902 1903 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: 1904 generate_unpack_flags(p, dst); 1905 break; 1906 1907 case VEC4_OPCODE_MOV_BYTES: { 1908 /* Moves the low byte from each channel, using an Align1 access mode 1909 * and a <4,1,0> source region. 1910 */ 1911 assert(src[0].type == BRW_REGISTER_TYPE_UB || 1912 src[0].type == BRW_REGISTER_TYPE_B); 1913 1914 brw_set_default_access_mode(p, BRW_ALIGN_1); 1915 src[0].vstride = BRW_VERTICAL_STRIDE_4; 1916 src[0].width = BRW_WIDTH_1; 1917 src[0].hstride = BRW_HORIZONTAL_STRIDE_0; 1918 brw_MOV(p, dst, src[0]); 1919 brw_set_default_access_mode(p, BRW_ALIGN_16); 1920 break; 1921 } 1922 1923 case VEC4_OPCODE_DOUBLE_TO_F32: 1924 case VEC4_OPCODE_DOUBLE_TO_D32: 1925 case VEC4_OPCODE_DOUBLE_TO_U32: { 1926 assert(type_sz(src[0].type) == 8); 1927 assert(type_sz(dst.type) == 8); 1928 1929 brw_reg_type dst_type; 1930 1931 switch (inst->opcode) { 1932 case VEC4_OPCODE_DOUBLE_TO_F32: 1933 dst_type = BRW_REGISTER_TYPE_F; 1934 break; 1935 case VEC4_OPCODE_DOUBLE_TO_D32: 1936 dst_type = BRW_REGISTER_TYPE_D; 1937 break; 1938 case VEC4_OPCODE_DOUBLE_TO_U32: 1939 dst_type = BRW_REGISTER_TYPE_UD; 1940 break; 1941 default: 1942 unreachable("Not supported conversion"); 1943 } 1944 dst = retype(dst, dst_type); 1945 1946 brw_set_default_access_mode(p, BRW_ALIGN_1); 1947 1948 /* When converting from DF->F, we set destination's 
stride as 2 as an 1949 * aligment requirement. But in IVB/BYT, each DF implicitly writes 1950 * two floats, being the first one the converted value. So we don't 1951 * need to explicitly set stride 2, but 1. 1952 */ 1953 struct brw_reg spread_dst; 1954 if (devinfo->gen == 7 && !devinfo->is_haswell) 1955 spread_dst = stride(dst, 8, 4, 1); 1956 else 1957 spread_dst = stride(dst, 8, 4, 2); 1958 1959 brw_MOV(p, spread_dst, src[0]); 1960 1961 brw_set_default_access_mode(p, BRW_ALIGN_16); 1962 break; 1963 } 1964 1965 case VEC4_OPCODE_TO_DOUBLE: { 1966 assert(type_sz(src[0].type) == 4); 1967 assert(type_sz(dst.type) == 8); 1968 1969 brw_set_default_access_mode(p, BRW_ALIGN_1); 1970 1971 brw_MOV(p, dst, src[0]); 1972 1973 brw_set_default_access_mode(p, BRW_ALIGN_16); 1974 break; 1975 } 1976 1977 case VEC4_OPCODE_PICK_LOW_32BIT: 1978 case VEC4_OPCODE_PICK_HIGH_32BIT: { 1979 /* Stores the low/high 32-bit of each 64-bit element in src[0] into 1980 * dst using ALIGN1 mode and a <8,4,2>:UD region on the source. 1981 */ 1982 assert(type_sz(src[0].type) == 8); 1983 assert(type_sz(dst.type) == 4); 1984 1985 brw_set_default_access_mode(p, BRW_ALIGN_1); 1986 1987 dst = retype(dst, BRW_REGISTER_TYPE_UD); 1988 dst.hstride = BRW_HORIZONTAL_STRIDE_1; 1989 1990 src[0] = retype(src[0], BRW_REGISTER_TYPE_UD); 1991 if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT) 1992 src[0] = suboffset(src[0], 1); 1993 src[0] = spread(src[0], 2); 1994 brw_MOV(p, dst, src[0]); 1995 1996 brw_set_default_access_mode(p, BRW_ALIGN_16); 1997 break; 1998 } 1999 2000 case VEC4_OPCODE_SET_LOW_32BIT: 2001 case VEC4_OPCODE_SET_HIGH_32BIT: { 2002 /* Reads consecutive 32-bit elements from src[0] and writes 2003 * them to the low/high 32-bit of each 64-bit element in dst. 
2004 */ 2005 assert(type_sz(src[0].type) == 4); 2006 assert(type_sz(dst.type) == 8); 2007 2008 brw_set_default_access_mode(p, BRW_ALIGN_1); 2009 2010 dst = retype(dst, BRW_REGISTER_TYPE_UD); 2011 if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT) 2012 dst = suboffset(dst, 1); 2013 dst.hstride = BRW_HORIZONTAL_STRIDE_2; 2014 2015 src[0] = retype(src[0], BRW_REGISTER_TYPE_UD); 2016 brw_MOV(p, dst, src[0]); 2017 2018 brw_set_default_access_mode(p, BRW_ALIGN_16); 2019 break; 2020 } 2021 2022 case VEC4_OPCODE_PACK_BYTES: { 2023 /* Is effectively: 2024 * 2025 * mov(8) dst<16,4,1>:UB src<4,1,0>:UB 2026 * 2027 * but destinations' only regioning is horizontal stride, so instead we 2028 * have to use two instructions: 2029 * 2030 * mov(4) dst<1>:UB src<4,1,0>:UB 2031 * mov(4) dst.16<1>:UB src.16<4,1,0>:UB 2032 * 2033 * where they pack the four bytes from the low and high four DW. 2034 */ 2035 assert(_mesa_is_pow_two(dst.writemask) && 2036 dst.writemask != 0); 2037 unsigned offset = __builtin_ctz(dst.writemask); 2038 2039 dst.type = BRW_REGISTER_TYPE_UB; 2040 2041 brw_set_default_access_mode(p, BRW_ALIGN_1); 2042 2043 src[0].type = BRW_REGISTER_TYPE_UB; 2044 src[0].vstride = BRW_VERTICAL_STRIDE_4; 2045 src[0].width = BRW_WIDTH_1; 2046 src[0].hstride = BRW_HORIZONTAL_STRIDE_0; 2047 dst.subnr = offset * 4; 2048 struct brw_inst *insn = brw_MOV(p, dst, src[0]); 2049 brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4); 2050 brw_inst_set_no_dd_clear(p->devinfo, insn, true); 2051 brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check); 2052 2053 src[0].subnr = 16; 2054 dst.subnr = 16 + offset * 4; 2055 insn = brw_MOV(p, dst, src[0]); 2056 brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4); 2057 brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear); 2058 brw_inst_set_no_dd_check(p->devinfo, insn, true); 2059 2060 brw_set_default_access_mode(p, BRW_ALIGN_16); 2061 break; 2062 } 2063 2064 case TCS_OPCODE_URB_WRITE: 2065 generate_tcs_urb_write(p, inst, src[0]); 2066 
break; 2067 2068 case VEC4_OPCODE_URB_READ: 2069 generate_vec4_urb_read(p, inst, dst, src[0]); 2070 break; 2071 2072 case TCS_OPCODE_SET_INPUT_URB_OFFSETS: 2073 generate_tcs_input_urb_offsets(p, dst, src[0], src[1]); 2074 break; 2075 2076 case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: 2077 generate_tcs_output_urb_offsets(p, dst, src[0], src[1]); 2078 break; 2079 2080 case TCS_OPCODE_GET_INSTANCE_ID: 2081 generate_tcs_get_instance_id(p, dst); 2082 break; 2083 2084 case TCS_OPCODE_GET_PRIMITIVE_ID: 2085 generate_tcs_get_primitive_id(p, dst); 2086 break; 2087 2088 case TCS_OPCODE_CREATE_BARRIER_HEADER: 2089 generate_tcs_create_barrier_header(p, prog_data, dst); 2090 break; 2091 2092 case TES_OPCODE_CREATE_INPUT_READ_HEADER: 2093 generate_tes_create_input_read_header(p, dst); 2094 break; 2095 2096 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: 2097 generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]); 2098 break; 2099 2100 case TES_OPCODE_GET_PRIMITIVE_ID: 2101 generate_tes_get_primitive_id(p, dst); 2102 break; 2103 2104 case TCS_OPCODE_SRC0_010_IS_ZERO: 2105 /* If src_reg had stride like fs_reg, we wouldn't need this. 
*/ 2106 brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0)); 2107 break; 2108 2109 case TCS_OPCODE_RELEASE_INPUT: 2110 generate_tcs_release_input(p, dst, src[0], src[1]); 2111 break; 2112 2113 case TCS_OPCODE_THREAD_END: 2114 generate_tcs_thread_end(p, inst); 2115 break; 2116 2117 case SHADER_OPCODE_BARRIER: 2118 brw_barrier(p, src[0]); 2119 brw_WAIT(p); 2120 break; 2121 2122 case SHADER_OPCODE_MOV_INDIRECT: 2123 generate_mov_indirect(p, inst, dst, src[0], src[1]); 2124 break; 2125 2126 case BRW_OPCODE_DIM: 2127 assert(devinfo->is_haswell); 2128 assert(src[0].type == BRW_REGISTER_TYPE_DF); 2129 assert(dst.type == BRW_REGISTER_TYPE_DF); 2130 brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F)); 2131 break; 2132 2133 default: 2134 unreachable("Unsupported opcode"); 2135 } 2136 2137 if (inst->opcode == VEC4_OPCODE_PACK_BYTES) { 2138 /* Handled dependency hints in the generator. */ 2139 2140 assert(!inst->conditional_mod); 2141 } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { 2142 assert(p->nr_insn == pre_emit_nr_insn + 1 || 2143 !"conditional_mod, no_dd_check, or no_dd_clear set for IR " 2144 "emitting more than 1 instruction"); 2145 2146 brw_inst *last = &p->store[pre_emit_nr_insn]; 2147 2148 if (inst->conditional_mod) 2149 brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); 2150 brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); 2151 brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); 2152 } 2153 } 2154 2155 brw_set_uip_jip(p, 0); 2156 2157 /* end of program sentinel */ 2158 disasm_new_inst_group(disasm_info, p->next_insn_offset); 2159 2160#ifndef NDEBUG 2161 bool validated = 2162#else 2163 if (unlikely(debug_flag)) 2164#endif 2165 brw_validate_instructions(devinfo, p->store, 2166 0, p->next_insn_offset, 2167 disasm_info); 2168 2169 int before_size = p->next_insn_offset; 2170 brw_compact_instructions(p, 0, disasm_info); 2171 int after_size = p->next_insn_offset; 2172 2173 if (unlikely(debug_flag)) 
   {
      /* Debug dump: shader identification header, then summary statistics,
       * then the full disassembly of the generated native code.
       */
      fprintf(stderr, "Native code for %s %s shader %s:\n",
              nir->info.label ? nir->info.label : "unnamed",
              _mesa_shader_stage_to_string(nir->info.stage), nir->info.name);

      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
              "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
              stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
              spill_count, fill_count, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, disasm_info);
   }
   ralloc_free(disasm_info);
   /* 'validated' only exists in non-NDEBUG builds (see the #ifndef NDEBUG
    * block above); in NDEBUG builds this assert compiles away entirely.
    */
   assert(validated);

   /* Report the same summary statistics through the compiler's debug-log
    * callback regardless of the debug flag.
    */
   compiler->shader_debug_log(log_data,
                              "%s vec4 shader: %d inst, %d loops, %u cycles, "
                              "%d:%d spills:fills, compacted %d to %d bytes.",
                              stage_abbrev, before_size / 16,
                              loop_count, cfg->cycle_count, spill_count,
                              fill_count, before_size, after_size);

}

/**
 * Public entry point: generate native vec4 assembly for the given NIR
 * shader's CFG.
 *
 * \param compiler   Compiler context; supplies the device info used to
 *                   initialize the EU code generator.
 * \param log_data   Opaque pointer handed back to the compiler's
 *                   shader_debug_log callback (used inside generate_code).
 * \param mem_ctx    ralloc memory context that owns the codegen state and
 *                   the returned program buffer.
 * \param nir        Shader being compiled (used for debug identification
 *                   and dispatch-mode queries inside generate_code).
 * \param prog_data  VUE program data; generate_code fills in fields and
 *                   brw_get_program stores the final program size into
 *                   prog_data->base.program_size.
 * \param cfg        Control-flow graph of the IR to emit.
 *
 * \return Pointer to the assembled (and compacted) instruction stream,
 *         allocated from mem_ctx.
 */
extern "C" const unsigned *
brw_vec4_generate_assembly(const struct brw_compiler *compiler,
                           void *log_data,
                           void *mem_ctx,
                           const nir_shader *nir,
                           struct brw_vue_prog_data *prog_data,
                           const struct cfg_t *cfg)
{
   /* Zero-initialized codegen state, owned by mem_ctx. */
   struct brw_codegen *p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(compiler->devinfo, p, mem_ctx);
   /* vec4 instructions default to Align16 mode; generators that need
    * Align1 switch temporarily and restore this default.
    */
   brw_set_default_access_mode(p, BRW_ALIGN_16);

   generate_code(p, compiler, log_data, nir, prog_data, cfg);

   return brw_get_program(p, &prog_data->base.program_size);
}