brw_fs_nir.cpp revision 9f464c52
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/glsl/ir.h"
#include "brw_fs.h"
#include "brw_nir.h"
#include "nir_search_helpers.h"
#include "util/u_math.h"
#include "util/bitscan.h"

using namespace brw;

void
fs_visitor::emit_nir_code()
{
   /* emit the arrays used for inputs and outputs - load/store intrinsics will
    * be converted to reads/writes of these arrays
    */
   nir_setup_outputs();
   nir_setup_uniforms();
   nir_emit_system_values();

   nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));
}

void
fs_visitor::nir_setup_outputs()
{
   if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
      return;

   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };

   /* Calculate the size of output registers in a separate pass, before
    * allocating them.  With ARB_enhanced_layouts, multiple output variables
    * may occupy the same slot, but have different type sizes.
    */
   nir_foreach_variable(var, &nir->outputs) {
      const int loc = var->data.driver_location;
      const unsigned var_vec4s =
         var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
                           : type_size_vec4(var->type, true);
      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
   }

   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
      if (vec4s[loc] == 0) {
         loc++;
         continue;
      }

      unsigned reg_size = vec4s[loc];

      /* Check if there are any ranges that start within this range and extend
       * past it.  If so, include them in this allocation.
       */
      for (unsigned i = 1; i < reg_size; i++)
         reg_size = MAX2(vec4s[i + loc] + i, reg_size);

      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
      for (unsigned i = 0; i < reg_size; i++)
         outputs[loc + i] = offset(reg, bld, 4 * i);

      loc += reg_size;
   }
}

void
fs_visitor::nir_setup_uniforms()
{
   /* Only the first compile gets to set up uniforms. */
   if (push_constant_loc) {
      assert(pull_constant_loc);
      return;
   }

   uniforms = nir->num_uniforms / 4;

   if (stage == MESA_SHADER_COMPUTE) {
      /* Add a uniform for the subgroup ID.  It must be the last uniform
       * on the list.
       */
      assert(uniforms == prog_data->nr_params);
      uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1);
      *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
      subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
   }
}

static bool
emit_system_values_block(nir_block *block, fs_visitor *v)
{
   fs_reg *reg;

   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_vertex_id:
      case nir_intrinsic_load_base_vertex:
         unreachable("should be lowered by nir_lower_system_values().");

      case nir_intrinsic_load_vertex_id_zero_base:
      case nir_intrinsic_load_is_indexed_draw:
      case nir_intrinsic_load_first_vertex:
      case nir_intrinsic_load_instance_id:
      case nir_intrinsic_load_base_instance:
      case nir_intrinsic_load_draw_id:
         unreachable("should be lowered by brw_nir_lower_vs_inputs().");

      case nir_intrinsic_load_invocation_id:
         if (v->stage == MESA_SHADER_TESS_CTRL)
            break;
         assert(v->stage == MESA_SHADER_GEOMETRY);
         reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
         if (reg->file == BAD_FILE) {
            const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
            fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
            fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
            abld.SHR(iid, g1, brw_imm_ud(27u));
            *reg = iid;
         }
         break;

      case nir_intrinsic_load_sample_pos:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_samplepos_setup();
         break;

      case nir_intrinsic_load_sample_id:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_sampleid_setup();
         break;

      case nir_intrinsic_load_sample_mask_in:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         assert(v->devinfo->gen >= 7);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_samplemaskin_setup();
         break;

      case nir_intrinsic_load_work_group_id:
         assert(v->stage == MESA_SHADER_COMPUTE);
         reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_cs_work_group_id_setup();
         break;

      case nir_intrinsic_load_helper_invocation:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
         if (reg->file == BAD_FILE) {
            const fs_builder abld =
               v->bld.annotate("gl_HelperInvocation", NULL);

            /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
             * pixel mask is in g1.7 of the thread payload.
             *
             * We move the per-channel pixel enable bit to the low bit of each
             * channel by shifting the byte containing the pixel mask by the
             * vector immediate 0x76543210UV.
             *
             * The region of <1,8,0> reads only 1 byte (the pixel masks for
             * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
             * masks for 2 and 3) in SIMD16.
             */
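            /* For example, if only channel 2 of an 8-channel group is live,
             * the payload byte is 0x04: shifting it right by the per-channel
             * amounts 0..7 leaves a 1 in bit 0 of channel 2 and a 0 in bit 0
             * of every other channel.
             */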
            fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);

            for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
               const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
               hbld.SHR(offset(shifted, hbld, i),
                        stride(retype(brw_vec1_grf(1 + i, 7),
                                      BRW_REGISTER_TYPE_UB),
                               1, 8, 0),
                        brw_imm_v(0x76543210));
            }

            /* A set bit in the pixel mask means the channel is enabled, but
             * that is the opposite of gl_HelperInvocation so we need to invert
             * the mask.
             *
             * The negate source-modifier bit of logical instructions on Gen8+
             * performs 1's complement negation, so we can use that instead of
             * a NOT instruction.
             */
            fs_reg inverted = negate(shifted);
            if (v->devinfo->gen < 8) {
               inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
               abld.NOT(inverted, shifted);
            }

            /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
             * with 1 and negating.
             */
            fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
            abld.AND(anded, inverted, brw_imm_uw(1));

            fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
            abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
            *reg = dst;
         }
         break;

      default:
         break;
      }
   }

   return true;
}

void
fs_visitor::nir_emit_system_values()
{
   nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
      nir_system_values[i] = fs_reg();
   }

   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
    * never end up using it.
    */
   {
      const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
      fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
      reg = abld.vgrf(BRW_REGISTER_TYPE_UW);

      const fs_builder allbld8 = abld.group(8, 0).exec_all();
      allbld8.MOV(reg, brw_imm_v(0x76543210));
      if (dispatch_width > 8)
         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
      if (dispatch_width > 16) {
         const fs_builder allbld16 = abld.group(16, 0).exec_all();
         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
      }
   }

   nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
   nir_foreach_block(block, impl)
      emit_system_values_block(block, this);
}

/*
 * Returns a type based on a reference_type (word, float, half-float) and a
 * given bit_size.
 *
 * Reference BRW_REGISTER_TYPE are HF,F,DF,W,D,UW,UD.
 *
 * @FIXME: 64-bit return types are always DF on integer types to maintain
 * compatibility with uses of DF prior to the introduction of int64
 * support.
 */
static brw_reg_type
brw_reg_type_from_bit_size(const unsigned bit_size,
                           const brw_reg_type reference_type)
{
   switch (reference_type) {
   case BRW_REGISTER_TYPE_HF:
   case BRW_REGISTER_TYPE_F:
   case BRW_REGISTER_TYPE_DF:
      switch (bit_size) {
      case 16:
         return BRW_REGISTER_TYPE_HF;
      case 32:
         return BRW_REGISTER_TYPE_F;
      case 64:
         return BRW_REGISTER_TYPE_DF;
      default:
         unreachable("Invalid bit size");
      }
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_Q:
      switch (bit_size) {
      case 8:
         return BRW_REGISTER_TYPE_B;
      case 16:
         return BRW_REGISTER_TYPE_W;
      case 32:
         return BRW_REGISTER_TYPE_D;
      case 64:
         return BRW_REGISTER_TYPE_Q;
      default:
         unreachable("Invalid bit size");
      }
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_UD:
   case BRW_REGISTER_TYPE_UQ:
      switch (bit_size) {
      case 8:
         return BRW_REGISTER_TYPE_UB;
      case 16:
         return BRW_REGISTER_TYPE_UW;
      case 32:
         return BRW_REGISTER_TYPE_UD;
      case 64:
         return BRW_REGISTER_TYPE_UQ;
      default:
         unreachable("Invalid bit size");
      }
   default:
      unreachable("Unknown type");
   }
}

void
fs_visitor::nir_emit_impl(nir_function_impl *impl)
{
   nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
   for (unsigned i = 0; i < impl->reg_alloc; i++) {
      nir_locals[i] = fs_reg();
   }

   foreach_list_typed(nir_register, reg, node, &impl->registers) {
      unsigned array_elems =
         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
      unsigned size = array_elems * reg->num_components;
      const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B :
         brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
      nir_locals[reg->index] = bld.vgrf(reg_type, size);
   }

   nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
                             impl->ssa_alloc);

   nir_emit_cf_list(&impl->body);
}

void
fs_visitor::nir_emit_cf_list(exec_list *list)
{
   exec_list_validate(list);
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_if:
         nir_emit_if(nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         nir_emit_loop(nir_cf_node_as_loop(node));
         break;

      case nir_cf_node_block:
         nir_emit_block(nir_cf_node_as_block(node));
         break;

      default:
         unreachable("Invalid CFG node block");
      }
   }
}

void
fs_visitor::nir_emit_if(nir_if *if_stmt)
{
   bool invert;
   fs_reg cond_reg;

   /* If the condition has the form !other_condition, use other_condition as
    * the source, but invert the predicate on the if instruction.
    */
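   /* For example, NIR gives "if (!(a < b))" as an inot of the flt result;
    * the IF below then tests the flt result directly and sets
    * predicate_inverse instead of relying on the materialized inot.
    */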
   nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
   if (cond != NULL && cond->op == nir_op_inot) {
      assert(!cond->src[0].negate);
      assert(!cond->src[0].abs);

      invert = true;
      cond_reg = get_nir_src(cond->src[0].src);
   } else {
      invert = false;
      cond_reg = get_nir_src(if_stmt->condition);
   }

   /* first, put the condition into f0 */
   fs_inst *inst = bld.MOV(bld.null_reg_d(),
                           retype(cond_reg, BRW_REGISTER_TYPE_D));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;

   bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;

   nir_emit_cf_list(&if_stmt->then_list);

   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
      bld.emit(BRW_OPCODE_ELSE);
      nir_emit_cf_list(&if_stmt->else_list);
   }

   bld.emit(BRW_OPCODE_ENDIF);

   if (devinfo->gen < 7)
      limit_dispatch_width(16, "Non-uniform control flow unsupported "
                           "in SIMD32 mode.");
}

void
fs_visitor::nir_emit_loop(nir_loop *loop)
{
   bld.emit(BRW_OPCODE_DO);

   nir_emit_cf_list(&loop->body);

   bld.emit(BRW_OPCODE_WHILE);

   if (devinfo->gen < 7)
      limit_dispatch_width(16, "Non-uniform control flow unsupported "
                           "in SIMD32 mode.");
}

void
fs_visitor::nir_emit_block(nir_block *block)
{
   nir_foreach_instr(instr, block) {
      nir_emit_instr(instr);
   }
}

void
fs_visitor::nir_emit_instr(nir_instr *instr)
{
   const fs_builder abld = bld.annotate(NULL, instr);

   switch (instr->type) {
   case nir_instr_type_alu:
      nir_emit_alu(abld, nir_instr_as_alu(instr));
      break;

   case nir_instr_type_deref:
      unreachable("All derefs should've been lowered");
      break;

   case nir_instr_type_intrinsic:
      switch (stage) {
      case MESA_SHADER_VERTEX:
         nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_TESS_CTRL:
         nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_TESS_EVAL:
         nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_GEOMETRY:
         nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_FRAGMENT:
         nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_COMPUTE:
         nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      default:
         unreachable("unsupported shader stage");
      }
      break;

   case nir_instr_type_tex:
      nir_emit_texture(abld, nir_instr_as_tex(instr));
      break;

   case nir_instr_type_load_const:
      nir_emit_load_const(abld, nir_instr_as_load_const(instr));
      break;

   case nir_instr_type_ssa_undef:
      /* We create a new VGRF for undefs on every use (by handling
       * them in get_nir_src()), rather than for each definition.
       * This helps register coalescing eliminate MOVs from undef.
       */
      break;

   case nir_instr_type_jump:
      nir_emit_jump(abld, nir_instr_as_jump(instr));
      break;

   default:
      unreachable("unknown instruction type");
   }
}

/**
 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
 * match instr.
 */
bool
fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
                                      const fs_reg &result)
{
   if (!instr->src[0].src.is_ssa ||
       !instr->src[0].src.ssa->parent_instr)
      return false;

   if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *src0 =
      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);

   if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
       src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
      return false;

   /* If either opcode has source modifiers, bail.
    *
    * TODO: We can potentially handle source modifiers if both of the opcodes
    * we're combining are signed integers.
    */
   if (instr->src[0].abs || instr->src[0].negate ||
       src0->src[0].abs || src0->src[0].negate)
      return false;

   unsigned element = nir_src_as_uint(src0->src[1].src);

   /* Element type to extract. */
   const brw_reg_type type = brw_int_type(
      src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
      src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);

   fs_reg op0 = get_nir_src(src0->src[0].src);
   op0.type = brw_type_for_nir_type(devinfo,
      (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
                     nir_src_bit_size(src0->src[0].src)));
   op0 = offset(op0, bld, src0->src[0].swizzle[0]);

   set_saturate(instr->dest.saturate,
                bld.MOV(result, subscript(op0, type, element)));
   return true;
}

bool
fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
                                         const fs_reg &result)
{
   nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
   if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
      return false;

   if (!nir_src_is_const(instr->src[1].src) ||
       !nir_src_is_const(instr->src[2].src))
      return false;

   const float value1 = nir_src_as_float(instr->src[1].src);
   const float value2 = nir_src_as_float(instr->src[2].src);
   if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
      return false;

   /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
   assert(value1 == -value2);

   fs_reg tmp = vgrf(glsl_type::int_type);

   if (devinfo->gen >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
       *
       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1 == -1.0f) {
         g0.negate = true;
      }

      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
             g0, brw_imm_uw(0x3f80));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
       *
       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1 == -1.0f) {
         g1_6.negate = true;
      }

      bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
   }
   bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));

   return true;
}

static void
emit_find_msb_using_lzd(const fs_builder &bld,
                        const fs_reg &result,
                        const fs_reg &src,
                        bool is_signed)
{
   fs_inst *inst;
   fs_reg temp = src;

   if (is_signed) {
      /* LZD of an absolute value source almost always does the right
       * thing.  There are two problem values:
       *
       * * 0x80000000.  Since abs(0x80000000) == 0x80000000, LZD returns
       *   0.  However, findMSB(int(0x80000000)) == 30.
       *
       * * 0xffffffff.  Since abs(0xffffffff) == 1, LZD returns
       *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
       *
       *      For a value of zero or negative one, -1 will be returned.
       *
       * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
       *   findMSB(-(1<<x)) should return x-1.
       *
       * For all negative number cases, including 0x80000000 and
       * 0xffffffff, the correct value is obtained from LZD if instead of
       * negating the (already negative) value the logical-not is used.  A
       * conditional logical-not can be achieved in two instructions.
       */
      temp = bld.vgrf(BRW_REGISTER_TYPE_D);

      bld.ASR(temp, src, brw_imm_d(31));
      bld.XOR(temp, temp, src);
   }

   bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
           retype(temp, BRW_REGISTER_TYPE_UD));

   /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
    * from the LSB side.  Subtract the result from 31 to convert the MSB
    * count into an LSB count.  If no bits are set, LZD will return 32.
    * 31-32 = -1, which is exactly what findMSB() is supposed to return.
    */
   inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
   inst->src[0].negate = true;
}

static brw_rnd_mode
brw_rnd_mode_from_nir_op(const nir_op op)
{
   switch (op) {
   case nir_op_f2f16_rtz:
      return BRW_RND_MODE_RTZ;
   case nir_op_f2f16_rtne:
      return BRW_RND_MODE_RTNE;
   default:
      unreachable("Operation doesn't support rounding mode");
   }
}

fs_reg
fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
                                                nir_alu_instr *instr,
                                                fs_reg *op,
                                                bool need_dest)
{
   fs_reg result =
      need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud();

   result.type = brw_type_for_nir_type(devinfo,
      (nir_alu_type)(nir_op_infos[instr->op].output_type |
                     nir_dest_bit_size(instr->dest.dest)));

   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      op[i] = get_nir_src(instr->src[i].src);
      op[i].type = brw_type_for_nir_type(devinfo,
         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
                        nir_src_bit_size(instr->src[i].src)));
      op[i].abs = instr->src[i].abs;
      op[i].negate = instr->src[i].negate;
   }

   /* Move and vecN instructions may still be vectored.  Return the raw,
    * vectored source and destination so that fs_visitor::nir_emit_alu can
    * handle it.  Other callers should not have to handle these kinds of
    * instructions.
    */
   switch (instr->op) {
   case nir_op_imov:
   case nir_op_fmov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
      return result;
   default:
      break;
   }

   /* At this point, we have dealt with any instruction that operates on
    * more than a single channel.  Therefore, we can just adjust the source
    * and destination registers for that channel and emit the instruction.
    */
   unsigned channel = 0;
   if (nir_op_infos[instr->op].output_size == 0) {
      /* Since NIR is doing the scalarizing for us, we should only ever see
       * vectorized operations with a single channel.
       */
      assert(util_bitcount(instr->dest.write_mask) == 1);
      channel = ffs(instr->dest.write_mask) - 1;

      result = offset(result, bld, channel);
   }

   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
   }

   return result;
}

void
fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
                                 fs_reg *op)
{
   for (unsigned i = 0; i < 2; i++) {
      nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);

      if (inot_instr != NULL && inot_instr->op == nir_op_inot &&
          !inot_instr->src[0].abs && !inot_instr->src[0].negate) {
         /* The source of the inot is now the source of instr. */
         prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);

         assert(!op[i].negate);
         op[i].negate = true;
      } else {
         op[i] = resolve_source_modifiers(op[i]);
      }
   }
}

bool
fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
                                  fs_reg result,
                                  nir_alu_instr *instr)
{
   if (devinfo->gen < 6 || devinfo->gen >= 12)
      return false;

   nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);

   if (inot_instr == NULL || inot_instr->op != nir_op_inot)
      return false;

   /* HF is also possible as a destination on BDW+.  For nir_op_b2i, the set
    * of valid size-changing combinations is a bit more complex.
    *
    * The source restriction is just because I was lazy about generating the
    * constant below.
    */
   if (nir_dest_bit_size(instr->dest.dest) != 32 ||
       nir_src_bit_size(inot_instr->src[0].src) != 32)
      return false;

   /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0.  Since a can only be 0 or -1,
    * this is float(1 + a).
    */
   fs_reg op;

   prepare_alu_destination_and_sources(bld, inot_instr, &op, false);

   /* Ignore the saturate modifier, if there is one.  The result of the
    * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
    */
   bld.ADD(result, op, brw_imm_d(1));

   return true;
}

/**
 * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
 *
 * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
 * the source of \c instr that is a \c nir_op_fsign.
 */
void
fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr,
                       fs_reg result, fs_reg *op, unsigned fsign_src)
{
   fs_inst *inst;

   assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
   assert(fsign_src < nir_op_infos[instr->op].num_inputs);

   if (instr->op != nir_op_fsign) {
      const nir_alu_instr *const fsign_instr =
         nir_src_as_alu_instr(instr->src[fsign_src].src);

      assert(!fsign_instr->dest.saturate);

      /* op[fsign_src] has the nominal result of the fsign, and
       * op[1 - fsign_src] has the other multiply source.  This must be
       * rearranged so that op[0] is the source of the fsign and op[1] is
       * the other multiply source.
       */
      if (fsign_src != 0)
         op[1] = op[0];

      op[0] = get_nir_src(fsign_instr->src[0].src);

      const nir_alu_type t =
         (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
                        nir_src_bit_size(fsign_instr->src[0].src));

      op[0].type = brw_type_for_nir_type(devinfo, t);
      op[0].abs = fsign_instr->src[0].abs;
      op[0].negate = fsign_instr->src[0].negate;

      unsigned channel = 0;
      if (nir_op_infos[instr->op].output_size == 0) {
         /* Since NIR is doing the scalarizing for us, we should only ever see
          * vectorized operations with a single channel.
          */
         assert(util_bitcount(instr->dest.write_mask) == 1);
         channel = ffs(instr->dest.write_mask) - 1;
      }

      op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
   } else {
      assert(!instr->dest.saturate);
   }

   if (op[0].abs) {
      /* Straightforward since the source can be assumed to be either strictly
       * >= 0 or strictly <= 0 depending on the setting of the negate flag.
       */
      set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));

      if (instr->op == nir_op_fsign) {
         inst = (op[0].negate)
            ? bld.MOV(result, brw_imm_f(-1.0f))
            : bld.MOV(result, brw_imm_f(1.0f));
      } else {
         op[1].negate = (op[0].negate != op[1].negate);
         inst = bld.MOV(result, op[1]);
      }

      set_predicate(BRW_PREDICATE_NORMAL, inst);
   } else if (type_sz(op[0].type) == 2) {
      /* AND(val, 0x8000) gives the sign bit.
       *
       * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
       */
      fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
      bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);

      op[0].type = BRW_REGISTER_TYPE_UW;
      result.type = BRW_REGISTER_TYPE_UW;
      bld.AND(result, op[0], brw_imm_uw(0x8000u));

      if (instr->op == nir_op_fsign)
         inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
      else {
         /* Use XOR here to get the result sign correct. */
         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
      }

      inst->predicate = BRW_PREDICATE_NORMAL;
   } else if (type_sz(op[0].type) == 4) {
      /* AND(val, 0x80000000) gives the sign bit.
       *
       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
       * zero.
       */
      bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);

      op[0].type = BRW_REGISTER_TYPE_UD;
      result.type = BRW_REGISTER_TYPE_UD;
      bld.AND(result, op[0], brw_imm_ud(0x80000000u));

      if (instr->op == nir_op_fsign)
         inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
      else {
         /* Use XOR here to get the result sign correct. */
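         /* At this point result holds only op[0]'s sign bit, so XORing it
          * into op[1] flips op[1]'s sign exactly when op[0] is negative,
          * which is op[1] * sign(op[0]) for any non-zero op[0].
          */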
         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
      }

      inst->predicate = BRW_PREDICATE_NORMAL;
   } else {
      /* For doubles we do the same, but we need to consider:
       *
       * - 2-src instructions can't operate with 64-bit immediates
       * - The sign is encoded in the high 32 bits of each DF
       * - We need to produce a DF result.
       */

      fs_reg zero = vgrf(glsl_type::double_type);
      bld.MOV(zero, setup_imm_df(bld, 0.0));
      bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);

      bld.MOV(result, zero);

      fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
      bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
              brw_imm_ud(0x80000000u));

      if (instr->op == nir_op_fsign) {
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
      } else {
         /* This could be done better in some cases.  If the scale is an
          * immediate with the low 32-bits all 0, emitting a separate XOR and
          * OR would allow an algebraic optimization to remove the OR.  There
          * are currently zero instances of fsign(double(x))*IMM in shader-db
          * or any test suite, so it is hard to care at this time.
          */
         fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
         inst = bld.XOR(result_int64, result_int64,
                        retype(op[1], BRW_REGISTER_TYPE_UQ));
      }
   }
}

/**
 * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
 *
 * Checks the operands of a \c nir_op_fmul to determine whether or not
 * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
 *
 * \param instr  The multiplication instruction
 *
 * \param fsign_src  The source of \c instr that may or may not be a
 *                   \c nir_op_fsign
 */
static bool
can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
{
   assert(instr->op == nir_op_fmul);

   nir_alu_instr *const fsign_instr =
      nir_src_as_alu_instr(instr->src[fsign_src].src);

   /* Rules:
    *
    * 1. instr->src[fsign_src] must be a nir_op_fsign.
    * 2. The nir_op_fsign can only be used by this multiplication.
    * 3. The source that is the nir_op_fsign does not have source modifiers.
    *    \c emit_fsign only examines the source modifiers of the source of
    *    the \c nir_op_fsign.
    *
    * The nir_op_fsign must also not have the saturate modifier, but steps
    * have already been taken (in nir_opt_algebraic) to ensure that.
    */
   return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
          is_used_once(fsign_instr) &&
          !instr->src[fsign_src].abs && !instr->src[fsign_src].negate;
}

void
fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
{
   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
   fs_inst *inst;

   fs_reg op[4];
   fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, true);

   switch (instr->op) {
   case nir_op_imov:
   case nir_op_fmov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4: {
      fs_reg temp = result;
      bool need_extra_copy = false;
      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
         if (!instr->src[i].src.is_ssa &&
             instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
            need_extra_copy = true;
            temp = bld.vgrf(result.type, 4);
            break;
         }
      }

      for (unsigned i = 0; i < 4; i++) {
         if (!(instr->dest.write_mask & (1 << i)))
            continue;

         if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
            inst = bld.MOV(offset(temp, bld, i),
                           offset(op[0], bld, instr->src[0].swizzle[i]));
         } else {
            inst = bld.MOV(offset(temp, bld, i),
                           offset(op[i], bld, instr->src[i].swizzle[0]));
         }
         inst->saturate = instr->dest.saturate;
      }

      /* In this case the source and destination registers were the same,
       * so we need to insert an extra set of moves in order to deal with
       * any swizzling.
       */
      if (need_extra_copy) {
         for (unsigned i = 0; i < 4; i++) {
            if (!(instr->dest.write_mask & (1 << i)))
               continue;

            bld.MOV(offset(result, bld, i), offset(temp, bld, i));
         }
      }
      return;
   }

   case nir_op_i2f32:
   case nir_op_u2f32:
      if (optimize_extract_to_float(instr, result))
         return;
      inst = bld.MOV(result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
      bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
               brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
      /* fallthrough */
   case nir_op_f2f16:
      /* In theory, it would be better to use BRW_OPCODE_F32TO16.  Depending
       * on the HW gen, it is a special hw opcode or just a MOV, and
       * brw_F32TO16 (at brw_eu_emit) would do the work to choose.
       *
       * But if we want to use that opcode, we would need to support it in
       * the different optimizations and lowerings.  Since HF support is
       * currently gen8+ only, it is better to use the MOV directly and
       * switch to BRW_OPCODE_F32TO16 when/if we add HF support on gen7.
       */
      assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
      inst = bld.MOV(result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
   case nir_op_b2i64:
   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_b2f64:
      if (try_emit_b2fi_of_inot(bld, result, instr))
         break;
      op[0].type = BRW_REGISTER_TYPE_D;
      op[0].negate = !op[0].negate;
      /* fallthrough */
   case nir_op_i2f64:
   case nir_op_i2i64:
   case nir_op_u2f64:
   case nir_op_u2u64:
   case nir_op_f2f64:
   case nir_op_f2i64:
   case nir_op_f2u64:
   case nir_op_i2i32:
   case nir_op_u2u32:
   case nir_op_f2f32:
   case nir_op_f2i32:
   case nir_op_f2u32:
   case nir_op_i2f16:
   case nir_op_i2i16:
   case nir_op_u2f16:
   case nir_op_u2u16:
   case nir_op_f2i16:
   case nir_op_f2u16:
   case nir_op_i2i8:
   case nir_op_u2u8:
   case nir_op_f2i8:
   case nir_op_f2u8:
      if (result.type == BRW_REGISTER_TYPE_B ||
          result.type == BRW_REGISTER_TYPE_UB ||
          result.type == BRW_REGISTER_TYPE_HF)
         assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */

      if (op[0].type == BRW_REGISTER_TYPE_B ||
          op[0].type == BRW_REGISTER_TYPE_UB ||
          op[0].type == BRW_REGISTER_TYPE_HF)
         assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */

      inst = bld.MOV(result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fsign:
      emit_fsign(bld, instr, result, op, 0);
      break;

   case nir_op_frcp:
      inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fexp2:
      inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_flog2:
      inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fsin:
      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fcos:
      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fddx:
      if (fs_key->high_quality_derivatives) {
         inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
      } else {
         inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
      }
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fddx_fine:
      inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fddx_coarse:
      inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fddy:
      if (fs_key->high_quality_derivatives) {
         inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
      } else {
         inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
      }
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fddy_fine:
      inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fddy_coarse:
      inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_iadd:
   case nir_op_fadd:
      inst = bld.ADD(result, op[0], op[1]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_uadd_sat:
      inst = bld.ADD(result, op[0], op[1]);
      inst->saturate = true;
      break;

   case nir_op_fmul:
      for (unsigned i = 0; i < 2; i++) {
         if (can_fuse_fmul_fsign(instr, i)) {
            emit_fsign(bld, instr, result, op, i);
            return;
         }
      }

      inst = bld.MUL(result, op[0], op[1]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      bld.MUL(result, op[0], op[1]);
      break;

   case nir_op_imul:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.MUL(result, op[0], op[1]);
      break;

   case nir_op_imul_high:
   case nir_op_umul_high:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
      break;

   case nir_op_idiv:
   case nir_op_udiv:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
      break;

   case nir_op_uadd_carry:
      unreachable("Should have been lowered by carry_to_arith().");

   case nir_op_usub_borrow:
      unreachable("Should have been lowered by borrow_to_arith().");

   case nir_op_umod:
   case nir_op_irem:
      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
       * appears that our hardware just does the right thing for signed
       * remainder.
       */
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
      break;

   case nir_op_imod: {
      /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);

      /* Math instructions don't support conditional mod */
      inst = bld.MOV(bld.null_reg_d(), result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;

      /* Now, we need to determine if the signs of the sources are different.
       * When we XOR the sources, the top bit is 0 if they are the same and 1
       * if they are different.  We can then use a conditional modifier to
       * turn that into a predicate.  This leads us to an XOR.l instruction.
       *
       * Technically, according to the PRM, you're not allowed to use .l on an
       * XOR instruction.  However, empirical experiments and Curro's reading
       * of the simulator source both indicate that it's safe.
       */
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
      inst = bld.XOR(tmp, op[0], op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->conditional_mod = BRW_CONDITIONAL_L;

      /* If the result of the initial remainder operation is non-zero and the
       * two sources have different signs, add in a copy of op[1] to get the
       * final integer modulus value.
       */
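      /* For example, -7 imod 3: the C-style remainder is -1, the signs of
       * the sources differ and the remainder is non-zero, so adding op[1]
       * gives 2, matching a modulus that takes the divisor's sign.
       */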
      inst = bld.ADD(result, result, op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }

   case nir_op_flt32:
   case nir_op_fge32:
   case nir_op_feq32:
   case nir_op_fne32: {
      fs_reg dest = result;

      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
      if (bit_size != 32)
         dest = bld.vgrf(op[0].type, 1);

      brw_conditional_mod cond;
      switch (instr->op) {
      case nir_op_flt32:
         cond = BRW_CONDITIONAL_L;
         break;
      case nir_op_fge32:
         cond = BRW_CONDITIONAL_GE;
         break;
      case nir_op_feq32:
         cond = BRW_CONDITIONAL_Z;
         break;
      case nir_op_fne32:
         cond = BRW_CONDITIONAL_NZ;
         break;
      default:
         unreachable("bad opcode");
      }

      bld.CMP(dest, op[0], op[1], cond);

      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
      } else if (bit_size < 32) {
         /* When we convert the result to 32-bit we need to be careful and do
          * it as a signed conversion to get sign extension (for 32-bit true)
          */
         const brw_reg_type src_type =
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);

         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
      }
      break;
   }

   case nir_op_ilt32:
   case nir_op_ult32:
   case nir_op_ige32:
   case nir_op_uge32:
   case nir_op_ieq32:
   case nir_op_ine32: {
      fs_reg dest = result;

      /* On Gen11 there is the additional restriction that src1 cannot be a
       * byte type, so we convert both operands for the comparison.
       */
      fs_reg temp_op[2];
      temp_op[0] = bld.fix_byte_src(op[0]);
      temp_op[1] = bld.fix_byte_src(op[1]);

      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
      if (bit_size != 32)
         dest = bld.vgrf(temp_op[0].type, 1);

      brw_conditional_mod cond;
      switch (instr->op) {
      case nir_op_ilt32:
      case nir_op_ult32:
         cond = BRW_CONDITIONAL_L;
         break;
      case nir_op_ige32:
      case nir_op_uge32:
         cond = BRW_CONDITIONAL_GE;
         break;
      case nir_op_ieq32:
         cond = BRW_CONDITIONAL_Z;
         break;
      case nir_op_ine32:
         cond = BRW_CONDITIONAL_NZ;
         break;
      default:
         unreachable("bad opcode");
      }
      bld.CMP(dest, temp_op[0], temp_op[1], cond);

      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
      } else if (bit_size < 32) {
         /* When we convert the result to 32-bit we need to be careful and do
          * it as a signed conversion to get sign extension (for 32-bit true)
          */
         const brw_reg_type src_type =
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);

         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
      }
      break;
   }

   case nir_op_inot:
      if (devinfo->gen >= 8) {
         nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);

         if (inot_src_instr != NULL &&
             (inot_src_instr->op == nir_op_ior ||
              inot_src_instr->op == nir_op_ixor ||
              inot_src_instr->op == nir_op_iand) &&
             !inot_src_instr->src[0].abs &&
             !inot_src_instr->src[0].negate &&
             !inot_src_instr->src[1].abs &&
             !inot_src_instr->src[1].negate) {
            /* The sources of the source logical instruction are now the
             * sources of the instruction that will be generated.
             */
            prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);
            resolve_inot_sources(bld, inot_src_instr, op);

            /* Smash all of the sources and destination to be signed.  This
             * doesn't matter for the operation of the instruction, but cmod
             * propagation fails on unsigned sources with negation (due to
             * fs_inst::can_do_cmod returning false).
             */
            result.type =
               brw_type_for_nir_type(devinfo,
                                     (nir_alu_type)(nir_type_int |
                                                    nir_dest_bit_size(instr->dest.dest)));
            op[0].type =
               brw_type_for_nir_type(devinfo,
                                     (nir_alu_type)(nir_type_int |
                                                    nir_src_bit_size(inot_src_instr->src[0].src)));
            op[1].type =
               brw_type_for_nir_type(devinfo,
                                     (nir_alu_type)(nir_type_int |
                                                    nir_src_bit_size(inot_src_instr->src[1].src)));

            /* For XOR, only invert one of the sources.  Arbitrarily choose
             * the first source.
             */
            op[0].negate = !op[0].negate;
            if (inot_src_instr->op != nir_op_ixor)
               op[1].negate = !op[1].negate;

            switch (inot_src_instr->op) {
            case nir_op_ior:
               bld.AND(result, op[0], op[1]);
               return;

            case nir_op_iand:
               bld.OR(result, op[0], op[1]);
               return;

            case nir_op_ixor:
               bld.XOR(result, op[0], op[1]);
               return;

            default:
               unreachable("impossible opcode");
            }
         }
         op[0] = resolve_source_modifiers(op[0]);
      }
      bld.NOT(result, op[0]);
      break;
   case nir_op_ixor:
      if (devinfo->gen >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.XOR(result, op[0], op[1]);
      break;
   case nir_op_ior:
      if (devinfo->gen >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.OR(result, op[0], op[1]);
      break;
   case nir_op_iand:
      if (devinfo->gen >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.AND(result, op[0], op[1]);
      break;

   case nir_op_fdot2:
   case nir_op_fdot3:
   case nir_op_fdot4:
   case nir_op_b32all_fequal2:
   case nir_op_b32all_iequal2:
   case nir_op_b32all_fequal3:
   case nir_op_b32all_iequal3:
   case nir_op_b32all_fequal4:
   case nir_op_b32all_iequal4:
   case nir_op_b32any_fnequal2:
   case nir_op_b32any_inequal2:
   case nir_op_b32any_fnequal3:
   case nir_op_b32any_inequal3:
   case nir_op_b32any_fnequal4:
   case nir_op_b32any_inequal4:
      unreachable("Lowered by nir_lower_alu_reductions");

   case nir_op_fnoise1_1:
   case nir_op_fnoise1_2:
   case nir_op_fnoise1_3:
   case nir_op_fnoise1_4:
   case nir_op_fnoise2_1:
   case nir_op_fnoise2_2:
   case nir_op_fnoise2_3:
   case nir_op_fnoise2_4:
   case nir_op_fnoise3_1:
   case nir_op_fnoise3_2:
   case nir_op_fnoise3_3:
   case nir_op_fnoise3_4:
   case nir_op_fnoise4_1:
   case nir_op_fnoise4_2:
   case nir_op_fnoise4_3:
   case nir_op_fnoise4_4:
      unreachable("not reached: should be handled by lower_noise");

   case nir_op_ldexp:
      unreachable("not reached: should be handled by ldexp_to_arith()");

   case nir_op_fsqrt:
      inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_frsq:
      inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_i2b32:
   case nir_op_f2b32: {
      uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
      if (bit_size == 64) {
         /* two-argument instructions can't take 64-bit immediates */
         fs_reg zero;
         fs_reg tmp;

         if (instr->op == nir_op_f2b32) {
            zero = vgrf(glsl_type::double_type);
            tmp = vgrf(glsl_type::double_type);
            bld.MOV(zero, setup_imm_df(bld, 0.0));
         } else {
            zero = vgrf(glsl_type::int64_t_type);
            tmp = vgrf(glsl_type::int64_t_type);
            bld.MOV(zero, brw_imm_q(0));
         }

         /* A SIMD16 execution needs to be split into two instructions, so use
          * a VGRF instead of the flag register as dst so that instruction
          * splitting works.
          */
         bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
         bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
      } else {
         fs_reg zero;
         if (bit_size == 32) {
            zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0);
         } else {
            assert(bit_size == 16);
            zero = instr->op == nir_op_f2b32 ?
               retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
         }
         bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
      }
      break;
   }

   case nir_op_ftrunc:
      inst = bld.RNDZ(result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fceil: {
      op[0].negate = !op[0].negate;
      fs_reg temp = vgrf(glsl_type::float_type);
      bld.RNDD(temp, op[0]);
      temp.negate = true;
      inst = bld.MOV(result, temp);
      inst->saturate = instr->dest.saturate;
      break;
   }
   case nir_op_ffloor:
      inst = bld.RNDD(result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_ffract:
      inst = bld.FRC(result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_fround_even:
      inst = bld.RNDE(result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fquantize2f16: {
      fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
      fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
      fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);

      /* The destination stride must be at least as big as the source stride.
       */
      tmp16.type = BRW_REGISTER_TYPE_W;
      tmp16.stride = 2;

      /* Check for denormal */
      fs_reg abs_src0 = op[0];
      abs_src0.abs = true;
      bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
              BRW_CONDITIONAL_L);
      /* Get the appropriately signed zero */
      bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
              retype(op[0], BRW_REGISTER_TYPE_UD),
              brw_imm_ud(0x80000000));
      /* Do the actual F32 -> F16 -> F32 conversion */
      bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
      bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
      /* Select that or zero based on normal status */
      inst = bld.SEL(result, zero, tmp32);
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->saturate = instr->dest.saturate;
      break;
   }

   case nir_op_imin:
   case nir_op_umin:
   case nir_op_fmin:
      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_imax:
   case nir_op_umax:
   case nir_op_fmax:
      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_pack_snorm_2x16:
   case nir_op_pack_snorm_4x8:
   case nir_op_pack_unorm_2x16:
   case nir_op_pack_unorm_4x8:
   case nir_op_unpack_snorm_2x16:
   case nir_op_unpack_snorm_4x8:
   case nir_op_unpack_unorm_2x16:
   case nir_op_unpack_unorm_4x8:
   case nir_op_unpack_half_2x16:
   case nir_op_pack_half_2x16:
      unreachable("not reached: should be handled by lower_packing_builtins");

   case nir_op_unpack_half_2x16_split_x:
      inst = bld.emit(BRW_OPCODE_F16TO32, result,
                      subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
      inst->saturate = instr->dest.saturate;
      break;
   case nir_op_unpack_half_2x16_split_y:
      inst = bld.emit(BRW_OPCODE_F16TO32, result,
                      subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_pack_64_2x32_split:
   case nir_op_pack_32_2x16_split:
      bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
      break;

   case nir_op_unpack_64_2x32_split_x:
   case nir_op_unpack_64_2x32_split_y: {
      if (instr->op == nir_op_unpack_64_2x32_split_x)
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
      else
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
      break;
   }

   case nir_op_unpack_32_2x16_split_x:
   case nir_op_unpack_32_2x16_split_y: {
      if (instr->op == nir_op_unpack_32_2x16_split_x)
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
      else
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
      break;
   }

   case nir_op_fpow:
      inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_bitfield_reverse:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFREV(result, op[0]);
      break;

   case nir_op_bit_count:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.CBIT(result, op[0]);
      break;

   case nir_op_ufind_msb: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit_find_msb_using_lzd(bld, result, op[0], false);
      break;
   }

   case nir_op_ifind_msb: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);

      if (devinfo->gen < 7) {
         emit_find_msb_using_lzd(bld, result, op[0], true);
      } else {
         bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);

         /* FBH counts from the MSB side, while GLSL's findMSB() wants the
          * count from the LSB side.  If FBH didn't return an error
          * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
          * count into an LSB count.
          */
         bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);

         inst = bld.ADD(result, result, brw_imm_d(31));
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->src[0].negate = true;
      }
      break;
   }

   case nir_op_find_lsb:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);

      if (devinfo->gen < 7) {
         fs_reg temp = vgrf(glsl_type::int_type);

         /* (x & -x) generates a value that consists of only the LSB of x.
          * For all powers of 2, findMSB(y) == findLSB(y).
          */
         fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
         fs_reg negated_src = src;

         /* One must be negated, and the other must be non-negated.  It
          * doesn't matter which is which.
          */
         negated_src.negate = true;
         src.negate = false;

         bld.AND(temp, src, negated_src);
         emit_find_msb_using_lzd(bld, result, temp, false);
      } else {
         bld.FBL(result, op[0]);
      }
      break;

   case nir_op_ubitfield_extract:
   case nir_op_ibitfield_extract:
      unreachable("should have been lowered");
   case nir_op_ubfe:
   case nir_op_ibfe:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFE(result, op[2], op[1], op[0]);
      break;
   case nir_op_bfm:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFI1(result, op[0], op[1]);
      break;
   case nir_op_bfi:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFI2(result, op[0], op[1], op[2]);
      break;

   case nir_op_bitfield_insert:
      unreachable("not reached: should have been lowered");

   case nir_op_ishl:
      bld.SHL(result, op[0], op[1]);
      break;
   case nir_op_ishr:
      bld.ASR(result, op[0], op[1]);
      break;
   case nir_op_ushr:
      bld.SHR(result, op[0], op[1]);
      break;

   case nir_op_pack_half_2x16_split:
      bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
      break;

   case nir_op_ffma:
      inst = bld.MAD(result, op[2], op[1], op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_flrp:
      inst = bld.LRP(result, op[0], op[1], op[2]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_b32csel:
      if (optimize_frontfacing_ternary(instr, result))
         return;

      bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
      inst = bld.SEL(result, op[1], op[2]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;

   case nir_op_extract_u8:
   case nir_op_extract_i8: {
      unsigned byte = nir_src_as_uint(instr->src[1].src);

      /* The PRMs say:
       *
       *    BDW+
       *    There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB.
       *    Use two instructions and a word or DWord intermediate integer type.
       */
      if (nir_dest_bit_size(instr->dest.dest) == 64) {
         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);

         if (instr->op == nir_op_extract_i8) {
            /* If we need to sign extend, extract to a word first */
            fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W);
            bld.MOV(w_temp, subscript(op[0], type, byte));
            bld.MOV(result, w_temp);
         } else if (byte & 1) {
            /* Extract the high byte from the word containing the desired byte
             * offset.
             */
            bld.SHR(result,
                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
                    brw_imm_uw(8));
         } else {
            /* Otherwise use an AND with 0xff and a word type */
            bld.AND(result,
                    subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2),
                    brw_imm_uw(0xff));
         }
      } else {
         const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
         bld.MOV(result, subscript(op[0], type, byte));
      }
      break;
   }

   case nir_op_extract_u16:
   case nir_op_extract_i16: {
      const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
      unsigned word = nir_src_as_uint(instr->src[1].src);
      bld.MOV(result, subscript(op[0], type, word));
      break;
   }

   default:
      unreachable("unhandled instruction");
   }

   /* If we need to do a boolean resolve, replace the result with -(x & 1)
    * to sign extend the low bit to 0/~0
    */
   if (devinfo->gen <= 5 &&
       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) ==
       BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
      fs_reg masked = vgrf(glsl_type::int_type);
      bld.AND(masked, result, brw_imm_d(1));
      masked.negate = true;
      bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
   }
}

void
fs_visitor::nir_emit_load_const(const fs_builder &bld,
                                nir_load_const_instr *instr)
{
   const brw_reg_type reg_type =
      brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D);
   fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);

   switch (instr->def.bit_size) {
   case 8:
      for (unsigned i = 0; i < instr->def.num_components; i++)
         bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8));
      break;

   case 16:
      for (unsigned i = 0; i < instr->def.num_components; i++)
         bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16));
      break;

   case 32:
      for (unsigned i = 0; i < instr->def.num_components; i++)
         bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32));
      break;

   case 64:
      assert(devinfo->gen >= 7);
      if (devinfo->gen == 7) {
         /* We don't get 64-bit integer types until gen8 */
         for (unsigned i = 0; i < instr->def.num_components; i++) {
            bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF),
                    setup_imm_df(bld, instr->value[i].f64));
         }
      } else {
         for (unsigned i = 0; i < instr->def.num_components; i++)
            bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64));
      }
      break;

   default:
      unreachable("Invalid bit size");
   }

   nir_ssa_values[instr->def.index] = reg;
}

fs_reg
fs_visitor::get_nir_src(const nir_src &src)
{
   fs_reg reg;
   if (src.is_ssa) {
      if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
         const brw_reg_type reg_type =
            brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D);
         reg = bld.vgrf(reg_type, src.ssa->num_components);
      } else {
         reg = nir_ssa_values[src.ssa->index];
1890 } 1891 } else { 1892 /* We don't handle indirects on locals */ 1893 assert(src.reg.indirect == NULL); 1894 reg = offset(nir_locals[src.reg.reg->index], bld, 1895 src.reg.base_offset * src.reg.reg->num_components); 1896 } 1897 1898 if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) { 1899 /* The only 64-bit type available on gen7 is DF, so use that. */ 1900 reg.type = BRW_REGISTER_TYPE_DF; 1901 } else { 1902 /* To avoid floating-point denorm flushing problems, set the type by 1903 * default to an integer type - instructions that need floating point 1904 * semantics will set this to F if they need to 1905 */ 1906 reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src), 1907 BRW_REGISTER_TYPE_D); 1908 } 1909 1910 return reg; 1911} 1912 1913/** 1914 * Return an IMM for constants; otherwise call get_nir_src() as normal. 1915 * 1916 * This function should not be called on any value which may be 64 bits. 1917 * We could theoretically support 64-bit on gen8+ but we choose not to 1918 * because it wouldn't work in general (no gen7 support) and there are 1919 * enough restrictions in 64-bit immediates that you can't take the return 1920 * value and treat it the same as the result of get_nir_src(). 1921 */ 1922fs_reg 1923fs_visitor::get_nir_src_imm(const nir_src &src) 1924{ 1925 assert(nir_src_bit_size(src) == 32); 1926 return nir_src_is_const(src) ? 1927 fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src); 1928} 1929 1930fs_reg 1931fs_visitor::get_nir_dest(const nir_dest &dest) 1932{ 1933 if (dest.is_ssa) { 1934 const brw_reg_type reg_type = 1935 brw_reg_type_from_bit_size(dest.ssa.bit_size, 1936 dest.ssa.bit_size == 8 ? 1937 BRW_REGISTER_TYPE_D : 1938 BRW_REGISTER_TYPE_F); 1939 nir_ssa_values[dest.ssa.index] = 1940 bld.vgrf(reg_type, dest.ssa.num_components); 1941 return nir_ssa_values[dest.ssa.index]; 1942 } else { 1943 /* We don't handle indirects on locals */ 1944 assert(dest.reg.indirect == NULL); 1945 return offset(nir_locals[dest.reg.reg->index], bld, 1946 dest.reg.base_offset * dest.reg.reg->num_components); 1947 } 1948} 1949 1950void 1951fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst, 1952 unsigned wr_mask) 1953{ 1954 for (unsigned i = 0; i < 4; i++) { 1955 if (!((wr_mask >> i) & 1)) 1956 continue; 1957 1958 fs_inst *new_inst = new(mem_ctx) fs_inst(inst); 1959 new_inst->dst = offset(new_inst->dst, bld, i); 1960 for (unsigned j = 0; j < new_inst->sources; j++) 1961 if (new_inst->src[j].file == VGRF) 1962 new_inst->src[j] = offset(new_inst->src[j], bld, i); 1963 1964 bld.emit(new_inst); 1965 } 1966} 1967 1968static fs_inst * 1969emit_pixel_interpolater_send(const fs_builder &bld, 1970 enum opcode opcode, 1971 const fs_reg &dst, 1972 const fs_reg &src, 1973 const fs_reg &desc, 1974 glsl_interp_mode interpolation) 1975{ 1976 struct brw_wm_prog_data *wm_prog_data = 1977 brw_wm_prog_data(bld.shader->stage_prog_data); 1978 1979 fs_inst *inst = bld.emit(opcode, dst, src, desc); 1980 /* 2 floats per slot returned */ 1981 inst->size_written = 2 * dst.component_size(inst->exec_size); 1982 inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE; 1983 1984 wm_prog_data->pulls_bary = true; 1985 1986 return inst; 1987} 1988 1989/** 1990 * Computes 1 << x, given a D/UD register containing some value x. 
1991 */ 1992static fs_reg 1993intexp2(const fs_builder &bld, const fs_reg &x) 1994{ 1995 assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); 1996 1997 fs_reg result = bld.vgrf(x.type, 1); 1998 fs_reg one = bld.vgrf(x.type, 1); 1999 2000 bld.MOV(one, retype(brw_imm_d(1), one.type)); 2001 bld.SHL(result, one, x); 2002 return result; 2003} 2004 2005void 2006fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src) 2007{ 2008 assert(stage == MESA_SHADER_GEOMETRY); 2009 2010 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2011 2012 if (gs_compile->control_data_header_size_bits == 0) 2013 return; 2014 2015 /* We can only do EndPrimitive() functionality when the control data 2016 * consists of cut bits. Fortunately, the only time it isn't is when the 2017 * output type is points, in which case EndPrimitive() is a no-op. 2018 */ 2019 if (gs_prog_data->control_data_format != 2020 GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { 2021 return; 2022 } 2023 2024 /* Cut bits use one bit per vertex. */ 2025 assert(gs_compile->control_data_bits_per_vertex == 1); 2026 2027 fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 2028 vertex_count.type = BRW_REGISTER_TYPE_UD; 2029 2030 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting 2031 * vertex n, 0 otherwise. So all we need to do here is mark bit 2032 * (vertex_count - 1) % 32 in the cut_bits register to indicate that 2033 * EndPrimitive() was called after emitting vertex (vertex_count - 1); 2034 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. 2035 * 2036 * Note that if EndPrimitive() is called before emitting any vertices, this 2037 * will cause us to set bit 31 of the control_data_bits register to 1. 2038 * That's fine because: 2039 * 2040 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be 2041 * output, so the hardware will ignore cut bit 31. 2042 * 2043 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the 2044 * last vertex, so setting cut bit 31 has no effect (since the primitive 2045 * is automatically ended when the GS terminates). 2046 * 2047 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the 2048 * control_data_bits register to 0 when the first vertex is emitted. 2049 */ 2050 2051 const fs_builder abld = bld.annotate("end primitive"); 2052 2053 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ 2054 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2055 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 2056 fs_reg mask = intexp2(abld, prev_count); 2057 /* Note: we're relying on the fact that the GEN SHL instruction only pays 2058 * attention to the lower 5 bits of its second source argument, so on this 2059 * architecture, 1 << (vertex_count - 1) is equivalent to 1 << 2060 * ((vertex_count - 1) % 32). 2061 */ 2062 abld.OR(this->control_data_bits, this->control_data_bits, mask); 2063} 2064 2065void 2066fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) 2067{ 2068 assert(stage == MESA_SHADER_GEOMETRY); 2069 assert(gs_compile->control_data_bits_per_vertex != 0); 2070 2071 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2072 2073 const fs_builder abld = bld.annotate("emit control data bits"); 2074 const fs_builder fwa_bld = bld.exec_all(); 2075 2076 /* We use a single UD register to accumulate control data bits (32 bits 2077 * for each of the SIMD8 channels). So we need to write a DWord (32 bits) 2078 * at a time. 
2079 * 2080 * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets. 2081 * We have select a 128-bit group via the Global and Per-Slot Offsets, then 2082 * use the Channel Mask phase to enable/disable which DWord within that 2083 * group to write. (Remember, different SIMD8 channels may have emitted 2084 * different numbers of vertices, so we may need per-slot offsets.) 2085 * 2086 * Channel masking presents an annoying problem: we may have to replicate 2087 * the data up to 4 times: 2088 * 2089 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. 2090 * 2091 * To avoid penalizing shaders that emit a small number of vertices, we 2092 * can avoid these sometimes: if the size of the control data header is 2093 * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land 2094 * land in the same 128-bit group, so we can skip per-slot offsets. 2095 * 2096 * Similarly, if the control data header is <= 32 bits, there is only one 2097 * DWord, so we can skip channel masks. 2098 */ 2099 enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; 2100 2101 fs_reg channel_mask, per_slot_offset; 2102 2103 if (gs_compile->control_data_header_size_bits > 32) { 2104 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 2105 channel_mask = vgrf(glsl_type::uint_type); 2106 } 2107 2108 if (gs_compile->control_data_header_size_bits > 128) { 2109 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT; 2110 per_slot_offset = vgrf(glsl_type::uint_type); 2111 } 2112 2113 /* Figure out which DWord we're trying to write to using the formula: 2114 * 2115 * dword_index = (vertex_count - 1) * bits_per_vertex / 32 2116 * 2117 * Since bits_per_vertex is a power of two, and is known at compile 2118 * time, this can be optimized to: 2119 * 2120 * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) 2121 */ 2122 if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) { 2123 fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2124 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2125 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 2126 unsigned log2_bits_per_vertex = 2127 util_last_bit(gs_compile->control_data_bits_per_vertex); 2128 abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex)); 2129 2130 if (per_slot_offset.file != BAD_FILE) { 2131 /* Set the per-slot offset to dword_index / 4, so that we'll write to 2132 * the appropriate OWord within the control data header. 2133 */ 2134 abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u)); 2135 } 2136 2137 /* Set the channel masks to 1 << (dword_index % 4), so that we'll 2138 * write to the appropriate DWORD within the OWORD. 2139 */ 2140 fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2141 fwa_bld.AND(channel, dword_index, brw_imm_ud(3u)); 2142 channel_mask = intexp2(fwa_bld, channel); 2143 /* Then the channel masks need to be in bits 23:16. */ 2144 fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u)); 2145 } 2146 2147 /* Store the control data bits in the message payload and send it. 
*/ 2148 unsigned mlen = 2; 2149 if (channel_mask.file != BAD_FILE) 2150 mlen += 4; /* channel masks, plus 3 extra copies of the data */ 2151 if (per_slot_offset.file != BAD_FILE) 2152 mlen++; 2153 2154 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); 2155 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen); 2156 unsigned i = 0; 2157 sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 2158 if (per_slot_offset.file != BAD_FILE) 2159 sources[i++] = per_slot_offset; 2160 if (channel_mask.file != BAD_FILE) 2161 sources[i++] = channel_mask; 2162 while (i < mlen) { 2163 sources[i++] = this->control_data_bits; 2164 } 2165 2166 abld.LOAD_PAYLOAD(payload, sources, mlen, mlen); 2167 fs_inst *inst = abld.emit(opcode, reg_undef, payload); 2168 inst->mlen = mlen; 2169 /* We need to increment Global Offset by 256-bits to make room for 2170 * Broadwell's extra "Vertex Count" payload at the beginning of the 2171 * URB entry. Since this is an OWord message, Global Offset is counted 2172 * in 128-bit units, so we must set it to 2. 2173 */ 2174 if (gs_prog_data->static_vertex_count == -1) 2175 inst->offset = 2; 2176} 2177 2178void 2179fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count, 2180 unsigned stream_id) 2181{ 2182 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ 2183 2184 /* Note: we are calling this *before* increasing vertex_count, so 2185 * this->vertex_count == vertex_count - 1 in the formula above. 2186 */ 2187 2188 /* Stream mode uses 2 bits per vertex */ 2189 assert(gs_compile->control_data_bits_per_vertex == 2); 2190 2191 /* Must be a valid stream */ 2192 assert(stream_id < MAX_VERTEX_STREAMS); 2193 2194 /* Control data bits are initialized to 0 so we don't have to set any 2195 * bits when sending vertices to stream 0. 2196 */ 2197 if (stream_id == 0) 2198 return; 2199 2200 const fs_builder abld = bld.annotate("set stream control data bits", NULL); 2201 2202 /* reg::sid = stream_id */ 2203 fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2204 abld.MOV(sid, brw_imm_ud(stream_id)); 2205 2206 /* reg:shift_count = 2 * (vertex_count - 1) */ 2207 fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2208 abld.SHL(shift_count, vertex_count, brw_imm_ud(1u)); 2209 2210 /* Note: we're relying on the fact that the GEN SHL instruction only pays 2211 * attention to the lower 5 bits of its second source argument, so on this 2212 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to 2213 * stream_id << ((2 * (vertex_count - 1)) % 32). 2214 */ 2215 fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2216 abld.SHL(mask, sid, shift_count); 2217 abld.OR(this->control_data_bits, this->control_data_bits, mask); 2218} 2219 2220void 2221fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src, 2222 unsigned stream_id) 2223{ 2224 assert(stage == MESA_SHADER_GEOMETRY); 2225 2226 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2227 2228 fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 2229 vertex_count.type = BRW_REGISTER_TYPE_UD; 2230 2231 /* Haswell and later hardware ignores the "Render Stream Select" bits 2232 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, 2233 * and instead sends all primitives down the pipeline for rasterization. 2234 * If the SOL stage is enabled, "Render Stream Select" is honored and 2235 * primitives bound to non-zero streams are discarded after stream output. 
2236 * 2237 * Since the only purpose of primives sent to non-zero streams is to 2238 * be recorded by transform feedback, we can simply discard all geometry 2239 * bound to these streams when transform feedback is disabled. 2240 */ 2241 if (stream_id > 0 && !nir->info.has_transform_feedback_varyings) 2242 return; 2243 2244 /* If we're outputting 32 control data bits or less, then we can wait 2245 * until the shader is over to output them all. Otherwise we need to 2246 * output them as we go. Now is the time to do it, since we're about to 2247 * output the vertex_count'th vertex, so it's guaranteed that the 2248 * control data bits associated with the (vertex_count - 1)th vertex are 2249 * correct. 2250 */ 2251 if (gs_compile->control_data_header_size_bits > 32) { 2252 const fs_builder abld = 2253 bld.annotate("emit vertex: emit control data bits"); 2254 2255 /* Only emit control data bits if we've finished accumulating a batch 2256 * of 32 bits. This is the case when: 2257 * 2258 * (vertex_count * bits_per_vertex) % 32 == 0 2259 * 2260 * (in other words, when the last 5 bits of vertex_count * 2261 * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some 2262 * integer n (which is always the case, since bits_per_vertex is 2263 * always 1 or 2), this is equivalent to requiring that the last 5-n 2264 * bits of vertex_count are 0: 2265 * 2266 * vertex_count & (2^(5-n) - 1) == 0 2267 * 2268 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is 2269 * equivalent to: 2270 * 2271 * vertex_count & (32 / bits_per_vertex - 1) == 0 2272 * 2273 * TODO: If vertex_count is an immediate, we could do some of this math 2274 * at compile time... 2275 */ 2276 fs_inst *inst = 2277 abld.AND(bld.null_reg_d(), vertex_count, 2278 brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u)); 2279 inst->conditional_mod = BRW_CONDITIONAL_Z; 2280 2281 abld.IF(BRW_PREDICATE_NORMAL); 2282 /* If vertex_count is 0, then no control data bits have been 2283 * accumulated yet, so we can skip emitting them. 2284 */ 2285 abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u), 2286 BRW_CONDITIONAL_NEQ); 2287 abld.IF(BRW_PREDICATE_NORMAL); 2288 emit_gs_control_data_bits(vertex_count); 2289 abld.emit(BRW_OPCODE_ENDIF); 2290 2291 /* Reset control_data_bits to 0 so we can start accumulating a new 2292 * batch. 2293 * 2294 * Note: in the case where vertex_count == 0, this neutralizes the 2295 * effect of any call to EndPrimitive() that the shader may have 2296 * made before outputting its first vertex. 2297 */ 2298 inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u)); 2299 inst->force_writemask_all = true; 2300 abld.emit(BRW_OPCODE_ENDIF); 2301 } 2302 2303 emit_urb_writes(vertex_count); 2304 2305 /* In stream mode we have to set control data bits for all vertices 2306 * unless we have disabled control data bits completely (which we do 2307 * do for GL_POINTS outputs that don't use streams). 
2308 */ 2309 if (gs_compile->control_data_header_size_bits > 0 && 2310 gs_prog_data->control_data_format == 2311 GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { 2312 set_gs_stream_control_data_bits(vertex_count, stream_id); 2313 } 2314} 2315 2316void 2317fs_visitor::emit_gs_input_load(const fs_reg &dst, 2318 const nir_src &vertex_src, 2319 unsigned base_offset, 2320 const nir_src &offset_src, 2321 unsigned num_components, 2322 unsigned first_component) 2323{ 2324 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2325 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8; 2326 2327 /* TODO: figure out push input layout for invocations == 1 */ 2328 /* TODO: make this work with 64-bit inputs */ 2329 if (gs_prog_data->invocations == 1 && 2330 type_sz(dst.type) <= 4 && 2331 nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) && 2332 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) { 2333 int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 + 2334 nir_src_as_uint(vertex_src) * push_reg_count; 2335 for (unsigned i = 0; i < num_components; i++) { 2336 bld.MOV(offset(dst, bld, i), 2337 fs_reg(ATTR, imm_offset + i + first_component, dst.type)); 2338 } 2339 return; 2340 } 2341 2342 /* Resort to the pull model. Ensure the VUE handles are provided. */ 2343 assert(gs_prog_data->base.include_vue_handles); 2344 2345 unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2; 2346 fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2347 2348 if (gs_prog_data->invocations == 1) { 2349 if (nir_src_is_const(vertex_src)) { 2350 /* The vertex index is constant; just select the proper URB handle. */ 2351 icp_handle = 2352 retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0), 2353 BRW_REGISTER_TYPE_UD); 2354 } else { 2355 /* The vertex index is non-constant. We need to use indirect 2356 * addressing to fetch the proper URB handle. 2357 * 2358 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> 2359 * indicating that channel <n> should read the handle from 2360 * DWord <n>. We convert that to bytes by multiplying by 4. 2361 * 2362 * Next, we convert the vertex index to bytes by multiplying 2363 * by 32 (shifting by 5), and add the two together. This is 2364 * the final indirect byte offset. 2365 */ 2366 fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1); 2367 fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2368 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2369 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2370 2371 /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */ 2372 bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210))); 2373 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ 2374 bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); 2375 /* Convert vertex_index to bytes (multiply by 32) */ 2376 bld.SHL(vertex_offset_bytes, 2377 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2378 brw_imm_ud(5u)); 2379 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); 2380 2381 /* Use first_icp_handle as the base offset. There is one register 2382 * of URB handles per vertex, so inform the register allocator that 2383 * we might read up to nir->info.gs.vertices_in registers. 
2384 */ 2385 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2386 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2387 fs_reg(icp_offset_bytes), 2388 brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE)); 2389 } 2390 } else { 2391 assert(gs_prog_data->invocations > 1); 2392 2393 if (nir_src_is_const(vertex_src)) { 2394 unsigned vertex = nir_src_as_uint(vertex_src); 2395 assert(devinfo->gen >= 9 || vertex <= 5); 2396 bld.MOV(icp_handle, 2397 retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8), 2398 BRW_REGISTER_TYPE_UD)); 2399 } else { 2400 /* The vertex index is non-constant. We need to use indirect 2401 * addressing to fetch the proper URB handle. 2402 * 2403 */ 2404 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2405 2406 /* Convert vertex_index to bytes (multiply by 4) */ 2407 bld.SHL(icp_offset_bytes, 2408 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2409 brw_imm_ud(2u)); 2410 2411 /* Use first_icp_handle as the base offset. There is one DWord 2412 * of URB handles per vertex, so inform the register allocator that 2413 * we might read up to ceil(nir->info.gs.vertices_in / 8) registers. 2414 */ 2415 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2416 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2417 fs_reg(icp_offset_bytes), 2418 brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) * 2419 REG_SIZE)); 2420 } 2421 } 2422 2423 fs_inst *inst; 2424 2425 fs_reg tmp_dst = dst; 2426 fs_reg indirect_offset = get_nir_src(offset_src); 2427 unsigned num_iterations = 1; 2428 unsigned orig_num_components = num_components; 2429 2430 if (type_sz(dst.type) == 8) { 2431 if (num_components > 2) { 2432 num_iterations = 2; 2433 num_components = 2; 2434 } 2435 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); 2436 tmp_dst = tmp; 2437 first_component = first_component / 2; 2438 } 2439 2440 for (unsigned iter = 0; iter < num_iterations; iter++) { 2441 if (nir_src_is_const(offset_src)) { 2442 /* Constant indexing - use global offset. */ 2443 if (first_component != 0) { 2444 unsigned read_components = num_components + first_component; 2445 fs_reg tmp = bld.vgrf(dst.type, read_components); 2446 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); 2447 inst->size_written = read_components * 2448 tmp.component_size(inst->exec_size); 2449 for (unsigned i = 0; i < num_components; i++) { 2450 bld.MOV(offset(tmp_dst, bld, i), 2451 offset(tmp, bld, i + first_component)); 2452 } 2453 } else { 2454 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst, 2455 icp_handle); 2456 inst->size_written = num_components * 2457 tmp_dst.component_size(inst->exec_size); 2458 } 2459 inst->offset = base_offset + nir_src_as_uint(offset_src); 2460 inst->mlen = 1; 2461 } else { 2462 /* Indirect indexing - use per-slot offsets as well. 
*/ 2463 const fs_reg srcs[] = { icp_handle, indirect_offset }; 2464 unsigned read_components = num_components + first_component; 2465 fs_reg tmp = bld.vgrf(dst.type, read_components); 2466 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2467 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2468 if (first_component != 0) { 2469 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2470 payload); 2471 inst->size_written = read_components * 2472 tmp.component_size(inst->exec_size); 2473 for (unsigned i = 0; i < num_components; i++) { 2474 bld.MOV(offset(tmp_dst, bld, i), 2475 offset(tmp, bld, i + first_component)); 2476 } 2477 } else { 2478 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst, 2479 payload); 2480 inst->size_written = num_components * 2481 tmp_dst.component_size(inst->exec_size); 2482 } 2483 inst->offset = base_offset; 2484 inst->mlen = 2; 2485 } 2486 2487 if (type_sz(dst.type) == 8) { 2488 shuffle_from_32bit_read(bld, 2489 offset(dst, bld, iter * 2), 2490 retype(tmp_dst, BRW_REGISTER_TYPE_D), 2491 0, 2492 num_components); 2493 } 2494 2495 if (num_iterations > 1) { 2496 num_components = orig_num_components - 2; 2497 if(nir_src_is_const(offset_src)) { 2498 base_offset++; 2499 } else { 2500 fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2501 bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u)); 2502 indirect_offset = new_indirect; 2503 } 2504 } 2505 } 2506} 2507 2508fs_reg 2509fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) 2510{ 2511 nir_src *offset_src = nir_get_io_offset_src(instr); 2512 2513 if (nir_src_is_const(*offset_src)) { 2514 /* The only constant offset we should find is 0. brw_nir.c's 2515 * add_const_offset_to_base() will fold other constant offsets 2516 * into instr->const_index[0]. 2517 */ 2518 assert(nir_src_as_uint(*offset_src) == 0); 2519 return fs_reg(); 2520 } 2521 2522 return get_nir_src(*offset_src); 2523} 2524 2525void 2526fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, 2527 nir_intrinsic_instr *instr) 2528{ 2529 assert(stage == MESA_SHADER_VERTEX); 2530 2531 fs_reg dest; 2532 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2533 dest = get_nir_dest(instr->dest); 2534 2535 switch (instr->intrinsic) { 2536 case nir_intrinsic_load_vertex_id: 2537 case nir_intrinsic_load_base_vertex: 2538 unreachable("should be lowered by nir_lower_system_values()"); 2539 2540 case nir_intrinsic_load_input: { 2541 fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type); 2542 unsigned first_component = nir_intrinsic_component(instr); 2543 unsigned num_components = instr->num_components; 2544 2545 src = offset(src, bld, nir_src_as_uint(instr->src[0])); 2546 2547 if (type_sz(dest.type) == 8) 2548 first_component /= 2; 2549 2550 /* For 16-bit support maybe a temporary will be needed to copy from 2551 * the ATTR file. 
2552 */ 2553 shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D), 2554 first_component, num_components); 2555 break; 2556 } 2557 2558 case nir_intrinsic_load_vertex_id_zero_base: 2559 case nir_intrinsic_load_instance_id: 2560 case nir_intrinsic_load_base_instance: 2561 case nir_intrinsic_load_draw_id: 2562 case nir_intrinsic_load_first_vertex: 2563 case nir_intrinsic_load_is_indexed_draw: 2564 unreachable("lowered by brw_nir_lower_vs_inputs"); 2565 2566 default: 2567 nir_emit_intrinsic(bld, instr); 2568 break; 2569 } 2570} 2571 2572void 2573fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, 2574 nir_intrinsic_instr *instr) 2575{ 2576 assert(stage == MESA_SHADER_TESS_CTRL); 2577 struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; 2578 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2579 2580 fs_reg dst; 2581 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2582 dst = get_nir_dest(instr->dest); 2583 2584 switch (instr->intrinsic) { 2585 case nir_intrinsic_load_primitive_id: 2586 bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1))); 2587 break; 2588 case nir_intrinsic_load_invocation_id: 2589 bld.MOV(retype(dst, invocation_id.type), invocation_id); 2590 break; 2591 case nir_intrinsic_load_patch_vertices_in: 2592 bld.MOV(retype(dst, BRW_REGISTER_TYPE_D), 2593 brw_imm_d(tcs_key->input_vertices)); 2594 break; 2595 2596 case nir_intrinsic_barrier: { 2597 if (tcs_prog_data->instances == 1) 2598 break; 2599 2600 fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2601 fs_reg m0_2 = component(m0, 2); 2602 2603 const fs_builder chanbld = bld.exec_all().group(1, 0); 2604 2605 /* Zero the message header */ 2606 bld.exec_all().MOV(m0, brw_imm_ud(0u)); 2607 2608 if (devinfo->gen < 11) { 2609 /* Copy "Barrier ID" from r0.2, bits 16:13 */ 2610 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2611 brw_imm_ud(INTEL_MASK(16, 13))); 2612 2613 /* Shift it up to bits 27:24. */ 2614 chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); 2615 } else { 2616 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2617 brw_imm_ud(INTEL_MASK(30, 24))); 2618 } 2619 2620 /* Set the Barrier Count and the enable bit */ 2621 if (devinfo->gen < 11) { 2622 chanbld.OR(m0_2, m0_2, 2623 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); 2624 } else { 2625 chanbld.OR(m0_2, m0_2, 2626 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15))); 2627 } 2628 2629 bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); 2630 break; 2631 } 2632 2633 case nir_intrinsic_load_input: 2634 unreachable("nir_lower_io should never give us these."); 2635 break; 2636 2637 case nir_intrinsic_load_per_vertex_input: { 2638 fs_reg indirect_offset = get_indirect_offset(instr); 2639 unsigned imm_offset = instr->const_index[0]; 2640 2641 const nir_src &vertex_src = instr->src[0]; 2642 2643 fs_inst *inst; 2644 2645 fs_reg icp_handle; 2646 2647 if (nir_src_is_const(vertex_src)) { 2648 /* Emit a MOV to resolve <0,1,0> regioning. */ 2649 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2650 unsigned vertex = nir_src_as_uint(vertex_src); 2651 bld.MOV(icp_handle, 2652 retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7), 2653 BRW_REGISTER_TYPE_UD)); 2654 } else if (tcs_prog_data->instances == 1 && 2655 nir_src_as_intrinsic(vertex_src) != NULL && 2656 nir_src_as_intrinsic(vertex_src)->intrinsic == nir_intrinsic_load_invocation_id) { 2657 /* For the common case of only 1 instance, an array index of 2658 * gl_InvocationID means reading g1. Skip all the indirect work. 
2659 */ 2660 icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); 2661 } else { 2662 /* The vertex index is non-constant. We need to use indirect 2663 * addressing to fetch the proper URB handle. 2664 */ 2665 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2666 2667 /* Each ICP handle is a single DWord (4 bytes) */ 2668 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2669 bld.SHL(vertex_offset_bytes, 2670 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2671 brw_imm_ud(2u)); 2672 2673 /* Start at g1. We might read up to 4 registers. */ 2674 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2675 retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes, 2676 brw_imm_ud(4 * REG_SIZE)); 2677 } 2678 2679 /* We can only read two double components with each URB read, so 2680 * we send two read messages in that case, each one loading up to 2681 * two double components. 2682 */ 2683 unsigned num_iterations = 1; 2684 unsigned num_components = instr->num_components; 2685 unsigned first_component = nir_intrinsic_component(instr); 2686 fs_reg orig_dst = dst; 2687 if (type_sz(dst.type) == 8) { 2688 first_component = first_component / 2; 2689 if (instr->num_components > 2) { 2690 num_iterations = 2; 2691 num_components = 2; 2692 } 2693 2694 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); 2695 dst = tmp; 2696 } 2697 2698 for (unsigned iter = 0; iter < num_iterations; iter++) { 2699 if (indirect_offset.file == BAD_FILE) { 2700 /* Constant indexing - use global offset. */ 2701 if (first_component != 0) { 2702 unsigned read_components = num_components + first_component; 2703 fs_reg tmp = bld.vgrf(dst.type, read_components); 2704 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); 2705 for (unsigned i = 0; i < num_components; i++) { 2706 bld.MOV(offset(dst, bld, i), 2707 offset(tmp, bld, i + first_component)); 2708 } 2709 } else { 2710 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); 2711 } 2712 inst->offset = imm_offset; 2713 inst->mlen = 1; 2714 } else { 2715 /* Indirect indexing - use per-slot offsets as well. */ 2716 const fs_reg srcs[] = { icp_handle, indirect_offset }; 2717 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2718 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2719 if (first_component != 0) { 2720 unsigned read_components = num_components + first_component; 2721 fs_reg tmp = bld.vgrf(dst.type, read_components); 2722 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2723 payload); 2724 for (unsigned i = 0; i < num_components; i++) { 2725 bld.MOV(offset(dst, bld, i), 2726 offset(tmp, bld, i + first_component)); 2727 } 2728 } else { 2729 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 2730 payload); 2731 } 2732 inst->offset = imm_offset; 2733 inst->mlen = 2; 2734 } 2735 inst->size_written = (num_components + first_component) * 2736 inst->dst.component_size(inst->exec_size); 2737 2738 /* If we are reading 64-bit data using 32-bit read messages we need 2739 * build proper 64-bit data elements by shuffling the low and high 2740 * 32-bit components around like we do for other things like UBOs 2741 * or SSBOs. 2742 */ 2743 if (type_sz(dst.type) == 8) { 2744 shuffle_from_32bit_read(bld, 2745 offset(orig_dst, bld, iter * 2), 2746 retype(dst, BRW_REGISTER_TYPE_D), 2747 0, num_components); 2748 } 2749 2750 /* Copy the temporary to the destination to deal with writemasking. 2751 * 2752 * Also attempt to deal with gl_PointSize being in the .w component. 
2753 */ 2754 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { 2755 assert(type_sz(dst.type) < 8); 2756 inst->dst = bld.vgrf(dst.type, 4); 2757 inst->size_written = 4 * REG_SIZE; 2758 bld.MOV(dst, offset(inst->dst, bld, 3)); 2759 } 2760 2761 /* If we are loading double data and we need a second read message 2762 * adjust the write offset 2763 */ 2764 if (num_iterations > 1) { 2765 num_components = instr->num_components - 2; 2766 imm_offset++; 2767 } 2768 } 2769 break; 2770 } 2771 2772 case nir_intrinsic_load_output: 2773 case nir_intrinsic_load_per_vertex_output: { 2774 fs_reg indirect_offset = get_indirect_offset(instr); 2775 unsigned imm_offset = instr->const_index[0]; 2776 unsigned first_component = nir_intrinsic_component(instr); 2777 2778 fs_inst *inst; 2779 if (indirect_offset.file == BAD_FILE) { 2780 /* Replicate the patch handle to all enabled channels */ 2781 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2782 bld.MOV(patch_handle, 2783 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); 2784 2785 { 2786 if (first_component != 0) { 2787 unsigned read_components = 2788 instr->num_components + first_component; 2789 fs_reg tmp = bld.vgrf(dst.type, read_components); 2790 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, 2791 patch_handle); 2792 inst->size_written = read_components * REG_SIZE; 2793 for (unsigned i = 0; i < instr->num_components; i++) { 2794 bld.MOV(offset(dst, bld, i), 2795 offset(tmp, bld, i + first_component)); 2796 } 2797 } else { 2798 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, 2799 patch_handle); 2800 inst->size_written = instr->num_components * REG_SIZE; 2801 } 2802 inst->offset = imm_offset; 2803 inst->mlen = 1; 2804 } 2805 } else { 2806 /* Indirect indexing - use per-slot offsets as well. */ 2807 const fs_reg srcs[] = { 2808 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2809 indirect_offset 2810 }; 2811 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2812 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2813 if (first_component != 0) { 2814 unsigned read_components = 2815 instr->num_components + first_component; 2816 fs_reg tmp = bld.vgrf(dst.type, read_components); 2817 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2818 payload); 2819 inst->size_written = read_components * REG_SIZE; 2820 for (unsigned i = 0; i < instr->num_components; i++) { 2821 bld.MOV(offset(dst, bld, i), 2822 offset(tmp, bld, i + first_component)); 2823 } 2824 } else { 2825 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 2826 payload); 2827 inst->size_written = instr->num_components * REG_SIZE; 2828 } 2829 inst->offset = imm_offset; 2830 inst->mlen = 2; 2831 } 2832 break; 2833 } 2834 2835 case nir_intrinsic_store_output: 2836 case nir_intrinsic_store_per_vertex_output: { 2837 fs_reg value = get_nir_src(instr->src[0]); 2838 bool is_64bit = (instr->src[0].is_ssa ? 
2839 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64; 2840 fs_reg indirect_offset = get_indirect_offset(instr); 2841 unsigned imm_offset = instr->const_index[0]; 2842 unsigned mask = instr->const_index[1]; 2843 unsigned header_regs = 0; 2844 fs_reg srcs[7]; 2845 srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); 2846 2847 if (indirect_offset.file != BAD_FILE) { 2848 srcs[header_regs++] = indirect_offset; 2849 } 2850 2851 if (mask == 0) 2852 break; 2853 2854 unsigned num_components = util_last_bit(mask); 2855 enum opcode opcode; 2856 2857 /* We can only pack two 64-bit components in a single message, so send 2858 * 2 messages if we have more components 2859 */ 2860 unsigned num_iterations = 1; 2861 unsigned iter_components = num_components; 2862 unsigned first_component = nir_intrinsic_component(instr); 2863 if (is_64bit) { 2864 first_component = first_component / 2; 2865 if (instr->num_components > 2) { 2866 num_iterations = 2; 2867 iter_components = 2; 2868 } 2869 } 2870 2871 mask = mask << first_component; 2872 2873 for (unsigned iter = 0; iter < num_iterations; iter++) { 2874 if (!is_64bit && mask != WRITEMASK_XYZW) { 2875 srcs[header_regs++] = brw_imm_ud(mask << 16); 2876 opcode = indirect_offset.file != BAD_FILE ? 2877 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : 2878 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 2879 } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) { 2880 /* Expand the 64-bit mask to 32-bit channels. We only handle 2881 * two channels in each iteration, so we only care about X/Y. 2882 */ 2883 unsigned mask32 = 0; 2884 if (mask & WRITEMASK_X) 2885 mask32 |= WRITEMASK_XY; 2886 if (mask & WRITEMASK_Y) 2887 mask32 |= WRITEMASK_ZW; 2888 2889 /* If the mask does not include any of the channels X or Y there 2890 * is nothing to do in this iteration. Move on to the next couple 2891 * of 64-bit channels. 2892 */ 2893 if (!mask32) { 2894 mask >>= 2; 2895 imm_offset++; 2896 continue; 2897 } 2898 2899 srcs[header_regs++] = brw_imm_ud(mask32 << 16); 2900 opcode = indirect_offset.file != BAD_FILE ? 2901 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : 2902 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 2903 } else { 2904 opcode = indirect_offset.file != BAD_FILE ? 2905 SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT : 2906 SHADER_OPCODE_URB_WRITE_SIMD8; 2907 } 2908 2909 for (unsigned i = 0; i < iter_components; i++) { 2910 if (!(mask & (1 << (i + first_component)))) 2911 continue; 2912 2913 if (!is_64bit) { 2914 srcs[header_regs + i + first_component] = offset(value, bld, i); 2915 } else { 2916 /* We need to shuffle the 64-bit data to match the layout 2917 * expected by our 32-bit URB write messages. We use a temporary 2918 * for that. 2919 */ 2920 unsigned channel = iter * 2 + i; 2921 fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1); 2922 2923 srcs[header_regs + (i + first_component) * 2] = dest; 2924 srcs[header_regs + (i + first_component) * 2 + 1] = 2925 offset(dest, bld, 1); 2926 } 2927 } 2928 2929 unsigned mlen = 2930 header_regs + (is_64bit ? 2 * iter_components : iter_components) + 2931 (is_64bit ? 2 * first_component : first_component); 2932 fs_reg payload = 2933 bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); 2934 bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs); 2935 2936 fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload); 2937 inst->offset = imm_offset; 2938 inst->mlen = mlen; 2939 2940 /* If this is a 64-bit attribute, select the next two 64-bit channels 2941 * to be handled in the next iteration. 
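          *
          * For example, a dvec4 store with a .yw writemask takes two
          * iterations: the first handles the original .y (emitted as a
          * 32-bit ZW mask), then the shift below turns the remaining bit
          * back into .y so the second iteration handles the original .w,
          * one vec4 slot further on.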
2942 */ 2943 if (is_64bit) { 2944 mask >>= 2; 2945 imm_offset++; 2946 } 2947 } 2948 break; 2949 } 2950 2951 default: 2952 nir_emit_intrinsic(bld, instr); 2953 break; 2954 } 2955} 2956 2957void 2958fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, 2959 nir_intrinsic_instr *instr) 2960{ 2961 assert(stage == MESA_SHADER_TESS_EVAL); 2962 struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data); 2963 2964 fs_reg dest; 2965 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2966 dest = get_nir_dest(instr->dest); 2967 2968 switch (instr->intrinsic) { 2969 case nir_intrinsic_load_primitive_id: 2970 bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1))); 2971 break; 2972 case nir_intrinsic_load_tess_coord: 2973 /* gl_TessCoord is part of the payload in g1-3 */ 2974 for (unsigned i = 0; i < 3; i++) { 2975 bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0))); 2976 } 2977 break; 2978 2979 case nir_intrinsic_load_input: 2980 case nir_intrinsic_load_per_vertex_input: { 2981 fs_reg indirect_offset = get_indirect_offset(instr); 2982 unsigned imm_offset = instr->const_index[0]; 2983 unsigned first_component = nir_intrinsic_component(instr); 2984 2985 if (type_sz(dest.type) == 8) { 2986 first_component = first_component / 2; 2987 } 2988 2989 fs_inst *inst; 2990 if (indirect_offset.file == BAD_FILE) { 2991 /* Arbitrarily only push up to 32 vec4 slots worth of data, 2992 * which is 16 registers (since each holds 2 vec4 slots). 2993 */ 2994 unsigned slot_count = 1; 2995 if (type_sz(dest.type) == 8 && instr->num_components > 2) 2996 slot_count++; 2997 2998 const unsigned max_push_slots = 32; 2999 if (imm_offset + slot_count <= max_push_slots) { 3000 fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type); 3001 for (int i = 0; i < instr->num_components; i++) { 3002 unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) + 3003 i + first_component; 3004 bld.MOV(offset(dest, bld, i), component(src, comp)); 3005 } 3006 3007 tes_prog_data->base.urb_read_length = 3008 MAX2(tes_prog_data->base.urb_read_length, 3009 DIV_ROUND_UP(imm_offset + slot_count, 2)); 3010 } else { 3011 /* Replicate the patch handle to all enabled channels */ 3012 const fs_reg srcs[] = { 3013 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD) 3014 }; 3015 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 3016 bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0); 3017 3018 if (first_component != 0) { 3019 unsigned read_components = 3020 instr->num_components + first_component; 3021 fs_reg tmp = bld.vgrf(dest.type, read_components); 3022 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, 3023 patch_handle); 3024 inst->size_written = read_components * REG_SIZE; 3025 for (unsigned i = 0; i < instr->num_components; i++) { 3026 bld.MOV(offset(dest, bld, i), 3027 offset(tmp, bld, i + first_component)); 3028 } 3029 } else { 3030 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, 3031 patch_handle); 3032 inst->size_written = instr->num_components * REG_SIZE; 3033 } 3034 inst->mlen = 1; 3035 inst->offset = imm_offset; 3036 } 3037 } else { 3038 /* Indirect indexing - use per-slot offsets as well. */ 3039 3040 /* We can only read two double components with each URB read, so 3041 * we send two read messages in that case, each one loading up to 3042 * two double components. 
3043 */ 3044 unsigned num_iterations = 1; 3045 unsigned num_components = instr->num_components; 3046 fs_reg orig_dest = dest; 3047 if (type_sz(dest.type) == 8) { 3048 if (instr->num_components > 2) { 3049 num_iterations = 2; 3050 num_components = 2; 3051 } 3052 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type); 3053 dest = tmp; 3054 } 3055 3056 for (unsigned iter = 0; iter < num_iterations; iter++) { 3057 const fs_reg srcs[] = { 3058 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 3059 indirect_offset 3060 }; 3061 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 3062 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 3063 3064 if (first_component != 0) { 3065 unsigned read_components = 3066 num_components + first_component; 3067 fs_reg tmp = bld.vgrf(dest.type, read_components); 3068 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 3069 payload); 3070 for (unsigned i = 0; i < num_components; i++) { 3071 bld.MOV(offset(dest, bld, i), 3072 offset(tmp, bld, i + first_component)); 3073 } 3074 } else { 3075 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, 3076 payload); 3077 } 3078 inst->mlen = 2; 3079 inst->offset = imm_offset; 3080 inst->size_written = (num_components + first_component) * 3081 inst->dst.component_size(inst->exec_size); 3082 3083 /* If we are reading 64-bit data using 32-bit read messages we need 3084 * build proper 64-bit data elements by shuffling the low and high 3085 * 32-bit components around like we do for other things like UBOs 3086 * or SSBOs. 3087 */ 3088 if (type_sz(dest.type) == 8) { 3089 shuffle_from_32bit_read(bld, 3090 offset(orig_dest, bld, iter * 2), 3091 retype(dest, BRW_REGISTER_TYPE_D), 3092 0, num_components); 3093 } 3094 3095 /* If we are loading double data and we need a second read message 3096 * adjust the offset 3097 */ 3098 if (num_iterations > 1) { 3099 num_components = instr->num_components - 2; 3100 imm_offset++; 3101 } 3102 } 3103 } 3104 break; 3105 } 3106 default: 3107 nir_emit_intrinsic(bld, instr); 3108 break; 3109 } 3110} 3111 3112void 3113fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, 3114 nir_intrinsic_instr *instr) 3115{ 3116 assert(stage == MESA_SHADER_GEOMETRY); 3117 fs_reg indirect_offset; 3118 3119 fs_reg dest; 3120 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3121 dest = get_nir_dest(instr->dest); 3122 3123 switch (instr->intrinsic) { 3124 case nir_intrinsic_load_primitive_id: 3125 assert(stage == MESA_SHADER_GEOMETRY); 3126 assert(brw_gs_prog_data(prog_data)->include_primitive_id); 3127 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 3128 retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD)); 3129 break; 3130 3131 case nir_intrinsic_load_input: 3132 unreachable("load_input intrinsics are invalid for the GS stage"); 3133 3134 case nir_intrinsic_load_per_vertex_input: 3135 emit_gs_input_load(dest, instr->src[0], instr->const_index[0], 3136 instr->src[1], instr->num_components, 3137 nir_intrinsic_component(instr)); 3138 break; 3139 3140 case nir_intrinsic_emit_vertex_with_counter: 3141 emit_gs_vertex(instr->src[0], instr->const_index[0]); 3142 break; 3143 3144 case nir_intrinsic_end_primitive_with_counter: 3145 emit_gs_end_primitive(instr->src[0]); 3146 break; 3147 3148 case nir_intrinsic_set_vertex_count: 3149 bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0])); 3150 break; 3151 3152 case nir_intrinsic_load_invocation_id: { 3153 fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; 3154 assert(val.file != BAD_FILE); 3155 dest.type = val.type; 3156 bld.MOV(dest, 
val); 3157 break; 3158 } 3159 3160 default: 3161 nir_emit_intrinsic(bld, instr); 3162 break; 3163 } 3164} 3165 3166/** 3167 * Fetch the current render target layer index. 3168 */ 3169static fs_reg 3170fetch_render_target_array_index(const fs_builder &bld) 3171{ 3172 if (bld.shader->devinfo->gen >= 6) { 3173 /* The render target array index is provided in the thread payload as 3174 * bits 26:16 of r0.0. 3175 */ 3176 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); 3177 bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1), 3178 brw_imm_uw(0x7ff)); 3179 return idx; 3180 } else { 3181 /* Pre-SNB we only ever render into the first layer of the framebuffer 3182 * since layered rendering is not implemented. 3183 */ 3184 return brw_imm_ud(0); 3185 } 3186} 3187 3188/** 3189 * Fake non-coherent framebuffer read implemented using TXF to fetch from the 3190 * framebuffer at the current fragment coordinates and sample index. 3191 */ 3192fs_inst * 3193fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, 3194 unsigned target) 3195{ 3196 const struct gen_device_info *devinfo = bld.shader->devinfo; 3197 3198 assert(bld.shader->stage == MESA_SHADER_FRAGMENT); 3199 const brw_wm_prog_key *wm_key = 3200 reinterpret_cast<const brw_wm_prog_key *>(key); 3201 assert(!wm_key->coherent_fb_fetch); 3202 const struct brw_wm_prog_data *wm_prog_data = 3203 brw_wm_prog_data(stage_prog_data); 3204 3205 /* Calculate the surface index relative to the start of the texture binding 3206 * table block, since that's what the texturing messages expect. 3207 */ 3208 const unsigned surface = target + 3209 wm_prog_data->binding_table.render_target_read_start - 3210 wm_prog_data->base.binding_table.texture_start; 3211 3212 /* Calculate the fragment coordinates. */ 3213 const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); 3214 bld.MOV(offset(coords, bld, 0), pixel_x); 3215 bld.MOV(offset(coords, bld, 1), pixel_y); 3216 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld)); 3217 3218 /* Calculate the sample index and MCS payload when multisampling. Luckily 3219 * the MCS fetch message behaves deterministically for UMS surfaces, so it 3220 * shouldn't be necessary to recompile based on whether the framebuffer is 3221 * CMS or UMS. 3222 */ 3223 if (wm_key->multisample_fbo && 3224 nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) 3225 nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup(); 3226 3227 const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; 3228 const fs_reg mcs = wm_key->multisample_fbo ? 3229 emit_mcs_fetch(coords, 3, brw_imm_ud(surface), fs_reg()) : fs_reg(); 3230 3231 /* Use either a normal or a CMS texel fetch message depending on whether 3232 * the framebuffer is single or multisample. On SKL+ use the wide CMS 3233 * message just in case the framebuffer uses 16x multisampling, it should 3234 * be equivalent to the normal CMS fetch for lower multisampling modes. 3235 */ 3236 const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL : 3237 devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL : 3238 SHADER_OPCODE_TXF_CMS_LOGICAL; 3239 3240 /* Emit the instruction. 
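    *
    * The sources assembled below amount to a texel fetch of the bound render
    * target: (x, y, layer) coordinates, LOD forced to 0, and the computed
    * binding table slot presented as the texture surface.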
*/ 3241 fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 3242 srcs[TEX_LOGICAL_SRC_COORDINATE] = coords; 3243 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0); 3244 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample; 3245 srcs[TEX_LOGICAL_SRC_MCS] = mcs; 3246 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface); 3247 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); 3248 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3); 3249 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0); 3250 3251 fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); 3252 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3253 3254 return inst; 3255} 3256 3257/** 3258 * Actual coherent framebuffer read implemented using the native render target 3259 * read message. Requires SKL+. 3260 */ 3261static fs_inst * 3262emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target) 3263{ 3264 assert(bld.shader->devinfo->gen >= 9); 3265 fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst); 3266 inst->target = target; 3267 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3268 3269 return inst; 3270} 3271 3272static fs_reg 3273alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n) 3274{ 3275 if (n && regs[0].file != BAD_FILE) { 3276 return regs[0]; 3277 3278 } else { 3279 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size); 3280 3281 for (unsigned i = 0; i < n; i++) 3282 regs[i] = tmp; 3283 3284 return tmp; 3285 } 3286} 3287 3288static fs_reg 3289alloc_frag_output(fs_visitor *v, unsigned location) 3290{ 3291 assert(v->stage == MESA_SHADER_FRAGMENT); 3292 const brw_wm_prog_key *const key = 3293 reinterpret_cast<const brw_wm_prog_key *>(v->key); 3294 const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION); 3295 const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX); 3296 3297 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1)) 3298 return alloc_temporary(v->bld, 4, &v->dual_src_output, 1); 3299 3300 else if (l == FRAG_RESULT_COLOR) 3301 return alloc_temporary(v->bld, 4, v->outputs, 3302 MAX2(key->nr_color_regions, 1)); 3303 3304 else if (l == FRAG_RESULT_DEPTH) 3305 return alloc_temporary(v->bld, 1, &v->frag_depth, 1); 3306 3307 else if (l == FRAG_RESULT_STENCIL) 3308 return alloc_temporary(v->bld, 1, &v->frag_stencil, 1); 3309 3310 else if (l == FRAG_RESULT_SAMPLE_MASK) 3311 return alloc_temporary(v->bld, 1, &v->sample_mask, 1); 3312 3313 else if (l >= FRAG_RESULT_DATA0 && 3314 l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS) 3315 return alloc_temporary(v->bld, 4, 3316 &v->outputs[l - FRAG_RESULT_DATA0], 1); 3317 3318 else 3319 unreachable("Invalid location"); 3320} 3321 3322void 3323fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, 3324 nir_intrinsic_instr *instr) 3325{ 3326 assert(stage == MESA_SHADER_FRAGMENT); 3327 3328 fs_reg dest; 3329 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3330 dest = get_nir_dest(instr->dest); 3331 3332 switch (instr->intrinsic) { 3333 case nir_intrinsic_load_front_face: 3334 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), 3335 *emit_frontfacing_interpolation()); 3336 break; 3337 3338 case nir_intrinsic_load_sample_pos: { 3339 fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; 3340 assert(sample_pos.file != BAD_FILE); 3341 dest.type = sample_pos.type; 3342 bld.MOV(dest, sample_pos); 3343 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); 3344 break; 3345 } 3346 3347 case nir_intrinsic_load_layer_id: 3348 dest.type = BRW_REGISTER_TYPE_UD; 3349 bld.MOV(dest, 
fetch_render_target_array_index(bld)); 3350 break; 3351 3352 case nir_intrinsic_load_helper_invocation: 3353 case nir_intrinsic_load_sample_mask_in: 3354 case nir_intrinsic_load_sample_id: { 3355 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3356 fs_reg val = nir_system_values[sv]; 3357 assert(val.file != BAD_FILE); 3358 dest.type = val.type; 3359 bld.MOV(dest, val); 3360 break; 3361 } 3362 3363 case nir_intrinsic_store_output: { 3364 const fs_reg src = get_nir_src(instr->src[0]); 3365 const unsigned store_offset = nir_src_as_uint(instr->src[1]); 3366 const unsigned location = nir_intrinsic_base(instr) + 3367 SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION); 3368 const fs_reg new_dest = retype(alloc_frag_output(this, location), 3369 src.type); 3370 3371 for (unsigned j = 0; j < instr->num_components; j++) 3372 bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j), 3373 offset(src, bld, j)); 3374 3375 break; 3376 } 3377 3378 case nir_intrinsic_load_output: { 3379 const unsigned l = GET_FIELD(nir_intrinsic_base(instr), 3380 BRW_NIR_FRAG_OUTPUT_LOCATION); 3381 assert(l >= FRAG_RESULT_DATA0); 3382 const unsigned load_offset = nir_src_as_uint(instr->src[0]); 3383 const unsigned target = l - FRAG_RESULT_DATA0 + load_offset; 3384 const fs_reg tmp = bld.vgrf(dest.type, 4); 3385 3386 if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch) 3387 emit_coherent_fb_read(bld, tmp, target); 3388 else 3389 emit_non_coherent_fb_read(bld, tmp, target); 3390 3391 for (unsigned j = 0; j < instr->num_components; j++) { 3392 bld.MOV(offset(dest, bld, j), 3393 offset(tmp, bld, nir_intrinsic_component(instr) + j)); 3394 } 3395 3396 break; 3397 } 3398 3399 case nir_intrinsic_discard: 3400 case nir_intrinsic_discard_if: { 3401 /* We track our discarded pixels in f0.1. By predicating on it, we can 3402 * update just the flag bits that aren't yet discarded. If there's no 3403 * condition, we emit a CMP of g0 != g0, so all currently executing 3404 * channels will get turned off. 3405 */ 3406 fs_inst *cmp; 3407 if (instr->intrinsic == nir_intrinsic_discard_if) { 3408 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]), 3409 brw_imm_d(0), BRW_CONDITIONAL_Z); 3410 } else { 3411 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), 3412 BRW_REGISTER_TYPE_UW)); 3413 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ); 3414 } 3415 cmp->predicate = BRW_PREDICATE_NORMAL; 3416 cmp->flag_subreg = 1; 3417 3418 if (devinfo->gen >= 6) { 3419 emit_discard_jump(); 3420 } 3421 3422 limit_dispatch_width(16, "Fragment discard not implemented in SIMD32 mode."); 3423 break; 3424 } 3425 3426 case nir_intrinsic_load_input: { 3427 /* load_input is only used for flat inputs */ 3428 unsigned base = nir_intrinsic_base(instr); 3429 unsigned comp = nir_intrinsic_component(instr); 3430 unsigned num_components = instr->num_components; 3431 fs_reg orig_dest = dest; 3432 enum brw_reg_type type = dest.type; 3433 3434 /* Special case fields in the VUE header */ 3435 if (base == VARYING_SLOT_LAYER) 3436 comp = 1; 3437 else if (base == VARYING_SLOT_VIEWPORT) 3438 comp = 2; 3439 3440 if (nir_dest_bit_size(instr->dest) == 64) { 3441 /* const_index is in 32-bit type size units that could not be aligned 3442 * with DF. We need to read the double vector as if it was a float 3443 * vector of twice the number of components to fetch the right data. 
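          *
          * (A flat dvec2 input, for example, is read here as four floats and
          * then reassembled into two DFs by shuffle_from_32bit_read() below.)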
3444 */ 3445 type = BRW_REGISTER_TYPE_F; 3446 num_components *= 2; 3447 dest = bld.vgrf(type, num_components); 3448 } 3449 3450 for (unsigned int i = 0; i < num_components; i++) { 3451 bld.MOV(offset(retype(dest, type), bld, i), 3452 retype(component(interp_reg(base, comp + i), 3), type)); 3453 } 3454 3455 if (nir_dest_bit_size(instr->dest) == 64) { 3456 shuffle_from_32bit_read(bld, orig_dest, dest, 0, 3457 instr->num_components); 3458 } 3459 break; 3460 } 3461 3462 case nir_intrinsic_load_barycentric_pixel: 3463 case nir_intrinsic_load_barycentric_centroid: 3464 case nir_intrinsic_load_barycentric_sample: 3465 /* Do nothing - load_interpolated_input handling will handle it later. */ 3466 break; 3467 3468 case nir_intrinsic_load_barycentric_at_sample: { 3469 const glsl_interp_mode interpolation = 3470 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3471 3472 if (nir_src_is_const(instr->src[0])) { 3473 unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4; 3474 3475 emit_pixel_interpolater_send(bld, 3476 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3477 dest, 3478 fs_reg(), /* src */ 3479 brw_imm_ud(msg_data), 3480 interpolation); 3481 } else { 3482 const fs_reg sample_src = retype(get_nir_src(instr->src[0]), 3483 BRW_REGISTER_TYPE_UD); 3484 3485 if (nir_src_is_dynamically_uniform(instr->src[0])) { 3486 const fs_reg sample_id = bld.emit_uniformize(sample_src); 3487 const fs_reg msg_data = vgrf(glsl_type::uint_type); 3488 bld.exec_all().group(1, 0) 3489 .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3490 emit_pixel_interpolater_send(bld, 3491 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3492 dest, 3493 fs_reg(), /* src */ 3494 msg_data, 3495 interpolation); 3496 } else { 3497 /* Make a loop that sends a message to the pixel interpolater 3498 * for the sample number in each live channel. If there are 3499 * multiple channels with the same sample number then these 3500 * will be handled simultaneously with a single interation of 3501 * the loop. 
3502 */ 3503 bld.emit(BRW_OPCODE_DO); 3504 3505 /* Get the next live sample number into sample_id_reg */ 3506 const fs_reg sample_id = bld.emit_uniformize(sample_src); 3507 3508 /* Set the flag register so that we can perform the send 3509 * message on all channels that have the same sample number 3510 */ 3511 bld.CMP(bld.null_reg_ud(), 3512 sample_src, sample_id, 3513 BRW_CONDITIONAL_EQ); 3514 const fs_reg msg_data = vgrf(glsl_type::uint_type); 3515 bld.exec_all().group(1, 0) 3516 .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3517 fs_inst *inst = 3518 emit_pixel_interpolater_send(bld, 3519 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3520 dest, 3521 fs_reg(), /* src */ 3522 component(msg_data, 0), 3523 interpolation); 3524 set_predicate(BRW_PREDICATE_NORMAL, inst); 3525 3526 /* Continue the loop if there are any live channels left */ 3527 set_predicate_inv(BRW_PREDICATE_NORMAL, 3528 true, /* inverse */ 3529 bld.emit(BRW_OPCODE_WHILE)); 3530 } 3531 } 3532 break; 3533 } 3534 3535 case nir_intrinsic_load_barycentric_at_offset: { 3536 const glsl_interp_mode interpolation = 3537 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3538 3539 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 3540 3541 if (const_offset) { 3542 assert(nir_src_bit_size(instr->src[0]) == 32); 3543 unsigned off_x = MIN2((int)(const_offset[0].f32 * 16), 7) & 0xf; 3544 unsigned off_y = MIN2((int)(const_offset[1].f32 * 16), 7) & 0xf; 3545 3546 emit_pixel_interpolater_send(bld, 3547 FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, 3548 dest, 3549 fs_reg(), /* src */ 3550 brw_imm_ud(off_x | (off_y << 4)), 3551 interpolation); 3552 } else { 3553 fs_reg src = vgrf(glsl_type::ivec2_type); 3554 fs_reg offset_src = retype(get_nir_src(instr->src[0]), 3555 BRW_REGISTER_TYPE_F); 3556 for (int i = 0; i < 2; i++) { 3557 fs_reg temp = vgrf(glsl_type::float_type); 3558 bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f)); 3559 fs_reg itemp = vgrf(glsl_type::int_type); 3560 /* float to int */ 3561 bld.MOV(itemp, temp); 3562 3563 /* Clamp the upper end of the range to +7/16. 3564 * ARB_gpu_shader5 requires that we support a maximum offset 3565 * of +0.5, which isn't representable in a S0.4 value -- if 3566 * we didn't clamp it, we'd end up with -8/16, which is the 3567 * opposite of what the shader author wanted. 
3568 * 3569 * This is legal due to ARB_gpu_shader5's quantization 3570 * rules: 3571 * 3572 * "Not all values of <offset> may be supported; x and y 3573 * offsets may be rounded to fixed-point values with the 3574 * number of fraction bits given by the 3575 * implementation-dependent constant 3576 * FRAGMENT_INTERPOLATION_OFFSET_BITS" 3577 */ 3578 set_condmod(BRW_CONDITIONAL_L, 3579 bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7))); 3580 } 3581 3582 const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; 3583 emit_pixel_interpolater_send(bld, 3584 opcode, 3585 dest, 3586 src, 3587 brw_imm_ud(0u), 3588 interpolation); 3589 } 3590 break; 3591 } 3592 3593 case nir_intrinsic_load_interpolated_input: { 3594 if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) { 3595 emit_fragcoord_interpolation(dest); 3596 break; 3597 } 3598 3599 assert(instr->src[0].ssa && 3600 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic); 3601 nir_intrinsic_instr *bary_intrinsic = 3602 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); 3603 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; 3604 enum glsl_interp_mode interp_mode = 3605 (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); 3606 fs_reg dst_xy; 3607 3608 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || 3609 bary_intrin == nir_intrinsic_load_barycentric_at_sample) { 3610 /* Use the result of the PI message */ 3611 dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); 3612 } else { 3613 /* Use the delta_xy values computed from the payload */ 3614 enum brw_barycentric_mode bary = 3615 brw_barycentric_mode(interp_mode, bary_intrin); 3616 3617 dst_xy = this->delta_xy[bary]; 3618 } 3619 3620 for (unsigned int i = 0; i < instr->num_components; i++) { 3621 fs_reg interp = 3622 component(interp_reg(nir_intrinsic_base(instr), 3623 nir_intrinsic_component(instr) + i), 0); 3624 interp.type = BRW_REGISTER_TYPE_F; 3625 dest.type = BRW_REGISTER_TYPE_F; 3626 3627 if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) { 3628 fs_reg tmp = vgrf(glsl_type::float_type); 3629 bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); 3630 bld.MUL(offset(dest, bld, i), tmp, this->pixel_w); 3631 } else { 3632 bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); 3633 } 3634 } 3635 break; 3636 } 3637 3638 default: 3639 nir_emit_intrinsic(bld, instr); 3640 break; 3641 } 3642} 3643 3644static int 3645get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) 3646{ 3647 if (nir_src_is_const(instr->src[src])) { 3648 int64_t add_val = nir_src_as_int(instr->src[src]); 3649 if (add_val == 1) 3650 return BRW_AOP_INC; 3651 else if (add_val == -1) 3652 return BRW_AOP_DEC; 3653 } 3654 3655 return BRW_AOP_ADD; 3656} 3657 3658void 3659fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, 3660 nir_intrinsic_instr *instr) 3661{ 3662 assert(stage == MESA_SHADER_COMPUTE); 3663 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); 3664 3665 fs_reg dest; 3666 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3667 dest = get_nir_dest(instr->dest); 3668 3669 switch (instr->intrinsic) { 3670 case nir_intrinsic_barrier: 3671 emit_barrier(); 3672 cs_prog_data->uses_barrier = true; 3673 break; 3674 3675 case nir_intrinsic_load_subgroup_id: 3676 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id); 3677 break; 3678 3679 case nir_intrinsic_load_local_invocation_id: 3680 case nir_intrinsic_load_work_group_id: { 3681 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3682 
fs_reg val = nir_system_values[sv]; 3683 assert(val.file != BAD_FILE); 3684 dest.type = val.type; 3685 for (unsigned i = 0; i < 3; i++) 3686 bld.MOV(offset(dest, bld, i), offset(val, bld, i)); 3687 break; 3688 } 3689 3690 case nir_intrinsic_load_num_work_groups: { 3691 const unsigned surface = 3692 cs_prog_data->binding_table.work_groups_start; 3693 3694 cs_prog_data->uses_num_work_groups = true; 3695 3696 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3697 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface); 3698 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3699 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); /* num components */ 3700 3701 /* Read the 3 GLuint components of gl_NumWorkGroups */ 3702 for (unsigned i = 0; i < 3; i++) { 3703 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(i << 2); 3704 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 3705 offset(dest, bld, i), srcs, SURFACE_LOGICAL_NUM_SRCS); 3706 } 3707 break; 3708 } 3709 3710 case nir_intrinsic_shared_atomic_add: 3711 nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr); 3712 break; 3713 case nir_intrinsic_shared_atomic_imin: 3714 nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr); 3715 break; 3716 case nir_intrinsic_shared_atomic_umin: 3717 nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr); 3718 break; 3719 case nir_intrinsic_shared_atomic_imax: 3720 nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr); 3721 break; 3722 case nir_intrinsic_shared_atomic_umax: 3723 nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr); 3724 break; 3725 case nir_intrinsic_shared_atomic_and: 3726 nir_emit_shared_atomic(bld, BRW_AOP_AND, instr); 3727 break; 3728 case nir_intrinsic_shared_atomic_or: 3729 nir_emit_shared_atomic(bld, BRW_AOP_OR, instr); 3730 break; 3731 case nir_intrinsic_shared_atomic_xor: 3732 nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr); 3733 break; 3734 case nir_intrinsic_shared_atomic_exchange: 3735 nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr); 3736 break; 3737 case nir_intrinsic_shared_atomic_comp_swap: 3738 nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr); 3739 break; 3740 case nir_intrinsic_shared_atomic_fmin: 3741 nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr); 3742 break; 3743 case nir_intrinsic_shared_atomic_fmax: 3744 nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr); 3745 break; 3746 case nir_intrinsic_shared_atomic_fcomp_swap: 3747 nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr); 3748 break; 3749 3750 case nir_intrinsic_load_shared: { 3751 assert(devinfo->gen >= 7); 3752 assert(stage == MESA_SHADER_COMPUTE); 3753 3754 const unsigned bit_size = nir_dest_bit_size(instr->dest); 3755 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3756 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); 3757 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]); 3758 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3759 3760 /* Make dest unsigned because that's what the temporary will be */ 3761 dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 3762 3763 /* Read the vector */ 3764 if (nir_intrinsic_align(instr) >= 4) { 3765 assert(nir_dest_bit_size(instr->dest) == 32); 3766 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 3767 fs_inst *inst = 3768 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 3769 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 3770 inst->size_written = instr->num_components * dispatch_width * 4; 3771 } else { 3772 assert(nir_dest_bit_size(instr->dest) <= 32); 3773 assert(nir_dest_num_components(instr->dest) == 1); 3774 
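            /* Sub-dword aligned loads go through the byte scattered read
             * message, which returns a single value per channel; here the
             * immediate argument carries the bit size rather than a
             * component count.
             */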
srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 3775 3776 fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 3777 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 3778 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 3779 bld.MOV(dest, read_result); 3780 } 3781 break; 3782 } 3783 3784 case nir_intrinsic_store_shared: { 3785 assert(devinfo->gen >= 7); 3786 assert(stage == MESA_SHADER_COMPUTE); 3787 3788 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 3789 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3790 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM); 3791 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 3792 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3793 3794 fs_reg data = get_nir_src(instr->src[0]); 3795 data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 3796 3797 assert(nir_intrinsic_write_mask(instr) == 3798 (1u << instr->num_components) - 1); 3799 if (nir_intrinsic_align(instr) >= 4) { 3800 assert(nir_src_bit_size(instr->src[0]) == 32); 3801 assert(nir_src_num_components(instr->src[0]) <= 4); 3802 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 3803 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 3804 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 3805 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 3806 } else { 3807 assert(nir_src_bit_size(instr->src[0]) <= 32); 3808 assert(nir_src_num_components(instr->src[0]) == 1); 3809 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 3810 3811 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 3812 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 3813 3814 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 3815 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 3816 } 3817 break; 3818 } 3819 3820 default: 3821 nir_emit_intrinsic(bld, instr); 3822 break; 3823 } 3824} 3825 3826static fs_reg 3827brw_nir_reduction_op_identity(const fs_builder &bld, 3828 nir_op op, brw_reg_type type) 3829{ 3830 nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8); 3831 switch (type_sz(type)) { 3832 case 2: 3833 assert(type != BRW_REGISTER_TYPE_HF); 3834 return retype(brw_imm_uw(value.u16), type); 3835 case 4: 3836 return retype(brw_imm_ud(value.u32), type); 3837 case 8: 3838 if (type == BRW_REGISTER_TYPE_DF) 3839 return setup_imm_df(bld, value.f64); 3840 else 3841 return retype(brw_imm_u64(value.u64), type); 3842 default: 3843 unreachable("Invalid type size"); 3844 } 3845} 3846 3847static opcode 3848brw_op_for_nir_reduction_op(nir_op op) 3849{ 3850 switch (op) { 3851 case nir_op_iadd: return BRW_OPCODE_ADD; 3852 case nir_op_fadd: return BRW_OPCODE_ADD; 3853 case nir_op_imul: return BRW_OPCODE_MUL; 3854 case nir_op_fmul: return BRW_OPCODE_MUL; 3855 case nir_op_imin: return BRW_OPCODE_SEL; 3856 case nir_op_umin: return BRW_OPCODE_SEL; 3857 case nir_op_fmin: return BRW_OPCODE_SEL; 3858 case nir_op_imax: return BRW_OPCODE_SEL; 3859 case nir_op_umax: return BRW_OPCODE_SEL; 3860 case nir_op_fmax: return BRW_OPCODE_SEL; 3861 case nir_op_iand: return BRW_OPCODE_AND; 3862 case nir_op_ior: return BRW_OPCODE_OR; 3863 case nir_op_ixor: return BRW_OPCODE_XOR; 3864 default: 3865 unreachable("Invalid reduction operation"); 3866 } 3867} 3868 3869static brw_conditional_mod 3870brw_cond_mod_for_nir_reduction_op(nir_op op) 3871{ 3872 switch (op) { 3873 case nir_op_iadd: return BRW_CONDITIONAL_NONE; 3874 case nir_op_fadd: return BRW_CONDITIONAL_NONE; 3875 case nir_op_imul: return BRW_CONDITIONAL_NONE; 3876 case nir_op_fmul: return BRW_CONDITIONAL_NONE; 3877 case 
nir_op_imin: return BRW_CONDITIONAL_L; 3878 case nir_op_umin: return BRW_CONDITIONAL_L; 3879 case nir_op_fmin: return BRW_CONDITIONAL_L; 3880 case nir_op_imax: return BRW_CONDITIONAL_GE; 3881 case nir_op_umax: return BRW_CONDITIONAL_GE; 3882 case nir_op_fmax: return BRW_CONDITIONAL_GE; 3883 case nir_op_iand: return BRW_CONDITIONAL_NONE; 3884 case nir_op_ior: return BRW_CONDITIONAL_NONE; 3885 case nir_op_ixor: return BRW_CONDITIONAL_NONE; 3886 default: 3887 unreachable("Invalid reduction operation"); 3888 } 3889} 3890 3891fs_reg 3892fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld, 3893 nir_intrinsic_instr *instr) 3894{ 3895 fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD); 3896 3897 if (stage_prog_data->binding_table.image_start > 0) { 3898 if (image.file == BRW_IMMEDIATE_VALUE) { 3899 image.d += stage_prog_data->binding_table.image_start; 3900 } else { 3901 bld.ADD(image, image, 3902 brw_imm_d(stage_prog_data->binding_table.image_start)); 3903 } 3904 } 3905 3906 return bld.emit_uniformize(image); 3907} 3908 3909fs_reg 3910fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld, 3911 nir_intrinsic_instr *instr) 3912{ 3913 /* SSBO stores are weird in that their index is in src[1] */ 3914 const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 1 : 0; 3915 3916 fs_reg surf_index; 3917 if (nir_src_is_const(instr->src[src])) { 3918 unsigned index = stage_prog_data->binding_table.ssbo_start + 3919 nir_src_as_uint(instr->src[src]); 3920 surf_index = brw_imm_ud(index); 3921 } else { 3922 surf_index = vgrf(glsl_type::uint_type); 3923 bld.ADD(surf_index, get_nir_src(instr->src[src]), 3924 brw_imm_ud(stage_prog_data->binding_table.ssbo_start)); 3925 } 3926 3927 return bld.emit_uniformize(surf_index); 3928} 3929 3930static unsigned 3931image_intrinsic_coord_components(nir_intrinsic_instr *instr) 3932{ 3933 switch (nir_intrinsic_image_dim(instr)) { 3934 case GLSL_SAMPLER_DIM_1D: 3935 return 1 + nir_intrinsic_image_array(instr); 3936 case GLSL_SAMPLER_DIM_2D: 3937 case GLSL_SAMPLER_DIM_RECT: 3938 return 2 + nir_intrinsic_image_array(instr); 3939 case GLSL_SAMPLER_DIM_3D: 3940 case GLSL_SAMPLER_DIM_CUBE: 3941 return 3; 3942 case GLSL_SAMPLER_DIM_BUF: 3943 return 1; 3944 case GLSL_SAMPLER_DIM_MS: 3945 return 2 + nir_intrinsic_image_array(instr); 3946 default: 3947 unreachable("Invalid image dimension"); 3948 } 3949} 3950 3951void 3952fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) 3953{ 3954 fs_reg dest; 3955 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3956 dest = get_nir_dest(instr->dest); 3957 3958 switch (instr->intrinsic) { 3959 case nir_intrinsic_image_load: 3960 case nir_intrinsic_image_store: 3961 case nir_intrinsic_image_atomic_add: 3962 case nir_intrinsic_image_atomic_min: 3963 case nir_intrinsic_image_atomic_max: 3964 case nir_intrinsic_image_atomic_and: 3965 case nir_intrinsic_image_atomic_or: 3966 case nir_intrinsic_image_atomic_xor: 3967 case nir_intrinsic_image_atomic_exchange: 3968 case nir_intrinsic_image_atomic_comp_swap: 3969 case nir_intrinsic_bindless_image_load: 3970 case nir_intrinsic_bindless_image_store: 3971 case nir_intrinsic_bindless_image_atomic_add: 3972 case nir_intrinsic_bindless_image_atomic_min: 3973 case nir_intrinsic_bindless_image_atomic_max: 3974 case nir_intrinsic_bindless_image_atomic_and: 3975 case nir_intrinsic_bindless_image_atomic_or: 3976 case nir_intrinsic_bindless_image_atomic_xor: 3977 case nir_intrinsic_bindless_image_atomic_exchange: 3978 case 
nir_intrinsic_bindless_image_atomic_comp_swap: { 3979 if (stage == MESA_SHADER_FRAGMENT && 3980 instr->intrinsic != nir_intrinsic_image_load) 3981 brw_wm_prog_data(prog_data)->has_side_effects = true; 3982 3983 /* Get some metadata from the image intrinsic. */ 3984 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; 3985 const GLenum format = nir_intrinsic_format(instr); 3986 3987 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3988 3989 switch (instr->intrinsic) { 3990 case nir_intrinsic_image_load: 3991 case nir_intrinsic_image_store: 3992 case nir_intrinsic_image_atomic_add: 3993 case nir_intrinsic_image_atomic_min: 3994 case nir_intrinsic_image_atomic_max: 3995 case nir_intrinsic_image_atomic_and: 3996 case nir_intrinsic_image_atomic_or: 3997 case nir_intrinsic_image_atomic_xor: 3998 case nir_intrinsic_image_atomic_exchange: 3999 case nir_intrinsic_image_atomic_comp_swap: 4000 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4001 get_nir_image_intrinsic_image(bld, instr); 4002 break; 4003 4004 default: 4005 /* Bindless */ 4006 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = 4007 bld.emit_uniformize(get_nir_src(instr->src[0])); 4008 break; 4009 } 4010 4011 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4012 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = 4013 brw_imm_ud(image_intrinsic_coord_components(instr)); 4014 4015 /* Emit an image load, store or atomic op. */ 4016 if (instr->intrinsic == nir_intrinsic_image_load || 4017 instr->intrinsic == nir_intrinsic_bindless_image_load) { 4018 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4019 fs_inst *inst = 4020 bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, 4021 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4022 inst->size_written = instr->num_components * dispatch_width * 4; 4023 } else if (instr->intrinsic == nir_intrinsic_image_store || 4024 instr->intrinsic == nir_intrinsic_bindless_image_store) { 4025 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4026 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]); 4027 bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, 4028 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4029 } else { 4030 int op; 4031 unsigned num_srcs = info->num_srcs; 4032 4033 switch (instr->intrinsic) { 4034 case nir_intrinsic_image_atomic_add: 4035 case nir_intrinsic_bindless_image_atomic_add: 4036 assert(num_srcs == 4); 4037 4038 op = get_op_for_atomic_add(instr, 3); 4039 4040 if (op != BRW_AOP_ADD) 4041 num_srcs = 3; 4042 break; 4043 case nir_intrinsic_image_atomic_min: 4044 case nir_intrinsic_bindless_image_atomic_min: 4045 assert(format == GL_R32UI || format == GL_R32I); 4046 op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN; 4047 break; 4048 case nir_intrinsic_image_atomic_max: 4049 case nir_intrinsic_bindless_image_atomic_max: 4050 assert(format == GL_R32UI || format == GL_R32I); 4051 op = (format == GL_R32I) ? 
                     BRW_AOP_IMAX : BRW_AOP_UMAX;
            break;
         case nir_intrinsic_image_atomic_and:
         case nir_intrinsic_bindless_image_atomic_and:
            op = BRW_AOP_AND;
            break;
         case nir_intrinsic_image_atomic_or:
         case nir_intrinsic_bindless_image_atomic_or:
            op = BRW_AOP_OR;
            break;
         case nir_intrinsic_image_atomic_xor:
         case nir_intrinsic_bindless_image_atomic_xor:
            op = BRW_AOP_XOR;
            break;
         case nir_intrinsic_image_atomic_exchange:
         case nir_intrinsic_bindless_image_atomic_exchange:
            op = BRW_AOP_MOV;
            break;
         case nir_intrinsic_image_atomic_comp_swap:
         case nir_intrinsic_bindless_image_atomic_comp_swap:
            op = BRW_AOP_CMPWR;
            break;
         default:
            unreachable("Not reachable.");
         }

         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);

         fs_reg data;
         if (num_srcs >= 4)
            data = get_nir_src(instr->src[3]);
         if (num_srcs >= 5) {
            fs_reg tmp = bld.vgrf(data.type, 2);
            fs_reg sources[2] = { data, get_nir_src(instr->src[4]) };
            bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
            data = tmp;
         }
         srcs[SURFACE_LOGICAL_SRC_DATA] = data;

         bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
      }
      break;
   }

   case nir_intrinsic_image_size:
   case nir_intrinsic_bindless_image_size: {
      /* Unlike the [un]typed load and store opcodes, the TXS that this turns
       * into will handle the binding table index for us in the generator.
       * Incidentally, this means that we can handle bindless with exactly the
       * same code.
       */
      fs_reg image = retype(get_nir_src_imm(instr->src[0]),
                            BRW_REGISTER_TYPE_UD);
      image = bld.emit_uniformize(image);

      fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
      if (instr->intrinsic == nir_intrinsic_image_size)
         srcs[TEX_LOGICAL_SRC_SURFACE] = image;
      else
         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image;
      srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
      srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0);
      srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);

      /* Since the image size is always uniform, we can just emit a SIMD8
       * query instruction and splat the result out.
4118 */ 4119 const fs_builder ubld = bld.exec_all().group(8, 0); 4120 4121 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); 4122 fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL, 4123 tmp, srcs, ARRAY_SIZE(srcs)); 4124 inst->size_written = 4 * REG_SIZE; 4125 4126 for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) { 4127 if (c == 2 && nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE) { 4128 bld.emit(SHADER_OPCODE_INT_QUOTIENT, 4129 offset(retype(dest, tmp.type), bld, c), 4130 component(offset(tmp, ubld, c), 0), brw_imm_ud(6)); 4131 } else { 4132 bld.MOV(offset(retype(dest, tmp.type), bld, c), 4133 component(offset(tmp, ubld, c), 0)); 4134 } 4135 } 4136 break; 4137 } 4138 4139 case nir_intrinsic_image_load_raw_intel: { 4140 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4141 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4142 get_nir_image_intrinsic_image(bld, instr); 4143 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4144 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4145 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4146 4147 fs_inst *inst = 4148 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 4149 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4150 inst->size_written = instr->num_components * dispatch_width * 4; 4151 break; 4152 } 4153 4154 case nir_intrinsic_image_store_raw_intel: { 4155 if (stage == MESA_SHADER_FRAGMENT) 4156 brw_wm_prog_data(prog_data)->has_side_effects = true; 4157 4158 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4159 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4160 get_nir_image_intrinsic_image(bld, instr); 4161 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4162 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]); 4163 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4164 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4165 4166 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 4167 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4168 break; 4169 } 4170 4171 case nir_intrinsic_group_memory_barrier: 4172 case nir_intrinsic_memory_barrier_shared: 4173 case nir_intrinsic_memory_barrier_atomic_counter: 4174 case nir_intrinsic_memory_barrier_buffer: 4175 case nir_intrinsic_memory_barrier_image: 4176 case nir_intrinsic_memory_barrier: { 4177 const fs_builder ubld = bld.group(8, 0); 4178 const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); 4179 ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, 4180 brw_vec8_grf(0, 0), brw_imm_ud(0)) 4181 ->size_written = 2 * REG_SIZE; 4182 break; 4183 } 4184 4185 case nir_intrinsic_shader_clock: { 4186 /* We cannot do anything if there is an event, so ignore it for now */ 4187 const fs_reg shader_clock = get_timestamp(bld); 4188 const fs_reg srcs[] = { component(shader_clock, 0), 4189 component(shader_clock, 1) }; 4190 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); 4191 break; 4192 } 4193 4194 case nir_intrinsic_image_samples: 4195 /* The driver does not support multi-sampled images. 
       */
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
      break;

   case nir_intrinsic_load_uniform: {
      /* Offsets are in bytes but they should always be aligned to
       * the type size.
       */
      assert(instr->const_index[0] % 4 == 0 ||
             instr->const_index[0] % type_sz(dest.type) == 0);

      fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);

      if (nir_src_is_const(instr->src[0])) {
         unsigned load_offset = nir_src_as_uint(instr->src[0]);
         assert(load_offset % type_sz(dest.type) == 0);
         /* For 16-bit types we add the modulo of the const_index[0]
          * offset to access elements that are not 32-bit aligned.
          */
         src.offset = load_offset + instr->const_index[0] % 4;

         for (unsigned j = 0; j < instr->num_components; j++) {
            bld.MOV(offset(dest, bld, j), offset(src, bld, j));
         }
      } else {
         fs_reg indirect = retype(get_nir_src(instr->src[0]),
                                  BRW_REGISTER_TYPE_UD);

         /* We need to pass a size to the MOV_INDIRECT but we don't want it to
          * go past the end of the uniform. In order to keep the n'th
          * component from running past, we subtract off the size of all but
          * one component of the vector.
          */
         assert(instr->const_index[1] >=
                instr->num_components * (int) type_sz(dest.type));
         unsigned read_size = instr->const_index[1] -
            (instr->num_components - 1) * type_sz(dest.type);

         bool supports_64bit_indirects =
            !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo);

         if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
            for (unsigned j = 0; j < instr->num_components; j++) {
               bld.emit(SHADER_OPCODE_MOV_INDIRECT,
                        offset(dest, bld, j), offset(src, bld, j),
                        indirect, brw_imm_ud(read_size));
            }
         } else {
            const unsigned num_mov_indirects =
               type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
            /* We read a little bit less per MOV INDIRECT, as they are now
             * 32-bit ones instead of 64-bit. Adjust read_size accordingly.
             */
            const unsigned read_size_32bit = read_size -
               (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
            for (unsigned j = 0; j < instr->num_components; j++) {
               for (unsigned i = 0; i < num_mov_indirects; i++) {
                  bld.emit(SHADER_OPCODE_MOV_INDIRECT,
                           subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
                           subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
                           indirect, brw_imm_ud(read_size_32bit));
               }
            }
         }
      }
      break;
   }

   case nir_intrinsic_load_ubo: {
      fs_reg surf_index;
      if (nir_src_is_const(instr->src[0])) {
         const unsigned index = stage_prog_data->binding_table.ubo_start +
                                nir_src_as_uint(instr->src[0]);
         surf_index = brw_imm_ud(index);
      } else {
         /* The block index is not a constant. Evaluate the index expression
          * per-channel and add the base UBO index; we have to select a value
          * from any live channel.
4273 */ 4274 surf_index = vgrf(glsl_type::uint_type); 4275 bld.ADD(surf_index, get_nir_src(instr->src[0]), 4276 brw_imm_ud(stage_prog_data->binding_table.ubo_start)); 4277 surf_index = bld.emit_uniformize(surf_index); 4278 } 4279 4280 if (!nir_src_is_const(instr->src[1])) { 4281 fs_reg base_offset = retype(get_nir_src(instr->src[1]), 4282 BRW_REGISTER_TYPE_UD); 4283 4284 for (int i = 0; i < instr->num_components; i++) 4285 VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index, 4286 base_offset, i * type_sz(dest.type)); 4287 } else { 4288 /* Even if we are loading doubles, a pull constant load will load 4289 * a 32-bit vec4, so should only reserve vgrf space for that. If we 4290 * need to load a full dvec4 we will have to emit 2 loads. This is 4291 * similar to demote_pull_constants(), except that in that case we 4292 * see individual accesses to each component of the vector and then 4293 * we let CSE deal with duplicate loads. Here we see a vector access 4294 * and we have to split it if necessary. 4295 */ 4296 const unsigned type_size = type_sz(dest.type); 4297 const unsigned load_offset = nir_src_as_uint(instr->src[1]); 4298 4299 /* See if we've selected this as a push constant candidate */ 4300 if (nir_src_is_const(instr->src[0])) { 4301 const unsigned ubo_block = nir_src_as_uint(instr->src[0]); 4302 const unsigned offset_256b = load_offset / 32; 4303 4304 fs_reg push_reg; 4305 for (int i = 0; i < 4; i++) { 4306 const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; 4307 if (range->block == ubo_block && 4308 offset_256b >= range->start && 4309 offset_256b < range->start + range->length) { 4310 4311 push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type); 4312 push_reg.offset = load_offset - 32 * range->start; 4313 break; 4314 } 4315 } 4316 4317 if (push_reg.file != BAD_FILE) { 4318 for (unsigned i = 0; i < instr->num_components; i++) { 4319 bld.MOV(offset(dest, bld, i), 4320 byte_offset(push_reg, i * type_size)); 4321 } 4322 break; 4323 } 4324 } 4325 4326 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ 4327 const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); 4328 const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4329 4330 for (unsigned c = 0; c < instr->num_components;) { 4331 const unsigned base = load_offset + c * type_size; 4332 /* Number of usable components in the next block-aligned load. 
*/ 4333 const unsigned count = MIN2(instr->num_components - c, 4334 (block_sz - base % block_sz) / type_size); 4335 4336 ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 4337 packed_consts, surf_index, 4338 brw_imm_ud(base & ~(block_sz - 1))); 4339 4340 const fs_reg consts = 4341 retype(byte_offset(packed_consts, base & (block_sz - 1)), 4342 dest.type); 4343 4344 for (unsigned d = 0; d < count; d++) 4345 bld.MOV(offset(dest, bld, c + d), component(consts, d)); 4346 4347 c += count; 4348 } 4349 } 4350 break; 4351 } 4352 4353 case nir_intrinsic_load_global: { 4354 assert(devinfo->gen >= 8); 4355 4356 if (nir_intrinsic_align(instr) >= 4) { 4357 assert(nir_dest_bit_size(instr->dest) == 32); 4358 fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, 4359 dest, 4360 get_nir_src(instr->src[0]), /* Address */ 4361 fs_reg(), /* No source data */ 4362 brw_imm_ud(instr->num_components)); 4363 inst->size_written = instr->num_components * 4364 inst->dst.component_size(inst->exec_size); 4365 } else { 4366 const unsigned bit_size = nir_dest_bit_size(instr->dest); 4367 assert(bit_size <= 32); 4368 assert(nir_dest_num_components(instr->dest) == 1); 4369 brw_reg_type data_type = 4370 brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4371 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4372 bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, 4373 tmp, 4374 get_nir_src(instr->src[0]), /* Address */ 4375 fs_reg(), /* No source data */ 4376 brw_imm_ud(bit_size)); 4377 bld.MOV(retype(dest, data_type), tmp); 4378 } 4379 break; 4380 } 4381 4382 case nir_intrinsic_store_global: 4383 assert(devinfo->gen >= 8); 4384 4385 if (stage == MESA_SHADER_FRAGMENT) 4386 brw_wm_prog_data(prog_data)->has_side_effects = true; 4387 4388 if (nir_intrinsic_align(instr) >= 4) { 4389 assert(nir_src_bit_size(instr->src[0]) == 32); 4390 bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, 4391 fs_reg(), 4392 get_nir_src(instr->src[1]), /* Address */ 4393 get_nir_src(instr->src[0]), /* Data */ 4394 brw_imm_ud(instr->num_components)); 4395 } else { 4396 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 4397 assert(bit_size <= 32); 4398 assert(nir_src_num_components(instr->src[0]) == 1); 4399 brw_reg_type data_type = 4400 brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4401 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4402 bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type)); 4403 bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, 4404 fs_reg(), 4405 get_nir_src(instr->src[1]), /* Address */ 4406 tmp, /* Data */ 4407 brw_imm_ud(nir_src_bit_size(instr->src[0]))); 4408 } 4409 break; 4410 4411 case nir_intrinsic_global_atomic_add: 4412 nir_emit_global_atomic(bld, get_op_for_atomic_add(instr, 1), instr); 4413 break; 4414 case nir_intrinsic_global_atomic_imin: 4415 nir_emit_global_atomic(bld, BRW_AOP_IMIN, instr); 4416 break; 4417 case nir_intrinsic_global_atomic_umin: 4418 nir_emit_global_atomic(bld, BRW_AOP_UMIN, instr); 4419 break; 4420 case nir_intrinsic_global_atomic_imax: 4421 nir_emit_global_atomic(bld, BRW_AOP_IMAX, instr); 4422 break; 4423 case nir_intrinsic_global_atomic_umax: 4424 nir_emit_global_atomic(bld, BRW_AOP_UMAX, instr); 4425 break; 4426 case nir_intrinsic_global_atomic_and: 4427 nir_emit_global_atomic(bld, BRW_AOP_AND, instr); 4428 break; 4429 case nir_intrinsic_global_atomic_or: 4430 nir_emit_global_atomic(bld, BRW_AOP_OR, instr); 4431 break; 4432 case nir_intrinsic_global_atomic_xor: 4433 nir_emit_global_atomic(bld, BRW_AOP_XOR, instr); 4434 break; 4435 case 
nir_intrinsic_global_atomic_exchange: 4436 nir_emit_global_atomic(bld, BRW_AOP_MOV, instr); 4437 break; 4438 case nir_intrinsic_global_atomic_comp_swap: 4439 nir_emit_global_atomic(bld, BRW_AOP_CMPWR, instr); 4440 break; 4441 case nir_intrinsic_global_atomic_fmin: 4442 nir_emit_global_atomic_float(bld, BRW_AOP_FMIN, instr); 4443 break; 4444 case nir_intrinsic_global_atomic_fmax: 4445 nir_emit_global_atomic_float(bld, BRW_AOP_FMAX, instr); 4446 break; 4447 case nir_intrinsic_global_atomic_fcomp_swap: 4448 nir_emit_global_atomic_float(bld, BRW_AOP_FCMPWR, instr); 4449 break; 4450 4451 case nir_intrinsic_load_ssbo: { 4452 assert(devinfo->gen >= 7); 4453 4454 const unsigned bit_size = nir_dest_bit_size(instr->dest); 4455 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4456 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4457 get_nir_ssbo_intrinsic_index(bld, instr); 4458 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4459 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4460 4461 /* Make dest unsigned because that's what the temporary will be */ 4462 dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4463 4464 /* Read the vector */ 4465 if (nir_intrinsic_align(instr) >= 4) { 4466 assert(nir_dest_bit_size(instr->dest) == 32); 4467 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4468 fs_inst *inst = 4469 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 4470 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4471 inst->size_written = instr->num_components * dispatch_width * 4; 4472 } else { 4473 assert(nir_dest_bit_size(instr->dest) <= 32); 4474 assert(nir_dest_num_components(instr->dest) == 1); 4475 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 4476 4477 fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 4478 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 4479 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 4480 bld.MOV(dest, read_result); 4481 } 4482 break; 4483 } 4484 4485 case nir_intrinsic_store_ssbo: { 4486 assert(devinfo->gen >= 7); 4487 4488 if (stage == MESA_SHADER_FRAGMENT) 4489 brw_wm_prog_data(prog_data)->has_side_effects = true; 4490 4491 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 4492 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4493 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4494 get_nir_ssbo_intrinsic_index(bld, instr); 4495 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]); 4496 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4497 4498 fs_reg data = get_nir_src(instr->src[0]); 4499 data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4500 4501 assert(nir_intrinsic_write_mask(instr) == 4502 (1u << instr->num_components) - 1); 4503 if (nir_intrinsic_align(instr) >= 4) { 4504 assert(nir_src_bit_size(instr->src[0]) == 32); 4505 assert(nir_src_num_components(instr->src[0]) <= 4); 4506 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 4507 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4508 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 4509 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4510 } else { 4511 assert(nir_src_bit_size(instr->src[0]) <= 32); 4512 assert(nir_src_num_components(instr->src[0]) == 1); 4513 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 4514 4515 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 4516 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 4517 4518 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 4519 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4520 } 4521 break; 4522 } 4523 4524 case nir_intrinsic_store_output: { 4525 
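      /* Default store_output path: write the source into the output register
       * array, shuffling 64-bit data into pairs of 32-bit components first.
       */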
fs_reg src = get_nir_src(instr->src[0]); 4526 4527 unsigned store_offset = nir_src_as_uint(instr->src[1]); 4528 unsigned num_components = instr->num_components; 4529 unsigned first_component = nir_intrinsic_component(instr); 4530 if (nir_src_bit_size(instr->src[0]) == 64) { 4531 src = shuffle_for_32bit_write(bld, src, 0, num_components); 4532 num_components *= 2; 4533 } 4534 4535 fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld, 4536 4 * store_offset), src.type); 4537 for (unsigned j = 0; j < num_components; j++) { 4538 bld.MOV(offset(new_dest, bld, j + first_component), 4539 offset(src, bld, j)); 4540 } 4541 break; 4542 } 4543 4544 case nir_intrinsic_ssbo_atomic_add: 4545 nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr); 4546 break; 4547 case nir_intrinsic_ssbo_atomic_imin: 4548 nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr); 4549 break; 4550 case nir_intrinsic_ssbo_atomic_umin: 4551 nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr); 4552 break; 4553 case nir_intrinsic_ssbo_atomic_imax: 4554 nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr); 4555 break; 4556 case nir_intrinsic_ssbo_atomic_umax: 4557 nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr); 4558 break; 4559 case nir_intrinsic_ssbo_atomic_and: 4560 nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr); 4561 break; 4562 case nir_intrinsic_ssbo_atomic_or: 4563 nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr); 4564 break; 4565 case nir_intrinsic_ssbo_atomic_xor: 4566 nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr); 4567 break; 4568 case nir_intrinsic_ssbo_atomic_exchange: 4569 nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr); 4570 break; 4571 case nir_intrinsic_ssbo_atomic_comp_swap: 4572 nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr); 4573 break; 4574 case nir_intrinsic_ssbo_atomic_fmin: 4575 nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr); 4576 break; 4577 case nir_intrinsic_ssbo_atomic_fmax: 4578 nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr); 4579 break; 4580 case nir_intrinsic_ssbo_atomic_fcomp_swap: 4581 nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr); 4582 break; 4583 4584 case nir_intrinsic_get_buffer_size: { 4585 assert(nir_src_num_components(instr->src[0]) == 1); 4586 unsigned ssbo_index = nir_src_is_const(instr->src[0]) ? 4587 nir_src_as_uint(instr->src[0]) : 0; 4588 4589 /* A resinfo's sampler message is used to get the buffer size. The 4590 * SIMD8's writeback message consists of four registers and SIMD16's 4591 * writeback message consists of 8 destination registers (two per each 4592 * component). Because we are only interested on the first channel of 4593 * the first returned component, where resinfo returns the buffer size 4594 * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of 4595 * the dispatch width. 4596 */ 4597 const fs_builder ubld = bld.exec_all().group(8, 0); 4598 fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4599 fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); 4600 4601 /* Set LOD = 0 */ 4602 ubld.MOV(src_payload, brw_imm_d(0)); 4603 4604 const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index; 4605 fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload, 4606 src_payload, brw_imm_ud(index)); 4607 inst->header_size = 0; 4608 inst->mlen = 1; 4609 inst->size_written = 4 * REG_SIZE; 4610 4611 /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting: 4612 * 4613 * "Out-of-bounds checking is always performed at a DWord granularity. 
       * If any part of the DWord is out-of-bounds then the whole DWord is
       * considered out-of-bounds."
       *
       * This implies that types with size smaller than 4-bytes need to be
       * padded if they don't complete the last dword of the buffer. But as we
       * need to maintain the original size we need to reverse the padding
       * calculation to return the correct size to know the number of elements
       * of an unsized array. As we stored in the last two bits of the surface
       * size the needed padding for the buffer, we calculate here the
       * original buffer_size reversing the surface_size calculation:
       *
       *   surface_size = isl_align(buffer_size, 4) +
       *                  (isl_align(buffer_size) - buffer_size)
       *
       *   buffer_size = surface_size & ~3 - surface_size & 3
       */

      fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);

      ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
      ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
      ubld.ADD(buffer_size, size_aligned4, negate(size_padding));

      bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
      break;
   }

   case nir_intrinsic_load_subgroup_invocation:
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
              nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
      break;

   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_lt_mask:
      unreachable("not reached");

   case nir_intrinsic_vote_any: {
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag with
       * the identity value for the logical operation.
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(0));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
      }
      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half. Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
                                           BRW_PREDICATE_ALIGN1_ANY32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }
   case nir_intrinsic_vote_all: {
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag with
       * the identity value for the logical operation.
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(0xffffffff));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
      }
      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half. Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
                                           BRW_PREDICATE_ALIGN1_ALL32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }
   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq: {
      fs_reg value = get_nir_src(instr->src[0]);
      if (instr->intrinsic == nir_intrinsic_vote_feq) {
         const unsigned bit_size = nir_src_bit_size(instr->src[0]);
         value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
      }

      fs_reg uniformized = bld.emit_uniformize(value);
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag with
       * the identity value for the logical operation.
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(0xffffffff));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
      }
      bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half. Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
                                           BRW_PREDICATE_ALIGN1_ALL32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }

   case nir_intrinsic_ballot: {
      const fs_reg value = retype(get_nir_src(instr->src[0]),
                                  BRW_REGISTER_TYPE_UD);
      struct brw_reg flag = brw_flag_reg(0, 0);
      /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
       * as f0.0. This is a problem for fragment programs as we currently use
       * f0.1 for discards. Fortunately, we don't support SIMD32 fragment
       * programs yet so this isn't a problem. When we do, something will
       * have to change.
4770 */ 4771 if (dispatch_width == 32) 4772 flag.type = BRW_REGISTER_TYPE_UD; 4773 4774 bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u)); 4775 bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); 4776 4777 if (instr->dest.ssa.bit_size > 32) { 4778 dest.type = BRW_REGISTER_TYPE_UQ; 4779 } else { 4780 dest.type = BRW_REGISTER_TYPE_UD; 4781 } 4782 bld.MOV(dest, flag); 4783 break; 4784 } 4785 4786 case nir_intrinsic_read_invocation: { 4787 const fs_reg value = get_nir_src(instr->src[0]); 4788 const fs_reg invocation = get_nir_src(instr->src[1]); 4789 fs_reg tmp = bld.vgrf(value.type); 4790 4791 bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value, 4792 bld.emit_uniformize(invocation)); 4793 4794 bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0))); 4795 break; 4796 } 4797 4798 case nir_intrinsic_read_first_invocation: { 4799 const fs_reg value = get_nir_src(instr->src[0]); 4800 bld.MOV(retype(dest, value.type), bld.emit_uniformize(value)); 4801 break; 4802 } 4803 4804 case nir_intrinsic_shuffle: { 4805 const fs_reg value = get_nir_src(instr->src[0]); 4806 const fs_reg index = get_nir_src(instr->src[1]); 4807 4808 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index); 4809 break; 4810 } 4811 4812 case nir_intrinsic_first_invocation: { 4813 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4814 bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp); 4815 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 4816 fs_reg(component(tmp, 0))); 4817 break; 4818 } 4819 4820 case nir_intrinsic_quad_broadcast: { 4821 const fs_reg value = get_nir_src(instr->src[0]); 4822 const unsigned index = nir_src_as_uint(instr->src[1]); 4823 4824 bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type), 4825 value, brw_imm_ud(index), brw_imm_ud(4)); 4826 break; 4827 } 4828 4829 case nir_intrinsic_quad_swap_horizontal: { 4830 const fs_reg value = get_nir_src(instr->src[0]); 4831 const fs_reg tmp = bld.vgrf(value.type); 4832 if (devinfo->gen <= 7) { 4833 /* The hardware doesn't seem to support these crazy regions with 4834 * compressed instructions on gen7 and earlier so we fall back to 4835 * using quad swizzles. Fortunately, we don't support 64-bit 4836 * anything in Vulkan on gen7. 
4837 */ 4838 assert(nir_src_bit_size(instr->src[0]) == 32); 4839 const fs_builder ubld = bld.exec_all(); 4840 ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 4841 brw_imm_ud(BRW_SWIZZLE4(1,0,3,2))); 4842 bld.MOV(retype(dest, value.type), tmp); 4843 } else { 4844 const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0); 4845 4846 const fs_reg src_left = horiz_stride(value, 2); 4847 const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2); 4848 const fs_reg tmp_left = horiz_stride(tmp, 2); 4849 const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2); 4850 4851 ubld.MOV(tmp_left, src_right); 4852 ubld.MOV(tmp_right, src_left); 4853 4854 } 4855 bld.MOV(retype(dest, value.type), tmp); 4856 break; 4857 } 4858 4859 case nir_intrinsic_quad_swap_vertical: { 4860 const fs_reg value = get_nir_src(instr->src[0]); 4861 if (nir_src_bit_size(instr->src[0]) == 32) { 4862 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ 4863 const fs_reg tmp = bld.vgrf(value.type); 4864 const fs_builder ubld = bld.exec_all(); 4865 ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 4866 brw_imm_ud(BRW_SWIZZLE4(2,3,0,1))); 4867 bld.MOV(retype(dest, value.type), tmp); 4868 } else { 4869 /* For larger data types, we have to either emit dispatch_width many 4870 * MOVs or else fall back to doing indirects. 4871 */ 4872 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 4873 bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 4874 brw_imm_w(0x2)); 4875 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); 4876 } 4877 break; 4878 } 4879 4880 case nir_intrinsic_quad_swap_diagonal: { 4881 const fs_reg value = get_nir_src(instr->src[0]); 4882 if (nir_src_bit_size(instr->src[0]) == 32) { 4883 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ 4884 const fs_reg tmp = bld.vgrf(value.type); 4885 const fs_builder ubld = bld.exec_all(); 4886 ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 4887 brw_imm_ud(BRW_SWIZZLE4(3,2,1,0))); 4888 bld.MOV(retype(dest, value.type), tmp); 4889 } else { 4890 /* For larger data types, we have to either emit dispatch_width many 4891 * MOVs or else fall back to doing indirects. 4892 */ 4893 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 4894 bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 4895 brw_imm_w(0x3)); 4896 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); 4897 } 4898 break; 4899 } 4900 4901 case nir_intrinsic_reduce: { 4902 fs_reg src = get_nir_src(instr->src[0]); 4903 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); 4904 unsigned cluster_size = nir_intrinsic_cluster_size(instr); 4905 if (cluster_size == 0 || cluster_size > dispatch_width) 4906 cluster_size = dispatch_width; 4907 4908 /* Figure out the source type */ 4909 src.type = brw_type_for_nir_type(devinfo, 4910 (nir_alu_type)(nir_op_infos[redop].input_types[0] | 4911 nir_src_bit_size(instr->src[0]))); 4912 4913 fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); 4914 opcode brw_op = brw_op_for_nir_reduction_op(redop); 4915 brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); 4916 4917 /* Set up a register for all of our scratching around and initialize it 4918 * to reduction operation's identity value. 
4919 */ 4920 fs_reg scan = bld.vgrf(src.type); 4921 bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); 4922 4923 bld.emit_scan(brw_op, scan, cluster_size, cond_mod); 4924 4925 dest.type = src.type; 4926 if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) { 4927 /* In this case, CLUSTER_BROADCAST instruction isn't needed because 4928 * the distance between clusters is at least 2 GRFs. In this case, 4929 * we don't need the weird striding of the CLUSTER_BROADCAST 4930 * instruction and can just do regular MOVs. 4931 */ 4932 assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0); 4933 const unsigned groups = 4934 (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2); 4935 const unsigned group_size = dispatch_width / groups; 4936 for (unsigned i = 0; i < groups; i++) { 4937 const unsigned cluster = (i * group_size) / cluster_size; 4938 const unsigned comp = cluster * cluster_size + (cluster_size - 1); 4939 bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size), 4940 component(scan, comp)); 4941 } 4942 } else { 4943 bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan, 4944 brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size)); 4945 } 4946 break; 4947 } 4948 4949 case nir_intrinsic_inclusive_scan: 4950 case nir_intrinsic_exclusive_scan: { 4951 fs_reg src = get_nir_src(instr->src[0]); 4952 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); 4953 4954 /* Figure out the source type */ 4955 src.type = brw_type_for_nir_type(devinfo, 4956 (nir_alu_type)(nir_op_infos[redop].input_types[0] | 4957 nir_src_bit_size(instr->src[0]))); 4958 4959 fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); 4960 opcode brw_op = brw_op_for_nir_reduction_op(redop); 4961 brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); 4962 4963 /* Set up a register for all of our scratching around and initialize it 4964 * to reduction operation's identity value. 4965 */ 4966 fs_reg scan = bld.vgrf(src.type); 4967 const fs_builder allbld = bld.exec_all(); 4968 allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); 4969 4970 if (instr->intrinsic == nir_intrinsic_exclusive_scan) { 4971 /* Exclusive scan is a bit harder because we have to do an annoying 4972 * shift of the contents before we can begin. To make things worse, 4973 * we can't do this with a normal stride; we have to use indirects. 4974 */ 4975 fs_reg shifted = bld.vgrf(src.type); 4976 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 4977 allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 4978 brw_imm_w(-1)); 4979 allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx); 4980 allbld.group(1, 0).MOV(component(shifted, 0), identity); 4981 scan = shifted; 4982 } 4983 4984 bld.emit_scan(brw_op, scan, dispatch_width, cond_mod); 4985 4986 bld.MOV(retype(dest, src.type), scan); 4987 break; 4988 } 4989 4990 case nir_intrinsic_begin_invocation_interlock: { 4991 const fs_builder ubld = bld.group(8, 0); 4992 const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); 4993 4994 ubld.emit(SHADER_OPCODE_INTERLOCK, tmp, brw_vec8_grf(0, 0)) 4995 ->size_written = 2 * REG_SIZE; 4996 break; 4997 } 4998 4999 case nir_intrinsic_end_invocation_interlock: { 5000 /* For endInvocationInterlock(), we need to insert a memory fence which 5001 * stalls in the shader until the memory transactions prior to that 5002 * fence are complete. This ensures that the shader does not end before 5003 * any writes from its critical section have landed. 
Otherwise, you can 5004 * end up with a case where the next invocation on that pixel properly 5005 * stalls for previous FS invocation on its pixel to complete but 5006 * doesn't actually wait for the dataport memory transactions from that 5007 * thread to land before submitting its own. 5008 */ 5009 const fs_builder ubld = bld.group(8, 0); 5010 const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); 5011 ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, 5012 brw_vec8_grf(0, 0), brw_imm_ud(1)) 5013 ->size_written = 2 * REG_SIZE; 5014 break; 5015 } 5016 5017 default: 5018 unreachable("unknown intrinsic"); 5019 } 5020} 5021 5022void 5023fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, 5024 int op, nir_intrinsic_instr *instr) 5025{ 5026 if (stage == MESA_SHADER_FRAGMENT) 5027 brw_wm_prog_data(prog_data)->has_side_effects = true; 5028 5029 /* The BTI untyped atomic messages only support 32-bit atomics. If you 5030 * just look at the big table of messages in the Vol 7 of the SKL PRM, they 5031 * appear to exist. However, if you look at Vol 2a, there are no message 5032 * descriptors provided for Qword atomic ops except for A64 messages. 5033 */ 5034 assert(nir_dest_bit_size(instr->dest) == 32); 5035 5036 fs_reg dest; 5037 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 5038 dest = get_nir_dest(instr->dest); 5039 5040 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5041 srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr); 5042 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 5043 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5044 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 5045 5046 fs_reg data; 5047 if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 5048 data = get_nir_src(instr->src[2]); 5049 5050 if (op == BRW_AOP_CMPWR) { 5051 fs_reg tmp = bld.vgrf(data.type, 2); 5052 fs_reg sources[2] = { data, get_nir_src(instr->src[3]) }; 5053 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 5054 data = tmp; 5055 } 5056 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5057 5058 /* Emit the actual atomic operation */ 5059 5060 bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, 5061 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5062} 5063 5064void 5065fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld, 5066 int op, nir_intrinsic_instr *instr) 5067{ 5068 if (stage == MESA_SHADER_FRAGMENT) 5069 brw_wm_prog_data(prog_data)->has_side_effects = true; 5070 5071 fs_reg dest; 5072 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 5073 dest = get_nir_dest(instr->dest); 5074 5075 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5076 srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr); 5077 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 5078 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5079 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 5080 5081 fs_reg data = get_nir_src(instr->src[2]); 5082 if (op == BRW_AOP_FCMPWR) { 5083 fs_reg tmp = bld.vgrf(data.type, 2); 5084 fs_reg sources[2] = { data, get_nir_src(instr->src[3]) }; 5085 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 5086 data = tmp; 5087 } 5088 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5089 5090 /* Emit the actual atomic operation */ 5091 5092 bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL, 5093 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5094} 5095 5096void 5097fs_visitor::nir_emit_shared_atomic(const fs_builder &bld, 5098 int op, nir_intrinsic_instr *instr) 5099{ 5100 fs_reg dest; 5101 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 5102 dest = get_nir_dest(instr->dest); 5103 
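   /* Shared (SLM) atomics always target the GEN7_BTI_SLM binding table
    * entry; INC, DEC and PREDEC take no data source, so the payload below
    * is only assembled for the other operations.
    */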

void
fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
                                   int op, nir_intrinsic_instr *instr)
{
   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
   srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);

   fs_reg data;
   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
      data = get_nir_src(instr->src[1]);
   if (op == BRW_AOP_CMPWR) {
      fs_reg tmp = bld.vgrf(data.type, 2);
      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
      data = tmp;
   }
   srcs[SURFACE_LOGICAL_SRC_DATA] = data;

   /* Get the offset */
   if (nir_src_is_const(instr->src[0])) {
      srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
         brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
   } else {
      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
      bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
              retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(instr->const_index[0]));
   }

   /* Emit the actual atomic operation */

   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
}

void
fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
                                         int op, nir_intrinsic_instr *instr)
{
   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
   srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GEN7_BTI_SLM);
   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);

   fs_reg data = get_nir_src(instr->src[1]);
   if (op == BRW_AOP_FCMPWR) {
      fs_reg tmp = bld.vgrf(data.type, 2);
      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
      data = tmp;
   }
   srcs[SURFACE_LOGICAL_SRC_DATA] = data;

   /* Get the offset */
   if (nir_src_is_const(instr->src[0])) {
      srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
         brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
   } else {
      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
      bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
              retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(instr->const_index[0]));
   }

   /* Emit the actual atomic operation */

   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
}
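
/* The address computed above is a byte offset into shared local memory
 * (binding table entry GEN7_BTI_SLM): const_index[0] holds the constant base
 * of the shared variable and src[0] the per-invocation offset.  For example,
 * an atomicAdd() on "shared uint counts[64]" indexed by a non-constant value
 * would typically reach this code with a constant base and a dynamic src[0],
 * so the ADD path builds the final address at run time; when the whole offset
 * is known at compile time it folds into a single immediate instead.
 */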

void
fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
                                   int op, nir_intrinsic_instr *instr)
{
   if (stage == MESA_SHADER_FRAGMENT)
      brw_wm_prog_data(prog_data)->has_side_effects = true;

   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   fs_reg addr = get_nir_src(instr->src[0]);

   fs_reg data;
   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
      data = get_nir_src(instr->src[1]);

   if (op == BRW_AOP_CMPWR) {
      fs_reg tmp = bld.vgrf(data.type, 2);
      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
      data = tmp;
   }

   if (nir_dest_bit_size(instr->dest) == 64) {
      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL,
               dest, addr, data, brw_imm_ud(op));
   } else {
      assert(nir_dest_bit_size(instr->dest) == 32);
      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
               dest, addr, data, brw_imm_ud(op));
   }
}

void
fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
                                         int op, nir_intrinsic_instr *instr)
{
   if (stage == MESA_SHADER_FRAGMENT)
      brw_wm_prog_data(prog_data)->has_side_effects = true;

   assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
   fs_reg dest = get_nir_dest(instr->dest);

   fs_reg addr = get_nir_src(instr->src[0]);

   assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC);
   fs_reg data = get_nir_src(instr->src[1]);

   if (op == BRW_AOP_FCMPWR) {
      fs_reg tmp = bld.vgrf(data.type, 2);
      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
      data = tmp;
   }

   bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
            dest, addr, data, brw_imm_ud(op));
}
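
/* Unlike the BTI paths above, these global atomics take a full 64-bit A64
 * virtual address in src[0] rather than a surface index plus offset, and the
 * A64 (stateless) message set does provide a Qword variant.  That is why
 * 64-bit destinations are handled here while nir_emit_ssbo_atomic() asserts
 * a 32-bit destination.
 */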

void
fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
{
   unsigned texture = instr->texture_index;
   unsigned sampler = instr->sampler_index;

   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];

   srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
   srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);

   int lod_components = 0;

   /* The hardware requires a LOD for buffer textures */
   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
      srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);

   uint32_t header_bits = 0;
   for (unsigned i = 0; i < instr->num_srcs; i++) {
      fs_reg src = get_nir_src(instr->src[i].src);
      switch (instr->src[i].src_type) {
      case nir_tex_src_bias:
         srcs[TEX_LOGICAL_SRC_LOD] =
            retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_comparator:
         srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_coord:
         switch (instr->op) {
         case nir_texop_txf:
         case nir_texop_txf_ms:
         case nir_texop_txf_ms_mcs:
         case nir_texop_samples_identical:
            srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
            break;
         default:
            srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
            break;
         }
         break;
      case nir_tex_src_ddx:
         srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
         lod_components = nir_tex_instr_src_size(instr, i);
         break;
      case nir_tex_src_ddy:
         srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_lod:
         switch (instr->op) {
         case nir_texop_txs:
            srcs[TEX_LOGICAL_SRC_LOD] =
               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
            break;
         case nir_texop_txf:
            srcs[TEX_LOGICAL_SRC_LOD] =
               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
            break;
         default:
            srcs[TEX_LOGICAL_SRC_LOD] =
               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
            break;
         }
         break;
      case nir_tex_src_min_lod:
         srcs[TEX_LOGICAL_SRC_MIN_LOD] =
            retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
         break;
      case nir_tex_src_ms_index:
         srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
         break;

      case nir_tex_src_offset: {
         uint32_t offset_bits = 0;
         if (brw_texture_offset(instr, i, &offset_bits)) {
            header_bits |= offset_bits;
         } else {
            srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
               retype(src, BRW_REGISTER_TYPE_D);
         }
         break;
      }

      case nir_tex_src_projector:
         unreachable("should be lowered");

      case nir_tex_src_texture_offset: {
         /* Emit code to evaluate the actual indexing expression */
         fs_reg tmp = vgrf(glsl_type::uint_type);
         bld.ADD(tmp, src, brw_imm_ud(texture));
         srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
         break;
      }

      case nir_tex_src_sampler_offset: {
         /* Emit code to evaluate the actual indexing expression */
         fs_reg tmp = vgrf(glsl_type::uint_type);
         bld.ADD(tmp, src, brw_imm_ud(sampler));
         srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
         break;
      }

      case nir_tex_src_texture_handle:
         assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
         srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg();
         srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src);
         break;

      case nir_tex_src_sampler_handle:
         assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
         srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg();
         srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src);
         break;

      case nir_tex_src_ms_mcs:
         assert(instr->op == nir_texop_txf_ms);
         srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
         break;

      case nir_tex_src_plane: {
         const uint32_t plane = nir_src_as_uint(instr->src[i].src);
         const uint32_t texture_index =
            instr->texture_index +
            stage_prog_data->binding_table.plane_start[plane] -
            stage_prog_data->binding_table.texture_start;

         srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
         break;
      }

      default:
         unreachable("unknown texture source");
      }
   }

   if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
       (instr->op == nir_texop_txf_ms ||
        instr->op == nir_texop_samples_identical)) {
      if (devinfo->gen >= 7 &&
          key_tex->compressed_multisample_layout_mask & (1 << texture)) {
         srcs[TEX_LOGICAL_SRC_MCS] =
            emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
                           instr->coord_components,
                           srcs[TEX_LOGICAL_SRC_SURFACE],
                           srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]);
      } else {
         srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
      }
   }

   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
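
   /* When fetching from a compressed multisample surface, the sampler needs
    * the multisample control surface data alongside the texel fetch, so an
    * MCS fetch is emitted above whenever NIR didn't already supply one.  The
    * zero immediate used in the non-compressed case is what the
    * samples_identical handling below relies on to mean "no MCS".
    */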

   enum opcode opcode;
   switch (instr->op) {
   case nir_texop_tex:
      opcode = SHADER_OPCODE_TEX_LOGICAL;
      break;
   case nir_texop_txb:
      opcode = FS_OPCODE_TXB_LOGICAL;
      break;
   case nir_texop_txl:
      opcode = SHADER_OPCODE_TXL_LOGICAL;
      break;
   case nir_texop_txd:
      opcode = SHADER_OPCODE_TXD_LOGICAL;
      break;
   case nir_texop_txf:
      opcode = SHADER_OPCODE_TXF_LOGICAL;
      break;
   case nir_texop_txf_ms:
      if ((key_tex->msaa_16 & (1 << sampler)))
         opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
      else
         opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
      break;
   case nir_texop_txf_ms_mcs:
      opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
      break;
   case nir_texop_query_levels:
   case nir_texop_txs:
      opcode = SHADER_OPCODE_TXS_LOGICAL;
      break;
   case nir_texop_lod:
      opcode = SHADER_OPCODE_LOD_LOGICAL;
      break;
   case nir_texop_tg4:
      if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
         opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
      else
         opcode = SHADER_OPCODE_TG4_LOGICAL;
      break;
   case nir_texop_texture_samples:
      opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
      break;
   case nir_texop_samples_identical: {
      fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);

      /* If mcs is an immediate value, it means there is no MCS.  In that case
       * just return false.
       */
      if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
         bld.MOV(dst, brw_imm_ud(0u));
      } else if ((key_tex->msaa_16 & (1 << sampler))) {
         fs_reg tmp = vgrf(glsl_type::uint_type);
         bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
                offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
         bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
      } else {
         bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
                 BRW_CONDITIONAL_EQ);
      }
      return;
   }
   default:
      unreachable("unknown texture opcode");
   }

   if (instr->op == nir_texop_tg4) {
      if (instr->component == 1 &&
          key_tex->gather_channel_quirk_mask & (1 << texture)) {
         /* gather4 sampler is broken for green channel on RG32F --
          * we must ask for blue instead.
          */
         header_bits |= 2 << 16;
      } else {
         header_bits |= instr->component << 16;
      }
   }

   fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
   fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
   inst->offset = header_bits;

   const unsigned dest_size = nir_tex_instr_dest_size(instr);
   if (devinfo->gen >= 9 &&
       instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
      unsigned write_mask = instr->dest.is_ssa ?
                            nir_ssa_def_components_read(&instr->dest.ssa) :
                            (1 << dest_size) - 1;
      assert(write_mask != 0); /* dead code should have been eliminated */
      inst->size_written = util_last_bit(write_mask) *
                           inst->dst.component_size(inst->exec_size);
   } else {
      inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
   }

   if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
      inst->shadow_compare = true;

   if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
      emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);

   fs_reg nir_dest[4];
   for (unsigned i = 0; i < dest_size; i++)
      nir_dest[i] = offset(dst, bld, i);

   if (instr->op == nir_texop_query_levels) {
      /* # levels is in .w */
      nir_dest[0] = offset(dst, bld, 3);
   } else if (instr->op == nir_texop_txs &&
              dest_size >= 3 && devinfo->gen < 7) {
      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
      fs_reg depth = offset(dst, bld, 2);
      nir_dest[2] = vgrf(glsl_type::int_type);
      bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
   }

   bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
}
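
/* On Gen9+ the sampler response can be shortened, so size_written above is
 * trimmed to the highest channel NIR actually reads (e.g. a texture() result
 * that only feeds a .xy swizzle needs just two response channels); older
 * platforms, tg4, and query_levels always get the full four-component
 * response.
 */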

void
fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
{
   switch (instr->type) {
   case nir_jump_break:
      bld.emit(BRW_OPCODE_BREAK);
      break;
   case nir_jump_continue:
      bld.emit(BRW_OPCODE_CONTINUE);
      break;
   case nir_jump_return:
   default:
      unreachable("unknown jump");
   }
}

/*
 * This helper takes a source register and un/shuffles it into the destination
 * register.
 *
 * If the source type size is smaller than the destination type size, the
 * operation needed is a component shuffle.  The opposite case is an
 * unshuffle.  If the source and destination type sizes are equal, a shuffle
 * is done that is equivalent to a simple MOV.
 *
 * For example, if the source is a 16-bit type and the destination is 32-bit,
 * a 3-component .xyz 16-bit vector in SIMD8 would look like:
 *
 *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
 *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
 *
 * This helper will return the following 2 32-bit components with the 16-bit
 * values shuffled:
 *
 *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
 *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
 *
 * For unshuffle, the example would be the opposite: a 64-bit type source
 * and a 32-bit destination.  A 2-component .xy 64-bit vector in SIMD8
 * would be:
 *
 *    | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
 *    | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
 *    | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
 *    | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
 *
 * The returned result would be the following 4 32-bit components unshuffled:
 *
 *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
 *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
 *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
 *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
 *
 * - Source and destination registers must not overlap.
 * - Components are measured in units of the smaller type between source and
 *   destination because we are un/shuffling the smaller components from/into
 *   the bigger ones.
 * - The first_component parameter allows skipping source components.
 */
void
shuffle_src_to_dst(const fs_builder &bld,
                   const fs_reg &dst,
                   const fs_reg &src,
                   uint32_t first_component,
                   uint32_t components)
{
   if (type_sz(src.type) == type_sz(dst.type)) {
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() * components,
         offset(src, bld, first_component),
         type_sz(src.type) * bld.dispatch_width() * components));
      for (unsigned i = 0; i < components; i++) {
         bld.MOV(retype(offset(dst, bld, i), src.type),
                 offset(src, bld, i + first_component));
      }
   } else if (type_sz(src.type) < type_sz(dst.type)) {
      /* Source is shuffled into destination */
      unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() *
         DIV_ROUND_UP(components, size_ratio),
         offset(src, bld, first_component),
         type_sz(src.type) * bld.dispatch_width() * components));

      brw_reg_type shuffle_type =
         brw_reg_type_from_bit_size(8 * type_sz(src.type),
                                    BRW_REGISTER_TYPE_D);
      for (unsigned i = 0; i < components; i++) {
         fs_reg shuffle_component_i =
            subscript(offset(dst, bld, i / size_ratio),
                      shuffle_type, i % size_ratio);
         bld.MOV(shuffle_component_i,
                 retype(offset(src, bld, i + first_component), shuffle_type));
      }
   } else {
      /* Source is unshuffled into destination */
      unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() * components,
         offset(src, bld, first_component / size_ratio),
         type_sz(src.type) * bld.dispatch_width() *
         DIV_ROUND_UP(components + (first_component % size_ratio),
                      size_ratio)));

      brw_reg_type shuffle_type =
         brw_reg_type_from_bit_size(8 * type_sz(dst.type),
                                    BRW_REGISTER_TYPE_D);
      for (unsigned i = 0; i < components; i++) {
         fs_reg shuffle_component_i =
            subscript(offset(src, bld, (first_component + i) / size_ratio),
                      shuffle_type, (first_component + i) % size_ratio);
         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
                 shuffle_component_i);
      }
   }
}
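
/* A worked example of the index arithmetic above: unshuffling a 64-bit
 * source into a 32-bit destination gives size_ratio = 2, so destination
 * component i is read from 32-bit half (first_component + i) % 2 of 64-bit
 * source component (first_component + i) / 2.  With first_component = 0 and
 * components = 4, that yields dst0 = x.lo, dst1 = x.hi, dst2 = y.lo,
 * dst3 = y.hi, matching the unshuffle diagram in the comment above.
 */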

void
shuffle_from_32bit_read(const fs_builder &bld,
                        const fs_reg &dst,
                        const fs_reg &src,
                        uint32_t first_component,
                        uint32_t components)
{
   assert(type_sz(src.type) == 4);

   /* This function takes components in units of the destination type while
    * shuffle_src_to_dst takes components in units of the smallest type.
    */
   if (type_sz(dst.type) > 4) {
      assert(type_sz(dst.type) == 8);
      first_component *= 2;
      components *= 2;
   }

   shuffle_src_to_dst(bld, dst, src, first_component, components);
}

fs_reg
shuffle_for_32bit_write(const fs_builder &bld,
                        const fs_reg &src,
                        uint32_t first_component,
                        uint32_t components)
{
   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
                         DIV_ROUND_UP(components * type_sz(src.type), 4));
   /* This function takes components in units of the source type while
    * shuffle_src_to_dst takes components in units of the smallest type.
    */
   if (type_sz(src.type) > 4) {
      assert(type_sz(src.type) == 8);
      first_component *= 2;
      components *= 2;
   }

   shuffle_src_to_dst(bld, dst, src, first_component, components);

   return dst;
}

fs_reg
setup_imm_df(const fs_builder &bld, double v)
{
   const struct gen_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->gen >= 7);

   if (devinfo->gen >= 8)
      return brw_imm_df(v);

   /* gen7.5 does not support DF immediates straightforwardly, but the DIM
    * instruction allows us to set the 64-bit immediate value.
    */
   if (devinfo->is_haswell) {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
      ubld.DIM(dst, brw_imm_df(v));
      return component(dst, 0);
   }

   /* gen7 does not support DF immediates, so we generate a 64-bit constant by
    * writing the low 32 bits of the constant to suboffset 0 of a VGRF and
    * the high 32 bits to suboffset 4 and then applying a stride of 0.
    *
    * Alternatively, we could produce a normal VGRF (without stride 0) by
    * writing to all the channels in the VGRF.  However, that would hit the
    * gen7 bug where we have to split writes that span more than 1 register
    * into instructions with a width of 4 (otherwise the write to the second
    * register written runs into an execmask hardware bug), which isn't very
    * nice.
    */
   union {
      double d;
      struct {
         uint32_t i1;
         uint32_t i2;
      };
   } di;

   di.d = v;

   const fs_builder ubld = bld.exec_all().group(1, 0);
   const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   ubld.MOV(tmp, brw_imm_ud(di.i1));
   ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));

   return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
}

fs_reg
setup_imm_b(const fs_builder &bld, int8_t v)
{
   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
   bld.MOV(tmp, brw_imm_w(v));
   return tmp;
}

fs_reg
setup_imm_ub(const fs_builder &bld, uint8_t v)
{
   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
   bld.MOV(tmp, brw_imm_uw(v));
   return tmp;
}
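
/* setup_imm_b()/setup_imm_ub() exist because the EU has no byte-sized
 * immediate form: the 8-bit value is carried in a W/UW immediate and moved
 * into a byte-typed VGRF that later instructions read as a regular B/UB
 * operand.  Similarly, in the gen7 path of setup_imm_df(), the union splits
 * the double into low and high dwords on a little-endian host; e.g. 1.0
 * (0x3ff0000000000000) becomes i1 = 0x00000000 and i2 = 0x3ff00000, and the
 * final retype to DF with a stride of 0 broadcasts the 64-bit constant to
 * every channel.
 */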