brw_fs_nir.cpp revision 01e04c3f
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "compiler/glsl/ir.h" 25#include "brw_fs.h" 26#include "brw_fs_surface_builder.h" 27#include "brw_nir.h" 28#include "util/u_math.h" 29 30using namespace brw; 31using namespace brw::surface_access; 32 33void 34fs_visitor::emit_nir_code() 35{ 36 /* emit the arrays used for inputs and outputs - load/store intrinsics will 37 * be converted to reads/writes of these arrays 38 */ 39 nir_setup_outputs(); 40 nir_setup_uniforms(); 41 nir_emit_system_values(); 42 43 nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir)); 44} 45 46void 47fs_visitor::nir_setup_outputs() 48{ 49 if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT) 50 return; 51 52 unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, }; 53 54 /* Calculate the size of output registers in a separate pass, before 55 * allocating them. With ARB_enhanced_layouts, multiple output variables 56 * may occupy the same slot, but have different type sizes. 57 */ 58 nir_foreach_variable(var, &nir->outputs) { 59 const int loc = var->data.driver_location; 60 const unsigned var_vec4s = 61 var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4) 62 : type_size_vec4(var->type); 63 vec4s[loc] = MAX2(vec4s[loc], var_vec4s); 64 } 65 66 for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) { 67 if (vec4s[loc] == 0) { 68 loc++; 69 continue; 70 } 71 72 unsigned reg_size = vec4s[loc]; 73 74 /* Check if there are any ranges that start within this range and extend 75 * past it. If so, include them in this allocation. 76 */ 77 for (unsigned i = 1; i < reg_size; i++) 78 reg_size = MAX2(vec4s[i + loc] + i, reg_size); 79 80 fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size); 81 for (unsigned i = 0; i < reg_size; i++) 82 outputs[loc + i] = offset(reg, bld, 4 * i); 83 84 loc += reg_size; 85 } 86} 87 88void 89fs_visitor::nir_setup_uniforms() 90{ 91 /* Only the first compile gets to set up uniforms. */ 92 if (push_constant_loc) { 93 assert(pull_constant_loc); 94 return; 95 } 96 97 uniforms = nir->num_uniforms / 4; 98 99 if (stage == MESA_SHADER_COMPUTE) { 100 /* Add a uniform for the thread local id. It must be the last uniform 101 * on the list. 
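       * For example, if the shader already has N uniform slots (so that
       * uniforms == nr_params == N at this point), the subgroup id is added
       * as uniform index N and nr_params becomes N + 1.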
102 */ 103 assert(uniforms == prog_data->nr_params); 104 uint32_t *param = brw_stage_prog_data_add_params(prog_data, 1); 105 *param = BRW_PARAM_BUILTIN_SUBGROUP_ID; 106 subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD); 107 } 108} 109 110static bool 111emit_system_values_block(nir_block *block, fs_visitor *v) 112{ 113 fs_reg *reg; 114 115 nir_foreach_instr(instr, block) { 116 if (instr->type != nir_instr_type_intrinsic) 117 continue; 118 119 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 120 switch (intrin->intrinsic) { 121 case nir_intrinsic_load_vertex_id: 122 case nir_intrinsic_load_base_vertex: 123 unreachable("should be lowered by nir_lower_system_values()."); 124 125 case nir_intrinsic_load_vertex_id_zero_base: 126 case nir_intrinsic_load_is_indexed_draw: 127 case nir_intrinsic_load_first_vertex: 128 case nir_intrinsic_load_instance_id: 129 case nir_intrinsic_load_base_instance: 130 case nir_intrinsic_load_draw_id: 131 unreachable("should be lowered by brw_nir_lower_vs_inputs()."); 132 133 case nir_intrinsic_load_invocation_id: 134 if (v->stage == MESA_SHADER_TESS_CTRL) 135 break; 136 assert(v->stage == MESA_SHADER_GEOMETRY); 137 reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; 138 if (reg->file == BAD_FILE) { 139 const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL); 140 fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 141 fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); 142 abld.SHR(iid, g1, brw_imm_ud(27u)); 143 *reg = iid; 144 } 145 break; 146 147 case nir_intrinsic_load_sample_pos: 148 assert(v->stage == MESA_SHADER_FRAGMENT); 149 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; 150 if (reg->file == BAD_FILE) 151 *reg = *v->emit_samplepos_setup(); 152 break; 153 154 case nir_intrinsic_load_sample_id: 155 assert(v->stage == MESA_SHADER_FRAGMENT); 156 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; 157 if (reg->file == BAD_FILE) 158 *reg = *v->emit_sampleid_setup(); 159 break; 160 161 case nir_intrinsic_load_sample_mask_in: 162 assert(v->stage == MESA_SHADER_FRAGMENT); 163 assert(v->devinfo->gen >= 7); 164 reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN]; 165 if (reg->file == BAD_FILE) 166 *reg = *v->emit_samplemaskin_setup(); 167 break; 168 169 case nir_intrinsic_load_work_group_id: 170 assert(v->stage == MESA_SHADER_COMPUTE); 171 reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID]; 172 if (reg->file == BAD_FILE) 173 *reg = *v->emit_cs_work_group_id_setup(); 174 break; 175 176 case nir_intrinsic_load_helper_invocation: 177 assert(v->stage == MESA_SHADER_FRAGMENT); 178 reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION]; 179 if (reg->file == BAD_FILE) { 180 const fs_builder abld = 181 v->bld.annotate("gl_HelperInvocation", NULL); 182 183 /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the 184 * pixel mask is in g1.7 of the thread payload. 185 * 186 * We move the per-channel pixel enable bit to the low bit of each 187 * channel by shifting the byte containing the pixel mask by the 188 * vector immediate 0x76543210UV. 189 * 190 * The region of <1,8,0> reads only 1 byte (the pixel masks for 191 * subspans 0 and 1) in SIMD8 and an additional byte (the pixel 192 * masks for 2 and 3) in SIMD16. 
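             *
             * For example, if the pixel-mask byte for subspans 0 and 1 is
             * 0b10100101, the per-channel shifts 0..7 leave bit 0 set for
             * channels 0, 2, 5 and 7 and cleared for channels 1, 3, 4 and 6,
             * so bit 0 of each channel now holds that channel's enable bit.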
          */
         fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);

         for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) {
            const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i);
            hbld.SHR(offset(shifted, hbld, i),
                     stride(retype(brw_vec1_grf(1 + i, 7),
                                   BRW_REGISTER_TYPE_UB),
                            1, 8, 0),
                     brw_imm_v(0x76543210));
         }

         /* A set bit in the pixel mask means the channel is enabled, but
          * that is the opposite of gl_HelperInvocation so we need to invert
          * the mask.
          *
          * The negate source-modifier bit of logical instructions on Gen8+
          * performs 1's complement negation, so we can use that instead of
          * a NOT instruction.
          */
         fs_reg inverted = negate(shifted);
         if (v->devinfo->gen < 8) {
            inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
            abld.NOT(inverted, shifted);
         }

         /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
          * with 1 and negating.
          */
         fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         abld.AND(anded, inverted, brw_imm_uw(1));

         fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
         abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
         *reg = dst;
      }
      break;

   default:
      break;
      }
   }

   return true;
}

void
fs_visitor::nir_emit_system_values()
{
   nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
      nir_system_values[i] = fs_reg();
   }

   /* Always emit SUBGROUP_INVOCATION.  Dead code will clean it up if we
    * never end up using it.
    */
   {
      const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL);
      fs_reg &reg = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION];
      reg = abld.vgrf(BRW_REGISTER_TYPE_UW);

      const fs_builder allbld8 = abld.group(8, 0).exec_all();
      allbld8.MOV(reg, brw_imm_v(0x76543210));
      if (dispatch_width > 8)
         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
      if (dispatch_width > 16) {
         const fs_builder allbld16 = abld.group(16, 0).exec_all();
         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
      }
   }

   nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
   nir_foreach_block(block, impl)
      emit_system_values_block(block, this);
}

/*
 * Returns a type based on a reference_type (word, float, half-float) and a
 * given bit_size.
 *
 * Reference BRW_REGISTER_TYPEs are HF, F, DF, W, D, UW and UD.
 *
 * @FIXME: 64-bit return types are always DF on integer types to maintain
 * compatibility with uses of DF prior to the introduction of int64
 * support.
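 *
 * For example, a bit_size of 16 yields HF for a float reference type, but
 * W or UW for a signed or unsigned integer reference type.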
279 */ 280static brw_reg_type 281brw_reg_type_from_bit_size(const unsigned bit_size, 282 const brw_reg_type reference_type) 283{ 284 switch(reference_type) { 285 case BRW_REGISTER_TYPE_HF: 286 case BRW_REGISTER_TYPE_F: 287 case BRW_REGISTER_TYPE_DF: 288 switch(bit_size) { 289 case 16: 290 return BRW_REGISTER_TYPE_HF; 291 case 32: 292 return BRW_REGISTER_TYPE_F; 293 case 64: 294 return BRW_REGISTER_TYPE_DF; 295 default: 296 unreachable("Invalid bit size"); 297 } 298 case BRW_REGISTER_TYPE_B: 299 case BRW_REGISTER_TYPE_W: 300 case BRW_REGISTER_TYPE_D: 301 case BRW_REGISTER_TYPE_Q: 302 switch(bit_size) { 303 case 8: 304 return BRW_REGISTER_TYPE_B; 305 case 16: 306 return BRW_REGISTER_TYPE_W; 307 case 32: 308 return BRW_REGISTER_TYPE_D; 309 case 64: 310 return BRW_REGISTER_TYPE_Q; 311 default: 312 unreachable("Invalid bit size"); 313 } 314 case BRW_REGISTER_TYPE_UB: 315 case BRW_REGISTER_TYPE_UW: 316 case BRW_REGISTER_TYPE_UD: 317 case BRW_REGISTER_TYPE_UQ: 318 switch(bit_size) { 319 case 8: 320 return BRW_REGISTER_TYPE_UB; 321 case 16: 322 return BRW_REGISTER_TYPE_UW; 323 case 32: 324 return BRW_REGISTER_TYPE_UD; 325 case 64: 326 return BRW_REGISTER_TYPE_UQ; 327 default: 328 unreachable("Invalid bit size"); 329 } 330 default: 331 unreachable("Unknown type"); 332 } 333} 334 335void 336fs_visitor::nir_emit_impl(nir_function_impl *impl) 337{ 338 nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc); 339 for (unsigned i = 0; i < impl->reg_alloc; i++) { 340 nir_locals[i] = fs_reg(); 341 } 342 343 foreach_list_typed(nir_register, reg, node, &impl->registers) { 344 unsigned array_elems = 345 reg->num_array_elems == 0 ? 1 : reg->num_array_elems; 346 unsigned size = array_elems * reg->num_components; 347 const brw_reg_type reg_type = 348 brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F); 349 nir_locals[reg->index] = bld.vgrf(reg_type, size); 350 } 351 352 nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg, 353 impl->ssa_alloc); 354 355 nir_emit_cf_list(&impl->body); 356} 357 358void 359fs_visitor::nir_emit_cf_list(exec_list *list) 360{ 361 exec_list_validate(list); 362 foreach_list_typed(nir_cf_node, node, node, list) { 363 switch (node->type) { 364 case nir_cf_node_if: 365 nir_emit_if(nir_cf_node_as_if(node)); 366 break; 367 368 case nir_cf_node_loop: 369 nir_emit_loop(nir_cf_node_as_loop(node)); 370 break; 371 372 case nir_cf_node_block: 373 nir_emit_block(nir_cf_node_as_block(node)); 374 break; 375 376 default: 377 unreachable("Invalid CFG node block"); 378 } 379 } 380} 381 382void 383fs_visitor::nir_emit_if(nir_if *if_stmt) 384{ 385 /* first, put the condition into f0 */ 386 fs_inst *inst = bld.MOV(bld.null_reg_d(), 387 retype(get_nir_src(if_stmt->condition), 388 BRW_REGISTER_TYPE_D)); 389 inst->conditional_mod = BRW_CONDITIONAL_NZ; 390 391 bld.IF(BRW_PREDICATE_NORMAL); 392 393 nir_emit_cf_list(&if_stmt->then_list); 394 395 /* note: if the else is empty, dead CF elimination will remove it */ 396 bld.emit(BRW_OPCODE_ELSE); 397 398 nir_emit_cf_list(&if_stmt->else_list); 399 400 bld.emit(BRW_OPCODE_ENDIF); 401 402 if (devinfo->gen < 7) 403 limit_dispatch_width(16, "Non-uniform control flow unsupported " 404 "in SIMD32 mode."); 405} 406 407void 408fs_visitor::nir_emit_loop(nir_loop *loop) 409{ 410 bld.emit(BRW_OPCODE_DO); 411 412 nir_emit_cf_list(&loop->body); 413 414 bld.emit(BRW_OPCODE_WHILE); 415 416 if (devinfo->gen < 7) 417 limit_dispatch_width(16, "Non-uniform control flow unsupported " 418 "in SIMD32 mode."); 419} 420 421void 422fs_visitor::nir_emit_block(nir_block 
*block) 423{ 424 nir_foreach_instr(instr, block) { 425 nir_emit_instr(instr); 426 } 427} 428 429void 430fs_visitor::nir_emit_instr(nir_instr *instr) 431{ 432 const fs_builder abld = bld.annotate(NULL, instr); 433 434 switch (instr->type) { 435 case nir_instr_type_alu: 436 nir_emit_alu(abld, nir_instr_as_alu(instr)); 437 break; 438 439 case nir_instr_type_deref: 440 /* Derefs can exist for images but they do nothing */ 441 break; 442 443 case nir_instr_type_intrinsic: 444 switch (stage) { 445 case MESA_SHADER_VERTEX: 446 nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 447 break; 448 case MESA_SHADER_TESS_CTRL: 449 nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 450 break; 451 case MESA_SHADER_TESS_EVAL: 452 nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr)); 453 break; 454 case MESA_SHADER_GEOMETRY: 455 nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 456 break; 457 case MESA_SHADER_FRAGMENT: 458 nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 459 break; 460 case MESA_SHADER_COMPUTE: 461 nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr)); 462 break; 463 default: 464 unreachable("unsupported shader stage"); 465 } 466 break; 467 468 case nir_instr_type_tex: 469 nir_emit_texture(abld, nir_instr_as_tex(instr)); 470 break; 471 472 case nir_instr_type_load_const: 473 nir_emit_load_const(abld, nir_instr_as_load_const(instr)); 474 break; 475 476 case nir_instr_type_ssa_undef: 477 /* We create a new VGRF for undefs on every use (by handling 478 * them in get_nir_src()), rather than for each definition. 479 * This helps register coalescing eliminate MOVs from undef. 480 */ 481 break; 482 483 case nir_instr_type_jump: 484 nir_emit_jump(abld, nir_instr_as_jump(instr)); 485 break; 486 487 default: 488 unreachable("unknown instruction type"); 489 } 490} 491 492/** 493 * Recognizes a parent instruction of nir_op_extract_* and changes the type to 494 * match instr. 495 */ 496bool 497fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, 498 const fs_reg &result) 499{ 500 if (!instr->src[0].src.is_ssa || 501 !instr->src[0].src.ssa->parent_instr) 502 return false; 503 504 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) 505 return false; 506 507 nir_alu_instr *src0 = 508 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); 509 510 if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 && 511 src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) 512 return false; 513 514 nir_const_value *element = nir_src_as_const_value(src0->src[1].src); 515 assert(element != NULL); 516 517 /* Element type to extract.*/ 518 const brw_reg_type type = brw_int_type( 519 src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 
2 : 1, 520 src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8); 521 522 fs_reg op0 = get_nir_src(src0->src[0].src); 523 op0.type = brw_type_for_nir_type(devinfo, 524 (nir_alu_type)(nir_op_infos[src0->op].input_types[0] | 525 nir_src_bit_size(src0->src[0].src))); 526 op0 = offset(op0, bld, src0->src[0].swizzle[0]); 527 528 set_saturate(instr->dest.saturate, 529 bld.MOV(result, subscript(op0, type, element->u32[0]))); 530 return true; 531} 532 533bool 534fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, 535 const fs_reg &result) 536{ 537 if (!instr->src[0].src.is_ssa || 538 instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic) 539 return false; 540 541 nir_intrinsic_instr *src0 = 542 nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr); 543 544 if (src0->intrinsic != nir_intrinsic_load_front_face) 545 return false; 546 547 nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src); 548 if (!value1 || fabsf(value1->f32[0]) != 1.0f) 549 return false; 550 551 nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src); 552 if (!value2 || fabsf(value2->f32[0]) != 1.0f) 553 return false; 554 555 fs_reg tmp = vgrf(glsl_type::int_type); 556 557 if (devinfo->gen >= 6) { 558 /* Bit 15 of g0.0 is 0 if the polygon is front facing. */ 559 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); 560 561 /* For (gl_FrontFacing ? 1.0 : -1.0), emit: 562 * 563 * or(8) tmp.1<2>W g0.0<0,1,0>W 0x00003f80W 564 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D 565 * 566 * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0). 567 * 568 * This negation looks like it's safe in practice, because bits 0:4 will 569 * surely be TRIANGLES 570 */ 571 572 if (value1->f32[0] == -1.0f) { 573 g0.negate = true; 574 } 575 576 bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1), 577 g0, brw_imm_uw(0x3f80)); 578 } else { 579 /* Bit 31 of g1.6 is 0 if the polygon is front facing. */ 580 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D)); 581 582 /* For (gl_FrontFacing ? 1.0 : -1.0), emit: 583 * 584 * or(8) tmp<1>D g1.6<0,1,0>D 0x3f800000D 585 * and(8) dst<1>D tmp<8,8,1>D 0xbf800000D 586 * 587 * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0). 588 * 589 * This negation looks like it's safe in practice, because bits 0:4 will 590 * surely be TRIANGLES 591 */ 592 593 if (value1->f32[0] == -1.0f) { 594 g1_6.negate = true; 595 } 596 597 bld.OR(tmp, g1_6, brw_imm_d(0x3f800000)); 598 } 599 bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000)); 600 601 return true; 602} 603 604static void 605emit_find_msb_using_lzd(const fs_builder &bld, 606 const fs_reg &result, 607 const fs_reg &src, 608 bool is_signed) 609{ 610 fs_inst *inst; 611 fs_reg temp = src; 612 613 if (is_signed) { 614 /* LZD of an absolute value source almost always does the right 615 * thing. There are two problem values: 616 * 617 * * 0x80000000. Since abs(0x80000000) == 0x80000000, LZD returns 618 * 0. However, findMSB(int(0x80000000)) == 30. 619 * 620 * * 0xffffffff. Since abs(0xffffffff) == 1, LZD returns 621 * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 622 * 623 * For a value of zero or negative one, -1 will be returned. 624 * 625 * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but 626 * findMSB(-(1<<x)) should return x-1. 627 * 628 * For all negative number cases, including 0x80000000 and 629 * 0xffffffff, the correct value is obtained from LZD if instead of 630 * negating the (already negative) value the logical-not is used. 
       * A conditional logical-not can be achieved in two instructions.
       */
      temp = bld.vgrf(BRW_REGISTER_TYPE_D);

      bld.ASR(temp, src, brw_imm_d(31));
      bld.XOR(temp, temp, src);
   }

   bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
           retype(temp, BRW_REGISTER_TYPE_UD));

   /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
    * from the LSB side.  Subtract the result from 31 to convert the MSB
    * count into an LSB count.  If no bits are set, LZD will return 32.
    * 31-32 = -1, which is exactly what findMSB() is supposed to return.
    */
   inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
   inst->src[0].negate = true;
}

static brw_rnd_mode
brw_rnd_mode_from_nir_op(const nir_op op)
{
   switch (op) {
   case nir_op_f2f16_rtz:
      return BRW_RND_MODE_RTZ;
   case nir_op_f2f16_rtne:
      return BRW_RND_MODE_RTNE;
   default:
      unreachable("Operation doesn't support rounding mode");
   }
}

void
fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
{
   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
   fs_inst *inst;

   fs_reg result = get_nir_dest(instr->dest.dest);
   result.type = brw_type_for_nir_type(devinfo,
      (nir_alu_type)(nir_op_infos[instr->op].output_type |
                     nir_dest_bit_size(instr->dest.dest)));

   fs_reg op[4];
   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      op[i] = get_nir_src(instr->src[i].src);
      op[i].type = brw_type_for_nir_type(devinfo,
         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
                        nir_src_bit_size(instr->src[i].src)));
      op[i].abs = instr->src[i].abs;
      op[i].negate = instr->src[i].negate;
   }

   /* We get a bunch of MOVs out of the from_ssa pass and they may still
    * be vectorized.  We'll handle them as a special-case.  We'll also
    * handle vecN here because it's basically the same thing.
    */
   switch (instr->op) {
   case nir_op_imov:
   case nir_op_fmov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4: {
      fs_reg temp = result;
      bool need_extra_copy = false;
      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
         if (!instr->src[i].src.is_ssa &&
             instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
            need_extra_copy = true;
            temp = bld.vgrf(result.type, 4);
            break;
         }
      }

      for (unsigned i = 0; i < 4; i++) {
         if (!(instr->dest.write_mask & (1 << i)))
            continue;

         if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
            inst = bld.MOV(offset(temp, bld, i),
                           offset(op[0], bld, instr->src[0].swizzle[i]));
         } else {
            inst = bld.MOV(offset(temp, bld, i),
                           offset(op[i], bld, instr->src[i].swizzle[0]));
         }
         inst->saturate = instr->dest.saturate;
      }

      /* In this case the source and destination registers were the same,
       * so we need to insert an extra set of moves in order to deal with
       * any swizzling.
       */
      if (need_extra_copy) {
         for (unsigned i = 0; i < 4; i++) {
            if (!(instr->dest.write_mask & (1 << i)))
               continue;

            bld.MOV(offset(result, bld, i), offset(temp, bld, i));
         }
      }
      return;
   }
   default:
      break;
   }

   /* At this point, we have dealt with any instruction that operates on
    * more than a single channel.  Therefore, we can just adjust the source
    * and destination registers for that channel and emit the instruction.
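    *
    * For example, a write mask of 0x4 selects channel 2 (ffs(0x4) - 1), so
    * the destination and each source swizzle below are offset to that
    * single component.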
    */
   unsigned channel = 0;
   if (nir_op_infos[instr->op].output_size == 0) {
      /* Since NIR is doing the scalarizing for us, we should only ever see
       * vectorized operations with a single channel.
       */
      assert(util_bitcount(instr->dest.write_mask) == 1);
      channel = ffs(instr->dest.write_mask) - 1;

      result = offset(result, bld, channel);
   }

   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
   }

   switch (instr->op) {
   case nir_op_i2f32:
   case nir_op_u2f32:
      if (optimize_extract_to_float(instr, result))
         return;
      inst = bld.MOV(result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
      bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
               brw_imm_d(brw_rnd_mode_from_nir_op(instr->op)));
      /* fallthrough */

      /* In theory, it would be better to use BRW_OPCODE_F32TO16.  Depending
       * on the HW gen, it is a special hw opcode or just a MOV, and
       * brw_F32TO16 (at brw_eu_emit) would do the work to choose.
       *
       * But if we want to use that opcode, we would need to add support for
       * it in several optimizations and lowerings.  Since HF support is
       * currently limited to gen8+, it is better to emit a plain MOV here
       * and switch to BRW_OPCODE_F32TO16 if/when HF support is added for
       * gen7.
       */
   case nir_op_f2f16:
      inst = bld.MOV(result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_b2i:
   case nir_op_b2f:
      op[0].type = BRW_REGISTER_TYPE_D;
      op[0].negate = !op[0].negate;
      /* fallthrough */
   case nir_op_f2f64:
   case nir_op_f2i64:
   case nir_op_f2u64:
   case nir_op_i2f64:
   case nir_op_i2i64:
   case nir_op_u2f64:
   case nir_op_u2u64:
      /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
       *
       *    "When source or destination is 64b (...), regioning in Align1
       *     must follow these rules:
       *
       *     1. Source and destination horizontal stride must be aligned to
       *        the same qword.
       *     (...)"
       *
       * This means that conversions from bit-sizes smaller than 64-bit to
       * 64-bit need to have the source data elements aligned to 64-bit.
       * This restriction does not apply to BDW and later.
       */
      if (nir_dest_bit_size(instr->dest.dest) == 64 &&
          nir_src_bit_size(instr->src[0].src) < 64 &&
          (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
         fs_reg tmp = bld.vgrf(result.type, 1);
         tmp = subscript(tmp, op[0].type, 0);
         inst = bld.MOV(tmp, op[0]);
         inst = bld.MOV(result, tmp);
         inst->saturate = instr->dest.saturate;
         break;
      }
      /* fallthrough */
   case nir_op_f2f32:
   case nir_op_f2i32:
   case nir_op_f2u32:
   case nir_op_f2i16:
   case nir_op_f2u16:
   case nir_op_i2i32:
   case nir_op_u2u32:
   case nir_op_i2i16:
   case nir_op_u2u16:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_i2i8:
   case nir_op_u2u8:
      inst = bld.MOV(result, op[0]);
      inst->saturate = instr->dest.saturate;
      break;

   case nir_op_fsign: {
      assert(!instr->dest.saturate);
      if (op[0].abs) {
         /* Straightforward since the source can be assumed to be either
          * strictly >= 0 or strictly <= 0 depending on the setting of the
          * negate flag.
          */
         set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));

         inst = (op[0].negate)
            ?
bld.MOV(result, brw_imm_f(-1.0f)) 851 : bld.MOV(result, brw_imm_f(1.0f)); 852 853 set_predicate(BRW_PREDICATE_NORMAL, inst); 854 } else if (type_sz(op[0].type) < 8) { 855 /* AND(val, 0x80000000) gives the sign bit. 856 * 857 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not 858 * zero. 859 */ 860 bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ); 861 862 fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD); 863 op[0].type = BRW_REGISTER_TYPE_UD; 864 result.type = BRW_REGISTER_TYPE_UD; 865 bld.AND(result_int, op[0], brw_imm_ud(0x80000000u)); 866 867 inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u)); 868 inst->predicate = BRW_PREDICATE_NORMAL; 869 } else { 870 /* For doubles we do the same but we need to consider: 871 * 872 * - 2-src instructions can't operate with 64-bit immediates 873 * - The sign is encoded in the high 32-bit of each DF 874 * - We need to produce a DF result. 875 */ 876 877 fs_reg zero = vgrf(glsl_type::double_type); 878 bld.MOV(zero, setup_imm_df(bld, 0.0)); 879 bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ); 880 881 bld.MOV(result, zero); 882 883 fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1); 884 bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1), 885 brw_imm_ud(0x80000000u)); 886 887 set_predicate(BRW_PREDICATE_NORMAL, 888 bld.OR(r, r, brw_imm_ud(0x3ff00000u))); 889 } 890 break; 891 } 892 893 case nir_op_isign: { 894 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1). 895 * -> non-negative val generates 0x00000000. 896 * Predicated OR sets 1 if val is positive. 897 */ 898 uint32_t bit_size = nir_dest_bit_size(instr->dest.dest); 899 assert(bit_size == 32 || bit_size == 16); 900 901 fs_reg zero = bit_size == 32 ? brw_imm_d(0) : brw_imm_w(0); 902 fs_reg one = bit_size == 32 ? brw_imm_d(1) : brw_imm_w(1); 903 fs_reg shift = bit_size == 32 ? 
brw_imm_d(31) : brw_imm_w(15); 904 905 bld.CMP(bld.null_reg_d(), op[0], zero, BRW_CONDITIONAL_G); 906 bld.ASR(result, op[0], shift); 907 inst = bld.OR(result, result, one); 908 inst->predicate = BRW_PREDICATE_NORMAL; 909 break; 910 } 911 912 case nir_op_frcp: 913 inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]); 914 inst->saturate = instr->dest.saturate; 915 break; 916 917 case nir_op_fexp2: 918 inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]); 919 inst->saturate = instr->dest.saturate; 920 break; 921 922 case nir_op_flog2: 923 inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]); 924 inst->saturate = instr->dest.saturate; 925 break; 926 927 case nir_op_fsin: 928 inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]); 929 inst->saturate = instr->dest.saturate; 930 break; 931 932 case nir_op_fcos: 933 inst = bld.emit(SHADER_OPCODE_COS, result, op[0]); 934 inst->saturate = instr->dest.saturate; 935 break; 936 937 case nir_op_fddx: 938 if (fs_key->high_quality_derivatives) { 939 inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); 940 } else { 941 inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); 942 } 943 inst->saturate = instr->dest.saturate; 944 break; 945 case nir_op_fddx_fine: 946 inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]); 947 inst->saturate = instr->dest.saturate; 948 break; 949 case nir_op_fddx_coarse: 950 inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]); 951 inst->saturate = instr->dest.saturate; 952 break; 953 case nir_op_fddy: 954 if (fs_key->high_quality_derivatives) { 955 inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); 956 } else { 957 inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); 958 } 959 inst->saturate = instr->dest.saturate; 960 break; 961 case nir_op_fddy_fine: 962 inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]); 963 inst->saturate = instr->dest.saturate; 964 break; 965 case nir_op_fddy_coarse: 966 inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]); 967 inst->saturate = instr->dest.saturate; 968 break; 969 970 case nir_op_iadd: 971 case nir_op_fadd: 972 inst = bld.ADD(result, op[0], op[1]); 973 inst->saturate = instr->dest.saturate; 974 break; 975 976 case nir_op_fmul: 977 inst = bld.MUL(result, op[0], op[1]); 978 inst->saturate = instr->dest.saturate; 979 break; 980 981 case nir_op_imul: 982 assert(nir_dest_bit_size(instr->dest.dest) < 64); 983 bld.MUL(result, op[0], op[1]); 984 break; 985 986 case nir_op_imul_high: 987 case nir_op_umul_high: 988 assert(nir_dest_bit_size(instr->dest.dest) < 64); 989 bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]); 990 break; 991 992 case nir_op_idiv: 993 case nir_op_udiv: 994 assert(nir_dest_bit_size(instr->dest.dest) < 64); 995 bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]); 996 break; 997 998 case nir_op_uadd_carry: 999 unreachable("Should have been lowered by carry_to_arith()."); 1000 1001 case nir_op_usub_borrow: 1002 unreachable("Should have been lowered by borrow_to_arith()."); 1003 1004 case nir_op_umod: 1005 case nir_op_irem: 1006 /* According to the sign table for INT DIV in the Ivy Bridge PRM, it 1007 * appears that our hardware just does the right thing for signed 1008 * remainder. 1009 */ 1010 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1011 bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]); 1012 break; 1013 1014 case nir_op_imod: { 1015 /* Get a regular C-style remainder. If a % b == 0, set the predicate. 
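       * For example, for -7 % 3 the remainder computed below is -1; it is
       * non-zero and the source signs differ, so op[1] (3) is added to give
       * the modulus 2.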
       */
      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);

      /* Math instructions don't support conditional mod */
      inst = bld.MOV(bld.null_reg_d(), result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;

      /* Now, we need to determine if signs of the sources are different.
       * When we XOR the sources, the top bit is 0 if they are the same and 1
       * if they are different.  We can then use a conditional modifier to
       * turn that into a predicate.  This leads us to an XOR.l instruction.
       *
       * Technically, according to the PRM, you're not allowed to use .l on an
       * XOR instruction.  However, empirical experiments and Curro's reading
       * of the simulator source both indicate that it's safe.
       */
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
      inst = bld.XOR(tmp, op[0], op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->conditional_mod = BRW_CONDITIONAL_L;

      /* If the result of the initial remainder operation is non-zero and the
       * two sources have different signs, add in a copy of op[1] to get the
       * final integer modulus value.
       */
      inst = bld.ADD(result, result, op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }

   case nir_op_flt:
   case nir_op_fge:
   case nir_op_feq:
   case nir_op_fne: {
      fs_reg dest = result;

      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
      if (bit_size != 32)
         dest = bld.vgrf(op[0].type, 1);

      brw_conditional_mod cond;
      switch (instr->op) {
      case nir_op_flt:
         cond = BRW_CONDITIONAL_L;
         break;
      case nir_op_fge:
         cond = BRW_CONDITIONAL_GE;
         break;
      case nir_op_feq:
         cond = BRW_CONDITIONAL_Z;
         break;
      case nir_op_fne:
         cond = BRW_CONDITIONAL_NZ;
         break;
      default:
         unreachable("bad opcode");
      }

      bld.CMP(dest, op[0], op[1], cond);

      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
      } else if (bit_size < 32) {
         /* When we convert the result to 32-bit we need to be careful and do
          * it as a signed conversion to get sign extension (for 32-bit true).
          */
         const brw_reg_type src_type =
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);

         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
      }
      break;
   }

   case nir_op_ilt:
   case nir_op_ult:
   case nir_op_ige:
   case nir_op_uge:
   case nir_op_ieq:
   case nir_op_ine: {
      fs_reg dest = result;

      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
      if (bit_size != 32)
         dest = bld.vgrf(op[0].type, 1);

      brw_conditional_mod cond;
      switch (instr->op) {
      case nir_op_ilt:
      case nir_op_ult:
         cond = BRW_CONDITIONAL_L;
         break;
      case nir_op_ige:
      case nir_op_uge:
         cond = BRW_CONDITIONAL_GE;
         break;
      case nir_op_ieq:
         cond = BRW_CONDITIONAL_Z;
         break;
      case nir_op_ine:
         cond = BRW_CONDITIONAL_NZ;
         break;
      default:
         unreachable("bad opcode");
      }
      bld.CMP(dest, op[0], op[1], cond);

      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
      } else if (bit_size < 32) {
         /* When we convert the result to 32-bit we need to be careful and do
          * it as a signed conversion to get sign extension (for 32-bit true).
          */
         const brw_reg_type src_type =
            brw_reg_type_from_bit_size(bit_size,
BRW_REGISTER_TYPE_D); 1130 1131 bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type)); 1132 } 1133 break; 1134 } 1135 1136 case nir_op_inot: 1137 if (devinfo->gen >= 8) { 1138 op[0] = resolve_source_modifiers(op[0]); 1139 } 1140 bld.NOT(result, op[0]); 1141 break; 1142 case nir_op_ixor: 1143 if (devinfo->gen >= 8) { 1144 op[0] = resolve_source_modifiers(op[0]); 1145 op[1] = resolve_source_modifiers(op[1]); 1146 } 1147 bld.XOR(result, op[0], op[1]); 1148 break; 1149 case nir_op_ior: 1150 if (devinfo->gen >= 8) { 1151 op[0] = resolve_source_modifiers(op[0]); 1152 op[1] = resolve_source_modifiers(op[1]); 1153 } 1154 bld.OR(result, op[0], op[1]); 1155 break; 1156 case nir_op_iand: 1157 if (devinfo->gen >= 8) { 1158 op[0] = resolve_source_modifiers(op[0]); 1159 op[1] = resolve_source_modifiers(op[1]); 1160 } 1161 bld.AND(result, op[0], op[1]); 1162 break; 1163 1164 case nir_op_fdot2: 1165 case nir_op_fdot3: 1166 case nir_op_fdot4: 1167 case nir_op_ball_fequal2: 1168 case nir_op_ball_iequal2: 1169 case nir_op_ball_fequal3: 1170 case nir_op_ball_iequal3: 1171 case nir_op_ball_fequal4: 1172 case nir_op_ball_iequal4: 1173 case nir_op_bany_fnequal2: 1174 case nir_op_bany_inequal2: 1175 case nir_op_bany_fnequal3: 1176 case nir_op_bany_inequal3: 1177 case nir_op_bany_fnequal4: 1178 case nir_op_bany_inequal4: 1179 unreachable("Lowered by nir_lower_alu_reductions"); 1180 1181 case nir_op_fnoise1_1: 1182 case nir_op_fnoise1_2: 1183 case nir_op_fnoise1_3: 1184 case nir_op_fnoise1_4: 1185 case nir_op_fnoise2_1: 1186 case nir_op_fnoise2_2: 1187 case nir_op_fnoise2_3: 1188 case nir_op_fnoise2_4: 1189 case nir_op_fnoise3_1: 1190 case nir_op_fnoise3_2: 1191 case nir_op_fnoise3_3: 1192 case nir_op_fnoise3_4: 1193 case nir_op_fnoise4_1: 1194 case nir_op_fnoise4_2: 1195 case nir_op_fnoise4_3: 1196 case nir_op_fnoise4_4: 1197 unreachable("not reached: should be handled by lower_noise"); 1198 1199 case nir_op_ldexp: 1200 unreachable("not reached: should be handled by ldexp_to_arith()"); 1201 1202 case nir_op_fsqrt: 1203 inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]); 1204 inst->saturate = instr->dest.saturate; 1205 break; 1206 1207 case nir_op_frsq: 1208 inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]); 1209 inst->saturate = instr->dest.saturate; 1210 break; 1211 1212 case nir_op_i2b: 1213 case nir_op_f2b: { 1214 uint32_t bit_size = nir_src_bit_size(instr->src[0].src); 1215 if (bit_size == 64) { 1216 /* two-argument instructions can't take 64-bit immediates */ 1217 fs_reg zero; 1218 fs_reg tmp; 1219 1220 if (instr->op == nir_op_f2b) { 1221 zero = vgrf(glsl_type::double_type); 1222 tmp = vgrf(glsl_type::double_type); 1223 bld.MOV(zero, setup_imm_df(bld, 0.0)); 1224 } else { 1225 zero = vgrf(glsl_type::int64_t_type); 1226 tmp = vgrf(glsl_type::int64_t_type); 1227 bld.MOV(zero, brw_imm_q(0)); 1228 } 1229 1230 /* A SIMD16 execution needs to be split in two instructions, so use 1231 * a vgrf instead of the flag register as dst so instruction splitting 1232 * works 1233 */ 1234 bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ); 1235 bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0)); 1236 } else { 1237 fs_reg zero; 1238 if (bit_size == 32) { 1239 zero = instr->op == nir_op_f2b ? brw_imm_f(0.0f) : brw_imm_d(0); 1240 } else { 1241 assert(bit_size == 16); 1242 zero = instr->op == nir_op_f2b ? 
1243 retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0); 1244 } 1245 bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ); 1246 } 1247 break; 1248 } 1249 1250 case nir_op_ftrunc: 1251 inst = bld.RNDZ(result, op[0]); 1252 inst->saturate = instr->dest.saturate; 1253 break; 1254 1255 case nir_op_fceil: { 1256 op[0].negate = !op[0].negate; 1257 fs_reg temp = vgrf(glsl_type::float_type); 1258 bld.RNDD(temp, op[0]); 1259 temp.negate = true; 1260 inst = bld.MOV(result, temp); 1261 inst->saturate = instr->dest.saturate; 1262 break; 1263 } 1264 case nir_op_ffloor: 1265 inst = bld.RNDD(result, op[0]); 1266 inst->saturate = instr->dest.saturate; 1267 break; 1268 case nir_op_ffract: 1269 inst = bld.FRC(result, op[0]); 1270 inst->saturate = instr->dest.saturate; 1271 break; 1272 case nir_op_fround_even: 1273 inst = bld.RNDE(result, op[0]); 1274 inst->saturate = instr->dest.saturate; 1275 break; 1276 1277 case nir_op_fquantize2f16: { 1278 fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D); 1279 fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F); 1280 fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F); 1281 1282 /* The destination stride must be at least as big as the source stride. */ 1283 tmp16.type = BRW_REGISTER_TYPE_W; 1284 tmp16.stride = 2; 1285 1286 /* Check for denormal */ 1287 fs_reg abs_src0 = op[0]; 1288 abs_src0.abs = true; 1289 bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), 1290 BRW_CONDITIONAL_L); 1291 /* Get the appropriately signed zero */ 1292 bld.AND(retype(zero, BRW_REGISTER_TYPE_UD), 1293 retype(op[0], BRW_REGISTER_TYPE_UD), 1294 brw_imm_ud(0x80000000)); 1295 /* Do the actual F32 -> F16 -> F32 conversion */ 1296 bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]); 1297 bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16); 1298 /* Select that or zero based on normal status */ 1299 inst = bld.SEL(result, zero, tmp32); 1300 inst->predicate = BRW_PREDICATE_NORMAL; 1301 inst->saturate = instr->dest.saturate; 1302 break; 1303 } 1304 1305 case nir_op_imin: 1306 case nir_op_umin: 1307 case nir_op_fmin: 1308 inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L); 1309 inst->saturate = instr->dest.saturate; 1310 break; 1311 1312 case nir_op_imax: 1313 case nir_op_umax: 1314 case nir_op_fmax: 1315 inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE); 1316 inst->saturate = instr->dest.saturate; 1317 break; 1318 1319 case nir_op_pack_snorm_2x16: 1320 case nir_op_pack_snorm_4x8: 1321 case nir_op_pack_unorm_2x16: 1322 case nir_op_pack_unorm_4x8: 1323 case nir_op_unpack_snorm_2x16: 1324 case nir_op_unpack_snorm_4x8: 1325 case nir_op_unpack_unorm_2x16: 1326 case nir_op_unpack_unorm_4x8: 1327 case nir_op_unpack_half_2x16: 1328 case nir_op_pack_half_2x16: 1329 unreachable("not reached: should be handled by lower_packing_builtins"); 1330 1331 case nir_op_unpack_half_2x16_split_x: 1332 inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]); 1333 inst->saturate = instr->dest.saturate; 1334 break; 1335 case nir_op_unpack_half_2x16_split_y: 1336 inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]); 1337 inst->saturate = instr->dest.saturate; 1338 break; 1339 1340 case nir_op_pack_64_2x32_split: 1341 case nir_op_pack_32_2x16_split: 1342 bld.emit(FS_OPCODE_PACK, result, op[0], op[1]); 1343 break; 1344 1345 case nir_op_unpack_64_2x32_split_x: 1346 case nir_op_unpack_64_2x32_split_y: { 1347 if (instr->op == nir_op_unpack_64_2x32_split_x) 1348 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0)); 1349 else 1350 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1)); 1351 
break; 1352 } 1353 1354 case nir_op_unpack_32_2x16_split_x: 1355 case nir_op_unpack_32_2x16_split_y: { 1356 if (instr->op == nir_op_unpack_32_2x16_split_x) 1357 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0)); 1358 else 1359 bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1)); 1360 break; 1361 } 1362 1363 case nir_op_fpow: 1364 inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]); 1365 inst->saturate = instr->dest.saturate; 1366 break; 1367 1368 case nir_op_bitfield_reverse: 1369 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1370 bld.BFREV(result, op[0]); 1371 break; 1372 1373 case nir_op_bit_count: 1374 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1375 bld.CBIT(result, op[0]); 1376 break; 1377 1378 case nir_op_ufind_msb: { 1379 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1380 emit_find_msb_using_lzd(bld, result, op[0], false); 1381 break; 1382 } 1383 1384 case nir_op_ifind_msb: { 1385 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1386 1387 if (devinfo->gen < 7) { 1388 emit_find_msb_using_lzd(bld, result, op[0], true); 1389 } else { 1390 bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]); 1391 1392 /* FBH counts from the MSB side, while GLSL's findMSB() wants the 1393 * count from the LSB side. If FBH didn't return an error 1394 * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB 1395 * count into an LSB count. 1396 */ 1397 bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ); 1398 1399 inst = bld.ADD(result, result, brw_imm_d(31)); 1400 inst->predicate = BRW_PREDICATE_NORMAL; 1401 inst->src[0].negate = true; 1402 } 1403 break; 1404 } 1405 1406 case nir_op_find_lsb: 1407 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1408 1409 if (devinfo->gen < 7) { 1410 fs_reg temp = vgrf(glsl_type::int_type); 1411 1412 /* (x & -x) generates a value that consists of only the LSB of x. 1413 * For all powers of 2, findMSB(y) == findLSB(y). 1414 */ 1415 fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D); 1416 fs_reg negated_src = src; 1417 1418 /* One must be negated, and the other must be non-negated. It 1419 * doesn't matter which is which. 
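          *
          * For example, x = 0b01101000 gives x & -x = 0b00001000, and
          * findMSB(0b00001000) = 3, which is indeed findLSB(x).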
1420 */ 1421 negated_src.negate = true; 1422 src.negate = false; 1423 1424 bld.AND(temp, src, negated_src); 1425 emit_find_msb_using_lzd(bld, result, temp, false); 1426 } else { 1427 bld.FBL(result, op[0]); 1428 } 1429 break; 1430 1431 case nir_op_ubitfield_extract: 1432 case nir_op_ibitfield_extract: 1433 unreachable("should have been lowered"); 1434 case nir_op_ubfe: 1435 case nir_op_ibfe: 1436 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1437 bld.BFE(result, op[2], op[1], op[0]); 1438 break; 1439 case nir_op_bfm: 1440 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1441 bld.BFI1(result, op[0], op[1]); 1442 break; 1443 case nir_op_bfi: 1444 assert(nir_dest_bit_size(instr->dest.dest) < 64); 1445 bld.BFI2(result, op[0], op[1], op[2]); 1446 break; 1447 1448 case nir_op_bitfield_insert: 1449 unreachable("not reached: should have been lowered"); 1450 1451 case nir_op_ishl: 1452 case nir_op_ishr: 1453 case nir_op_ushr: { 1454 fs_reg shift_count = op[1]; 1455 1456 if (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo)) { 1457 if (op[1].file == VGRF && 1458 (result.type == BRW_REGISTER_TYPE_Q || 1459 result.type == BRW_REGISTER_TYPE_UQ)) { 1460 shift_count = fs_reg(VGRF, alloc.allocate(dispatch_width / 4), 1461 BRW_REGISTER_TYPE_UD); 1462 shift_count.stride = 2; 1463 bld.MOV(shift_count, op[1]); 1464 } 1465 } 1466 1467 switch (instr->op) { 1468 case nir_op_ishl: 1469 bld.SHL(result, op[0], shift_count); 1470 break; 1471 case nir_op_ishr: 1472 bld.ASR(result, op[0], shift_count); 1473 break; 1474 case nir_op_ushr: 1475 bld.SHR(result, op[0], shift_count); 1476 break; 1477 default: 1478 unreachable("not reached"); 1479 } 1480 break; 1481 } 1482 1483 case nir_op_pack_half_2x16_split: 1484 bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]); 1485 break; 1486 1487 case nir_op_ffma: 1488 inst = bld.MAD(result, op[2], op[1], op[0]); 1489 inst->saturate = instr->dest.saturate; 1490 break; 1491 1492 case nir_op_flrp: 1493 inst = bld.LRP(result, op[0], op[1], op[2]); 1494 inst->saturate = instr->dest.saturate; 1495 break; 1496 1497 case nir_op_bcsel: 1498 if (optimize_frontfacing_ternary(instr, result)) 1499 return; 1500 1501 bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ); 1502 inst = bld.SEL(result, op[1], op[2]); 1503 inst->predicate = BRW_PREDICATE_NORMAL; 1504 break; 1505 1506 case nir_op_extract_u8: 1507 case nir_op_extract_i8: { 1508 nir_const_value *byte = nir_src_as_const_value(instr->src[1].src); 1509 assert(byte != NULL); 1510 1511 /* The PRMs say: 1512 * 1513 * BDW+ 1514 * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB. 1515 * Use two instructions and a word or DWord intermediate integer type. 
1516 */ 1517 if (nir_dest_bit_size(instr->dest.dest) == 64) { 1518 const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i8); 1519 1520 if (instr->op == nir_op_extract_i8) { 1521 /* If we need to sign extend, extract to a word first */ 1522 fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W); 1523 bld.MOV(w_temp, subscript(op[0], type, byte->u32[0])); 1524 bld.MOV(result, w_temp); 1525 } else { 1526 /* Otherwise use an AND with 0xff and a word type */ 1527 bld.AND(result, subscript(op[0], type, byte->u32[0] / 2), brw_imm_uw(0xff)); 1528 } 1529 } else { 1530 const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); 1531 bld.MOV(result, subscript(op[0], type, byte->u32[0])); 1532 } 1533 break; 1534 } 1535 1536 case nir_op_extract_u16: 1537 case nir_op_extract_i16: { 1538 const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16); 1539 nir_const_value *word = nir_src_as_const_value(instr->src[1].src); 1540 assert(word != NULL); 1541 bld.MOV(result, subscript(op[0], type, word->u32[0])); 1542 break; 1543 } 1544 1545 default: 1546 unreachable("unhandled instruction"); 1547 } 1548 1549 /* If we need to do a boolean resolve, replace the result with -(x & 1) 1550 * to sign extend the low bit to 0/~0 1551 */ 1552 if (devinfo->gen <= 5 && 1553 (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { 1554 fs_reg masked = vgrf(glsl_type::int_type); 1555 bld.AND(masked, result, brw_imm_d(1)); 1556 masked.negate = true; 1557 bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked); 1558 } 1559} 1560 1561void 1562fs_visitor::nir_emit_load_const(const fs_builder &bld, 1563 nir_load_const_instr *instr) 1564{ 1565 const brw_reg_type reg_type = 1566 brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D); 1567 fs_reg reg = bld.vgrf(reg_type, instr->def.num_components); 1568 1569 switch (instr->def.bit_size) { 1570 case 8: 1571 for (unsigned i = 0; i < instr->def.num_components; i++) 1572 bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value.i8[i])); 1573 break; 1574 1575 case 16: 1576 for (unsigned i = 0; i < instr->def.num_components; i++) 1577 bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value.i16[i])); 1578 break; 1579 1580 case 32: 1581 for (unsigned i = 0; i < instr->def.num_components; i++) 1582 bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i])); 1583 break; 1584 1585 case 64: 1586 assert(devinfo->gen >= 7); 1587 if (devinfo->gen == 7) { 1588 /* We don't get 64-bit integer types until gen8 */ 1589 for (unsigned i = 0; i < instr->def.num_components; i++) { 1590 bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF), 1591 setup_imm_df(bld, instr->value.f64[i])); 1592 } 1593 } else { 1594 for (unsigned i = 0; i < instr->def.num_components; i++) 1595 bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value.i64[i])); 1596 } 1597 break; 1598 1599 default: 1600 unreachable("Invalid bit size"); 1601 } 1602 1603 nir_ssa_values[instr->def.index] = reg; 1604} 1605 1606fs_reg 1607fs_visitor::get_nir_src(const nir_src &src) 1608{ 1609 fs_reg reg; 1610 if (src.is_ssa) { 1611 if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) { 1612 const brw_reg_type reg_type = 1613 brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D); 1614 reg = bld.vgrf(reg_type, src.ssa->num_components); 1615 } else { 1616 reg = nir_ssa_values[src.ssa->index]; 1617 } 1618 } else { 1619 /* We don't handle indirects on locals */ 1620 assert(src.reg.indirect == NULL); 1621 reg = offset(nir_locals[src.reg.reg->index], bld, 1622 
src.reg.base_offset * src.reg.reg->num_components); 1623 } 1624 1625 if (nir_src_bit_size(src) == 64 && devinfo->gen == 7) { 1626 /* The only 64-bit type available on gen7 is DF, so use that. */ 1627 reg.type = BRW_REGISTER_TYPE_DF; 1628 } else { 1629 /* To avoid floating-point denorm flushing problems, set the type by 1630 * default to an integer type - instructions that need floating point 1631 * semantics will set this to F if they need to 1632 */ 1633 reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src), 1634 BRW_REGISTER_TYPE_D); 1635 } 1636 1637 return reg; 1638} 1639 1640/** 1641 * Return an IMM for constants; otherwise call get_nir_src() as normal. 1642 * 1643 * This function should not be called on any value which may be 64 bits. 1644 * We could theoretically support 64-bit on gen8+ but we choose not to 1645 * because it wouldn't work in general (no gen7 support) and there are 1646 * enough restrictions in 64-bit immediates that you can't take the return 1647 * value and treat it the same as the result of get_nir_src(). 1648 */ 1649fs_reg 1650fs_visitor::get_nir_src_imm(const nir_src &src) 1651{ 1652 nir_const_value *val = nir_src_as_const_value(src); 1653 assert(nir_src_bit_size(src) == 32); 1654 return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src); 1655} 1656 1657fs_reg 1658fs_visitor::get_nir_dest(const nir_dest &dest) 1659{ 1660 if (dest.is_ssa) { 1661 const brw_reg_type reg_type = 1662 brw_reg_type_from_bit_size(dest.ssa.bit_size, 1663 dest.ssa.bit_size == 8 ? 1664 BRW_REGISTER_TYPE_D : 1665 BRW_REGISTER_TYPE_F); 1666 nir_ssa_values[dest.ssa.index] = 1667 bld.vgrf(reg_type, dest.ssa.num_components); 1668 return nir_ssa_values[dest.ssa.index]; 1669 } else { 1670 /* We don't handle indirects on locals */ 1671 assert(dest.reg.indirect == NULL); 1672 return offset(nir_locals[dest.reg.reg->index], bld, 1673 dest.reg.base_offset * dest.reg.reg->num_components); 1674 } 1675} 1676 1677void 1678fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst, 1679 unsigned wr_mask) 1680{ 1681 for (unsigned i = 0; i < 4; i++) { 1682 if (!((wr_mask >> i) & 1)) 1683 continue; 1684 1685 fs_inst *new_inst = new(mem_ctx) fs_inst(inst); 1686 new_inst->dst = offset(new_inst->dst, bld, i); 1687 for (unsigned j = 0; j < new_inst->sources; j++) 1688 if (new_inst->src[j].file == VGRF) 1689 new_inst->src[j] = offset(new_inst->src[j], bld, i); 1690 1691 bld.emit(new_inst); 1692 } 1693} 1694 1695static fs_inst * 1696emit_pixel_interpolater_send(const fs_builder &bld, 1697 enum opcode opcode, 1698 const fs_reg &dst, 1699 const fs_reg &src, 1700 const fs_reg &desc, 1701 glsl_interp_mode interpolation) 1702{ 1703 struct brw_wm_prog_data *wm_prog_data = 1704 brw_wm_prog_data(bld.shader->stage_prog_data); 1705 1706 fs_inst *inst = bld.emit(opcode, dst, src, desc); 1707 /* 2 floats per slot returned */ 1708 inst->size_written = 2 * dst.component_size(inst->exec_size); 1709 inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE; 1710 1711 wm_prog_data->pulls_bary = true; 1712 1713 return inst; 1714} 1715 1716/** 1717 * Computes 1 << x, given a D/UD register containing some value x. 
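 *
 * For example, x = 5 yields 32 (a single set bit); the GS code below uses
 * this to build one-bit cut masks and DWord channel masks.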
1718 */ 1719static fs_reg 1720intexp2(const fs_builder &bld, const fs_reg &x) 1721{ 1722 assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); 1723 1724 fs_reg result = bld.vgrf(x.type, 1); 1725 fs_reg one = bld.vgrf(x.type, 1); 1726 1727 bld.MOV(one, retype(brw_imm_d(1), one.type)); 1728 bld.SHL(result, one, x); 1729 return result; 1730} 1731 1732void 1733fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src) 1734{ 1735 assert(stage == MESA_SHADER_GEOMETRY); 1736 1737 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 1738 1739 if (gs_compile->control_data_header_size_bits == 0) 1740 return; 1741 1742 /* We can only do EndPrimitive() functionality when the control data 1743 * consists of cut bits. Fortunately, the only time it isn't is when the 1744 * output type is points, in which case EndPrimitive() is a no-op. 1745 */ 1746 if (gs_prog_data->control_data_format != 1747 GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { 1748 return; 1749 } 1750 1751 /* Cut bits use one bit per vertex. */ 1752 assert(gs_compile->control_data_bits_per_vertex == 1); 1753 1754 fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 1755 vertex_count.type = BRW_REGISTER_TYPE_UD; 1756 1757 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting 1758 * vertex n, 0 otherwise. So all we need to do here is mark bit 1759 * (vertex_count - 1) % 32 in the cut_bits register to indicate that 1760 * EndPrimitive() was called after emitting vertex (vertex_count - 1); 1761 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. 1762 * 1763 * Note that if EndPrimitive() is called before emitting any vertices, this 1764 * will cause us to set bit 31 of the control_data_bits register to 1. 1765 * That's fine because: 1766 * 1767 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be 1768 * output, so the hardware will ignore cut bit 31. 1769 * 1770 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the 1771 * last vertex, so setting cut bit 31 has no effect (since the primitive 1772 * is automatically ended when the GS terminates). 1773 * 1774 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the 1775 * control_data_bits register to 0 when the first vertex is emitted. 1776 */ 1777 1778 const fs_builder abld = bld.annotate("end primitive"); 1779 1780 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ 1781 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1782 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 1783 fs_reg mask = intexp2(abld, prev_count); 1784 /* Note: we're relying on the fact that the GEN SHL instruction only pays 1785 * attention to the lower 5 bits of its second source argument, so on this 1786 * architecture, 1 << (vertex_count - 1) is equivalent to 1 << 1787 * ((vertex_count - 1) % 32). 1788 */ 1789 abld.OR(this->control_data_bits, this->control_data_bits, mask); 1790} 1791 1792void 1793fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) 1794{ 1795 assert(stage == MESA_SHADER_GEOMETRY); 1796 assert(gs_compile->control_data_bits_per_vertex != 0); 1797 1798 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 1799 1800 const fs_builder abld = bld.annotate("emit control data bits"); 1801 const fs_builder fwa_bld = bld.exec_all(); 1802 1803 /* We use a single UD register to accumulate control data bits (32 bits 1804 * for each of the SIMD8 channels). So we need to write a DWord (32 bits) 1805 * at a time. 
1806 * 1807 * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets. 1808 * We have select a 128-bit group via the Global and Per-Slot Offsets, then 1809 * use the Channel Mask phase to enable/disable which DWord within that 1810 * group to write. (Remember, different SIMD8 channels may have emitted 1811 * different numbers of vertices, so we may need per-slot offsets.) 1812 * 1813 * Channel masking presents an annoying problem: we may have to replicate 1814 * the data up to 4 times: 1815 * 1816 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. 1817 * 1818 * To avoid penalizing shaders that emit a small number of vertices, we 1819 * can avoid these sometimes: if the size of the control data header is 1820 * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land 1821 * land in the same 128-bit group, so we can skip per-slot offsets. 1822 * 1823 * Similarly, if the control data header is <= 32 bits, there is only one 1824 * DWord, so we can skip channel masks. 1825 */ 1826 enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; 1827 1828 fs_reg channel_mask, per_slot_offset; 1829 1830 if (gs_compile->control_data_header_size_bits > 32) { 1831 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 1832 channel_mask = vgrf(glsl_type::uint_type); 1833 } 1834 1835 if (gs_compile->control_data_header_size_bits > 128) { 1836 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT; 1837 per_slot_offset = vgrf(glsl_type::uint_type); 1838 } 1839 1840 /* Figure out which DWord we're trying to write to using the formula: 1841 * 1842 * dword_index = (vertex_count - 1) * bits_per_vertex / 32 1843 * 1844 * Since bits_per_vertex is a power of two, and is known at compile 1845 * time, this can be optimized to: 1846 * 1847 * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) 1848 */ 1849 if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) { 1850 fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1851 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1852 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 1853 unsigned log2_bits_per_vertex = 1854 util_last_bit(gs_compile->control_data_bits_per_vertex); 1855 abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex)); 1856 1857 if (per_slot_offset.file != BAD_FILE) { 1858 /* Set the per-slot offset to dword_index / 4, so that we'll write to 1859 * the appropriate OWord within the control data header. 1860 */ 1861 abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u)); 1862 } 1863 1864 /* Set the channel masks to 1 << (dword_index % 4), so that we'll 1865 * write to the appropriate DWORD within the OWORD. 1866 */ 1867 fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1868 fwa_bld.AND(channel, dword_index, brw_imm_ud(3u)); 1869 channel_mask = intexp2(fwa_bld, channel); 1870 /* Then the channel masks need to be in bits 23:16. */ 1871 fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u)); 1872 } 1873 1874 /* Store the control data bits in the message payload and send it. 
*/ 1875 int mlen = 2; 1876 if (channel_mask.file != BAD_FILE) 1877 mlen += 4; /* channel masks, plus 3 extra copies of the data */ 1878 if (per_slot_offset.file != BAD_FILE) 1879 mlen++; 1880 1881 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); 1882 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen); 1883 int i = 0; 1884 sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 1885 if (per_slot_offset.file != BAD_FILE) 1886 sources[i++] = per_slot_offset; 1887 if (channel_mask.file != BAD_FILE) 1888 sources[i++] = channel_mask; 1889 while (i < mlen) { 1890 sources[i++] = this->control_data_bits; 1891 } 1892 1893 abld.LOAD_PAYLOAD(payload, sources, mlen, mlen); 1894 fs_inst *inst = abld.emit(opcode, reg_undef, payload); 1895 inst->mlen = mlen; 1896 /* We need to increment Global Offset by 256-bits to make room for 1897 * Broadwell's extra "Vertex Count" payload at the beginning of the 1898 * URB entry. Since this is an OWord message, Global Offset is counted 1899 * in 128-bit units, so we must set it to 2. 1900 */ 1901 if (gs_prog_data->static_vertex_count == -1) 1902 inst->offset = 2; 1903} 1904 1905void 1906fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count, 1907 unsigned stream_id) 1908{ 1909 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ 1910 1911 /* Note: we are calling this *before* increasing vertex_count, so 1912 * this->vertex_count == vertex_count - 1 in the formula above. 1913 */ 1914 1915 /* Stream mode uses 2 bits per vertex */ 1916 assert(gs_compile->control_data_bits_per_vertex == 2); 1917 1918 /* Must be a valid stream */ 1919 assert(stream_id < MAX_VERTEX_STREAMS); 1920 1921 /* Control data bits are initialized to 0 so we don't have to set any 1922 * bits when sending vertices to stream 0. 1923 */ 1924 if (stream_id == 0) 1925 return; 1926 1927 const fs_builder abld = bld.annotate("set stream control data bits", NULL); 1928 1929 /* reg::sid = stream_id */ 1930 fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1931 abld.MOV(sid, brw_imm_ud(stream_id)); 1932 1933 /* reg:shift_count = 2 * (vertex_count - 1) */ 1934 fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1935 abld.SHL(shift_count, vertex_count, brw_imm_ud(1u)); 1936 1937 /* Note: we're relying on the fact that the GEN SHL instruction only pays 1938 * attention to the lower 5 bits of its second source argument, so on this 1939 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to 1940 * stream_id << ((2 * (vertex_count - 1)) % 32). 1941 */ 1942 fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 1943 abld.SHL(mask, sid, shift_count); 1944 abld.OR(this->control_data_bits, this->control_data_bits, mask); 1945} 1946 1947void 1948fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src, 1949 unsigned stream_id) 1950{ 1951 assert(stage == MESA_SHADER_GEOMETRY); 1952 1953 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 1954 1955 fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 1956 vertex_count.type = BRW_REGISTER_TYPE_UD; 1957 1958 /* Haswell and later hardware ignores the "Render Stream Select" bits 1959 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, 1960 * and instead sends all primitives down the pipeline for rasterization. 1961 * If the SOL stage is enabled, "Render Stream Select" is honored and 1962 * primitives bound to non-zero streams are discarded after stream output. 
1963 * 1964 * Since the only purpose of primives sent to non-zero streams is to 1965 * be recorded by transform feedback, we can simply discard all geometry 1966 * bound to these streams when transform feedback is disabled. 1967 */ 1968 if (stream_id > 0 && !nir->info.has_transform_feedback_varyings) 1969 return; 1970 1971 /* If we're outputting 32 control data bits or less, then we can wait 1972 * until the shader is over to output them all. Otherwise we need to 1973 * output them as we go. Now is the time to do it, since we're about to 1974 * output the vertex_count'th vertex, so it's guaranteed that the 1975 * control data bits associated with the (vertex_count - 1)th vertex are 1976 * correct. 1977 */ 1978 if (gs_compile->control_data_header_size_bits > 32) { 1979 const fs_builder abld = 1980 bld.annotate("emit vertex: emit control data bits"); 1981 1982 /* Only emit control data bits if we've finished accumulating a batch 1983 * of 32 bits. This is the case when: 1984 * 1985 * (vertex_count * bits_per_vertex) % 32 == 0 1986 * 1987 * (in other words, when the last 5 bits of vertex_count * 1988 * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some 1989 * integer n (which is always the case, since bits_per_vertex is 1990 * always 1 or 2), this is equivalent to requiring that the last 5-n 1991 * bits of vertex_count are 0: 1992 * 1993 * vertex_count & (2^(5-n) - 1) == 0 1994 * 1995 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is 1996 * equivalent to: 1997 * 1998 * vertex_count & (32 / bits_per_vertex - 1) == 0 1999 * 2000 * TODO: If vertex_count is an immediate, we could do some of this math 2001 * at compile time... 2002 */ 2003 fs_inst *inst = 2004 abld.AND(bld.null_reg_d(), vertex_count, 2005 brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u)); 2006 inst->conditional_mod = BRW_CONDITIONAL_Z; 2007 2008 abld.IF(BRW_PREDICATE_NORMAL); 2009 /* If vertex_count is 0, then no control data bits have been 2010 * accumulated yet, so we can skip emitting them. 2011 */ 2012 abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u), 2013 BRW_CONDITIONAL_NEQ); 2014 abld.IF(BRW_PREDICATE_NORMAL); 2015 emit_gs_control_data_bits(vertex_count); 2016 abld.emit(BRW_OPCODE_ENDIF); 2017 2018 /* Reset control_data_bits to 0 so we can start accumulating a new 2019 * batch. 2020 * 2021 * Note: in the case where vertex_count == 0, this neutralizes the 2022 * effect of any call to EndPrimitive() that the shader may have 2023 * made before outputting its first vertex. 2024 */ 2025 inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u)); 2026 inst->force_writemask_all = true; 2027 abld.emit(BRW_OPCODE_ENDIF); 2028 } 2029 2030 emit_urb_writes(vertex_count); 2031 2032 /* In stream mode we have to set control data bits for all vertices 2033 * unless we have disabled control data bits completely (which we do 2034 * do for GL_POINTS outputs that don't use streams). 
2035 */ 2036 if (gs_compile->control_data_header_size_bits > 0 && 2037 gs_prog_data->control_data_format == 2038 GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { 2039 set_gs_stream_control_data_bits(vertex_count, stream_id); 2040 } 2041} 2042 2043void 2044fs_visitor::emit_gs_input_load(const fs_reg &dst, 2045 const nir_src &vertex_src, 2046 unsigned base_offset, 2047 const nir_src &offset_src, 2048 unsigned num_components, 2049 unsigned first_component) 2050{ 2051 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2052 2053 nir_const_value *vertex_const = nir_src_as_const_value(vertex_src); 2054 nir_const_value *offset_const = nir_src_as_const_value(offset_src); 2055 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8; 2056 2057 /* TODO: figure out push input layout for invocations == 1 */ 2058 /* TODO: make this work with 64-bit inputs */ 2059 if (gs_prog_data->invocations == 1 && 2060 type_sz(dst.type) <= 4 && 2061 offset_const != NULL && vertex_const != NULL && 2062 4 * (base_offset + offset_const->u32[0]) < push_reg_count) { 2063 int imm_offset = (base_offset + offset_const->u32[0]) * 4 + 2064 vertex_const->u32[0] * push_reg_count; 2065 for (unsigned i = 0; i < num_components; i++) { 2066 bld.MOV(offset(dst, bld, i), 2067 fs_reg(ATTR, imm_offset + i + first_component, dst.type)); 2068 } 2069 return; 2070 } 2071 2072 /* Resort to the pull model. Ensure the VUE handles are provided. */ 2073 assert(gs_prog_data->base.include_vue_handles); 2074 2075 unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2; 2076 fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2077 2078 if (gs_prog_data->invocations == 1) { 2079 if (vertex_const) { 2080 /* The vertex index is constant; just select the proper URB handle. */ 2081 icp_handle = 2082 retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0), 2083 BRW_REGISTER_TYPE_UD); 2084 } else { 2085 /* The vertex index is non-constant. We need to use indirect 2086 * addressing to fetch the proper URB handle. 2087 * 2088 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> 2089 * indicating that channel <n> should read the handle from 2090 * DWord <n>. We convert that to bytes by multiplying by 4. 2091 * 2092 * Next, we convert the vertex index to bytes by multiplying 2093 * by 32 (shifting by 5), and add the two together. This is 2094 * the final indirect byte offset. 2095 */ 2096 fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1); 2097 fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2098 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2099 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2100 2101 /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */ 2102 bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210))); 2103 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ 2104 bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); 2105 /* Convert vertex_index to bytes (multiply by 32) */ 2106 bld.SHL(vertex_offset_bytes, 2107 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2108 brw_imm_ud(5u)); 2109 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); 2110 2111 /* Use first_icp_handle as the base offset. There is one register 2112 * of URB handles per vertex, so inform the register allocator that 2113 * we might read up to nir->info.gs.vertices_in registers. 
2114 */ 2115 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2116 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2117 fs_reg(icp_offset_bytes), 2118 brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE)); 2119 } 2120 } else { 2121 assert(gs_prog_data->invocations > 1); 2122 2123 if (vertex_const) { 2124 assert(devinfo->gen >= 9 || vertex_const->i32[0] <= 5); 2125 bld.MOV(icp_handle, 2126 retype(brw_vec1_grf(first_icp_handle + 2127 vertex_const->i32[0] / 8, 2128 vertex_const->i32[0] % 8), 2129 BRW_REGISTER_TYPE_UD)); 2130 } else { 2131 /* The vertex index is non-constant. We need to use indirect 2132 * addressing to fetch the proper URB handle. 2133 * 2134 */ 2135 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2136 2137 /* Convert vertex_index to bytes (multiply by 4) */ 2138 bld.SHL(icp_offset_bytes, 2139 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2140 brw_imm_ud(2u)); 2141 2142 /* Use first_icp_handle as the base offset. There is one DWord 2143 * of URB handles per vertex, so inform the register allocator that 2144 * we might read up to ceil(nir->info.gs.vertices_in / 8) registers. 2145 */ 2146 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2147 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2148 fs_reg(icp_offset_bytes), 2149 brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) * 2150 REG_SIZE)); 2151 } 2152 } 2153 2154 fs_inst *inst; 2155 2156 fs_reg tmp_dst = dst; 2157 fs_reg indirect_offset = get_nir_src(offset_src); 2158 unsigned num_iterations = 1; 2159 unsigned orig_num_components = num_components; 2160 2161 if (type_sz(dst.type) == 8) { 2162 if (num_components > 2) { 2163 num_iterations = 2; 2164 num_components = 2; 2165 } 2166 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); 2167 tmp_dst = tmp; 2168 first_component = first_component / 2; 2169 } 2170 2171 for (unsigned iter = 0; iter < num_iterations; iter++) { 2172 if (offset_const) { 2173 /* Constant indexing - use global offset. */ 2174 if (first_component != 0) { 2175 unsigned read_components = num_components + first_component; 2176 fs_reg tmp = bld.vgrf(dst.type, read_components); 2177 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); 2178 inst->size_written = read_components * 2179 tmp.component_size(inst->exec_size); 2180 for (unsigned i = 0; i < num_components; i++) { 2181 bld.MOV(offset(tmp_dst, bld, i), 2182 offset(tmp, bld, i + first_component)); 2183 } 2184 } else { 2185 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst, 2186 icp_handle); 2187 inst->size_written = num_components * 2188 tmp_dst.component_size(inst->exec_size); 2189 } 2190 inst->offset = base_offset + offset_const->u32[0]; 2191 inst->mlen = 1; 2192 } else { 2193 /* Indirect indexing - use per-slot offsets as well. 
*/ 2194 const fs_reg srcs[] = { icp_handle, indirect_offset }; 2195 unsigned read_components = num_components + first_component; 2196 fs_reg tmp = bld.vgrf(dst.type, read_components); 2197 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2198 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2199 if (first_component != 0) { 2200 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2201 payload); 2202 inst->size_written = read_components * 2203 tmp.component_size(inst->exec_size); 2204 for (unsigned i = 0; i < num_components; i++) { 2205 bld.MOV(offset(tmp_dst, bld, i), 2206 offset(tmp, bld, i + first_component)); 2207 } 2208 } else { 2209 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst, 2210 payload); 2211 inst->size_written = num_components * 2212 tmp_dst.component_size(inst->exec_size); 2213 } 2214 inst->offset = base_offset; 2215 inst->mlen = 2; 2216 } 2217 2218 if (type_sz(dst.type) == 8) { 2219 shuffle_from_32bit_read(bld, 2220 offset(dst, bld, iter * 2), 2221 retype(tmp_dst, BRW_REGISTER_TYPE_D), 2222 0, 2223 num_components); 2224 } 2225 2226 if (num_iterations > 1) { 2227 num_components = orig_num_components - 2; 2228 if(offset_const) { 2229 base_offset++; 2230 } else { 2231 fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2232 bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u)); 2233 indirect_offset = new_indirect; 2234 } 2235 } 2236 } 2237} 2238 2239fs_reg 2240fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) 2241{ 2242 nir_src *offset_src = nir_get_io_offset_src(instr); 2243 nir_const_value *const_value = nir_src_as_const_value(*offset_src); 2244 2245 if (const_value) { 2246 /* The only constant offset we should find is 0. brw_nir.c's 2247 * add_const_offset_to_base() will fold other constant offsets 2248 * into instr->const_index[0]. 2249 */ 2250 assert(const_value->u32[0] == 0); 2251 return fs_reg(); 2252 } 2253 2254 return get_nir_src(*offset_src); 2255} 2256 2257static void 2258do_untyped_vector_read(const fs_builder &bld, 2259 const fs_reg dest, 2260 const fs_reg surf_index, 2261 const fs_reg offset_reg, 2262 unsigned num_components) 2263{ 2264 if (type_sz(dest.type) <= 2) { 2265 assert(dest.stride == 1); 2266 boolean is_const_offset = offset_reg.file == BRW_IMMEDIATE_VALUE; 2267 2268 if (is_const_offset) { 2269 uint32_t start = offset_reg.ud & ~3; 2270 uint32_t end = offset_reg.ud + num_components * type_sz(dest.type); 2271 end = ALIGN(end, 4); 2272 assert (end - start <= 16); 2273 2274 /* At this point we have 16-bit component/s that have constant 2275 * offset aligned to 4-bytes that can be read with untyped_reads. 2276 * untyped_read message requires 32-bit aligned offsets. 2277 */ 2278 unsigned first_component = (offset_reg.ud & 3) / type_sz(dest.type); 2279 unsigned num_components_32bit = (end - start) / 4; 2280 2281 fs_reg read_result = 2282 emit_untyped_read(bld, surf_index, brw_imm_ud(start), 2283 1 /* dims */, 2284 num_components_32bit, 2285 BRW_PREDICATE_NONE); 2286 shuffle_from_32bit_read(bld, dest, read_result, first_component, 2287 num_components); 2288 } else { 2289 fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); 2290 for (unsigned i = 0; i < num_components; i++) { 2291 if (i == 0) { 2292 bld.MOV(read_offset, offset_reg); 2293 } else { 2294 bld.ADD(read_offset, offset_reg, 2295 brw_imm_ud(i * type_sz(dest.type))); 2296 } 2297 /* Non constant offsets are not guaranteed to be aligned 32-bits 2298 * so they are read using one byte_scattered_read message 2299 * for each component. 
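    *
    * (E.g. a three-component 16-bit load at a dynamic offset becomes
    * three 16-bit byte_scattered reads at offset, offset + 2 and
    * offset + 4.)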
2300 */ 2301 fs_reg read_result = 2302 emit_byte_scattered_read(bld, surf_index, read_offset, 2303 1 /* dims */, 1, 2304 type_sz(dest.type) * 8 /* bit_size */, 2305 BRW_PREDICATE_NONE); 2306 bld.MOV(offset(dest, bld, i), 2307 subscript (read_result, dest.type, 0)); 2308 } 2309 } 2310 } else if (type_sz(dest.type) == 4) { 2311 fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, 2312 1 /* dims */, 2313 num_components, 2314 BRW_PREDICATE_NONE); 2315 read_result.type = dest.type; 2316 for (unsigned i = 0; i < num_components; i++) 2317 bld.MOV(offset(dest, bld, i), offset(read_result, bld, i)); 2318 } else if (type_sz(dest.type) == 8) { 2319 /* Reading a dvec, so we need to: 2320 * 2321 * 1. Multiply num_components by 2, to account for the fact that we 2322 * need to read 64-bit components. 2323 * 2. Shuffle the result of the load to form valid 64-bit elements 2324 * 3. Emit a second load (for components z/w) if needed. 2325 */ 2326 fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD); 2327 bld.MOV(read_offset, offset_reg); 2328 2329 int iters = num_components <= 2 ? 1 : 2; 2330 2331 /* Load the dvec, the first iteration loads components x/y, the second 2332 * iteration, if needed, loads components z/w 2333 */ 2334 for (int it = 0; it < iters; it++) { 2335 /* Compute number of components to read in this iteration */ 2336 int iter_components = MIN2(2, num_components); 2337 num_components -= iter_components; 2338 2339 /* Read. Since this message reads 32-bit components, we need to 2340 * read twice as many components. 2341 */ 2342 fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset, 2343 1 /* dims */, 2344 iter_components * 2, 2345 BRW_PREDICATE_NONE); 2346 2347 /* Shuffle the 32-bit load result into valid 64-bit data */ 2348 shuffle_from_32bit_read(bld, offset(dest, bld, it * 2), 2349 read_result, 0, iter_components); 2350 2351 bld.ADD(read_offset, read_offset, brw_imm_ud(16)); 2352 } 2353 } else { 2354 unreachable("Unsupported type"); 2355 } 2356} 2357 2358void 2359fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, 2360 nir_intrinsic_instr *instr) 2361{ 2362 assert(stage == MESA_SHADER_VERTEX); 2363 2364 fs_reg dest; 2365 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2366 dest = get_nir_dest(instr->dest); 2367 2368 switch (instr->intrinsic) { 2369 case nir_intrinsic_load_vertex_id: 2370 case nir_intrinsic_load_base_vertex: 2371 unreachable("should be lowered by nir_lower_system_values()"); 2372 2373 case nir_intrinsic_load_input: { 2374 fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type); 2375 unsigned first_component = nir_intrinsic_component(instr); 2376 unsigned num_components = instr->num_components; 2377 2378 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 2379 assert(const_offset && "Indirect input loads not allowed"); 2380 src = offset(src, bld, const_offset->u32[0]); 2381 2382 if (type_sz(dest.type) == 8) 2383 first_component /= 2; 2384 2385 /* For 16-bit support maybe a temporary will be needed to copy from 2386 * the ATTR file. 
2387 */ 2388 shuffle_from_32bit_read(bld, dest, retype(src, BRW_REGISTER_TYPE_D), 2389 first_component, num_components); 2390 break; 2391 } 2392 2393 case nir_intrinsic_load_vertex_id_zero_base: 2394 case nir_intrinsic_load_instance_id: 2395 case nir_intrinsic_load_base_instance: 2396 case nir_intrinsic_load_draw_id: 2397 case nir_intrinsic_load_first_vertex: 2398 case nir_intrinsic_load_is_indexed_draw: 2399 unreachable("lowered by brw_nir_lower_vs_inputs"); 2400 2401 default: 2402 nir_emit_intrinsic(bld, instr); 2403 break; 2404 } 2405} 2406 2407void 2408fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, 2409 nir_intrinsic_instr *instr) 2410{ 2411 assert(stage == MESA_SHADER_TESS_CTRL); 2412 struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; 2413 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2414 2415 fs_reg dst; 2416 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2417 dst = get_nir_dest(instr->dest); 2418 2419 switch (instr->intrinsic) { 2420 case nir_intrinsic_load_primitive_id: 2421 bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1))); 2422 break; 2423 case nir_intrinsic_load_invocation_id: 2424 bld.MOV(retype(dst, invocation_id.type), invocation_id); 2425 break; 2426 case nir_intrinsic_load_patch_vertices_in: 2427 bld.MOV(retype(dst, BRW_REGISTER_TYPE_D), 2428 brw_imm_d(tcs_key->input_vertices)); 2429 break; 2430 2431 case nir_intrinsic_barrier: { 2432 if (tcs_prog_data->instances == 1) 2433 break; 2434 2435 fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2436 fs_reg m0_2 = component(m0, 2); 2437 2438 const fs_builder chanbld = bld.exec_all().group(1, 0); 2439 2440 /* Zero the message header */ 2441 bld.exec_all().MOV(m0, brw_imm_ud(0u)); 2442 2443 /* Copy "Barrier ID" from r0.2, bits 16:13 */ 2444 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2445 brw_imm_ud(INTEL_MASK(16, 13))); 2446 2447 /* Shift it up to bits 27:24. */ 2448 chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); 2449 2450 /* Set the Barrier Count and the enable bit */ 2451 chanbld.OR(m0_2, m0_2, 2452 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); 2453 2454 bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); 2455 break; 2456 } 2457 2458 case nir_intrinsic_load_input: 2459 unreachable("nir_lower_io should never give us these."); 2460 break; 2461 2462 case nir_intrinsic_load_per_vertex_input: { 2463 fs_reg indirect_offset = get_indirect_offset(instr); 2464 unsigned imm_offset = instr->const_index[0]; 2465 2466 const nir_src &vertex_src = instr->src[0]; 2467 nir_const_value *vertex_const = nir_src_as_const_value(vertex_src); 2468 2469 fs_inst *inst; 2470 2471 fs_reg icp_handle; 2472 2473 if (vertex_const) { 2474 /* Emit a MOV to resolve <0,1,0> regioning. */ 2475 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2476 bld.MOV(icp_handle, 2477 retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3), 2478 vertex_const->i32[0] & 7), 2479 BRW_REGISTER_TYPE_UD)); 2480 } else if (tcs_prog_data->instances == 1 && 2481 vertex_src.is_ssa && 2482 vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic && 2483 nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) { 2484 /* For the common case of only 1 instance, an array index of 2485 * gl_InvocationID means reading g1. Skip all the indirect work. 2486 */ 2487 icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); 2488 } else { 2489 /* The vertex index is non-constant. We need to use indirect 2490 * addressing to fetch the proper URB handle. 
2491 */ 2492 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2493 2494 /* Each ICP handle is a single DWord (4 bytes) */ 2495 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2496 bld.SHL(vertex_offset_bytes, 2497 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2498 brw_imm_ud(2u)); 2499 2500 /* Start at g1. We might read up to 4 registers. */ 2501 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2502 retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes, 2503 brw_imm_ud(4 * REG_SIZE)); 2504 } 2505 2506 /* We can only read two double components with each URB read, so 2507 * we send two read messages in that case, each one loading up to 2508 * two double components. 2509 */ 2510 unsigned num_iterations = 1; 2511 unsigned num_components = instr->num_components; 2512 unsigned first_component = nir_intrinsic_component(instr); 2513 fs_reg orig_dst = dst; 2514 if (type_sz(dst.type) == 8) { 2515 first_component = first_component / 2; 2516 if (instr->num_components > 2) { 2517 num_iterations = 2; 2518 num_components = 2; 2519 } 2520 2521 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type); 2522 dst = tmp; 2523 } 2524 2525 for (unsigned iter = 0; iter < num_iterations; iter++) { 2526 if (indirect_offset.file == BAD_FILE) { 2527 /* Constant indexing - use global offset. */ 2528 if (first_component != 0) { 2529 unsigned read_components = num_components + first_component; 2530 fs_reg tmp = bld.vgrf(dst.type, read_components); 2531 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); 2532 for (unsigned i = 0; i < num_components; i++) { 2533 bld.MOV(offset(dst, bld, i), 2534 offset(tmp, bld, i + first_component)); 2535 } 2536 } else { 2537 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); 2538 } 2539 inst->offset = imm_offset; 2540 inst->mlen = 1; 2541 } else { 2542 /* Indirect indexing - use per-slot offsets as well. */ 2543 const fs_reg srcs[] = { icp_handle, indirect_offset }; 2544 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2545 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2546 if (first_component != 0) { 2547 unsigned read_components = num_components + first_component; 2548 fs_reg tmp = bld.vgrf(dst.type, read_components); 2549 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2550 payload); 2551 for (unsigned i = 0; i < num_components; i++) { 2552 bld.MOV(offset(dst, bld, i), 2553 offset(tmp, bld, i + first_component)); 2554 } 2555 } else { 2556 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 2557 payload); 2558 } 2559 inst->offset = imm_offset; 2560 inst->mlen = 2; 2561 } 2562 inst->size_written = (num_components + first_component) * 2563 inst->dst.component_size(inst->exec_size); 2564 2565 /* If we are reading 64-bit data using 32-bit read messages we need 2566 * build proper 64-bit data elements by shuffling the low and high 2567 * 32-bit components around like we do for other things like UBOs 2568 * or SSBOs. 2569 */ 2570 if (type_sz(dst.type) == 8) { 2571 shuffle_from_32bit_read(bld, 2572 offset(orig_dst, bld, iter * 2), 2573 retype(dst, BRW_REGISTER_TYPE_D), 2574 0, num_components); 2575 } 2576 2577 /* Copy the temporary to the destination to deal with writemasking. 2578 * 2579 * Also attempt to deal with gl_PointSize being in the .w component. 
2580 */ 2581 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { 2582 assert(type_sz(dst.type) < 8); 2583 inst->dst = bld.vgrf(dst.type, 4); 2584 inst->size_written = 4 * REG_SIZE; 2585 bld.MOV(dst, offset(inst->dst, bld, 3)); 2586 } 2587 2588 /* If we are loading double data and we need a second read message 2589 * adjust the write offset 2590 */ 2591 if (num_iterations > 1) { 2592 num_components = instr->num_components - 2; 2593 imm_offset++; 2594 } 2595 } 2596 break; 2597 } 2598 2599 case nir_intrinsic_load_output: 2600 case nir_intrinsic_load_per_vertex_output: { 2601 fs_reg indirect_offset = get_indirect_offset(instr); 2602 unsigned imm_offset = instr->const_index[0]; 2603 unsigned first_component = nir_intrinsic_component(instr); 2604 2605 fs_inst *inst; 2606 if (indirect_offset.file == BAD_FILE) { 2607 /* Replicate the patch handle to all enabled channels */ 2608 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2609 bld.MOV(patch_handle, 2610 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); 2611 2612 { 2613 if (first_component != 0) { 2614 unsigned read_components = 2615 instr->num_components + first_component; 2616 fs_reg tmp = bld.vgrf(dst.type, read_components); 2617 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, 2618 patch_handle); 2619 inst->size_written = read_components * REG_SIZE; 2620 for (unsigned i = 0; i < instr->num_components; i++) { 2621 bld.MOV(offset(dst, bld, i), 2622 offset(tmp, bld, i + first_component)); 2623 } 2624 } else { 2625 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, 2626 patch_handle); 2627 inst->size_written = instr->num_components * REG_SIZE; 2628 } 2629 inst->offset = imm_offset; 2630 inst->mlen = 1; 2631 } 2632 } else { 2633 /* Indirect indexing - use per-slot offsets as well. */ 2634 const fs_reg srcs[] = { 2635 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2636 indirect_offset 2637 }; 2638 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2639 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2640 if (first_component != 0) { 2641 unsigned read_components = 2642 instr->num_components + first_component; 2643 fs_reg tmp = bld.vgrf(dst.type, read_components); 2644 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2645 payload); 2646 inst->size_written = read_components * REG_SIZE; 2647 for (unsigned i = 0; i < instr->num_components; i++) { 2648 bld.MOV(offset(dst, bld, i), 2649 offset(tmp, bld, i + first_component)); 2650 } 2651 } else { 2652 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 2653 payload); 2654 inst->size_written = instr->num_components * REG_SIZE; 2655 } 2656 inst->offset = imm_offset; 2657 inst->mlen = 2; 2658 } 2659 break; 2660 } 2661 2662 case nir_intrinsic_store_output: 2663 case nir_intrinsic_store_per_vertex_output: { 2664 fs_reg value = get_nir_src(instr->src[0]); 2665 bool is_64bit = (instr->src[0].is_ssa ? 
2666 instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64; 2667 fs_reg indirect_offset = get_indirect_offset(instr); 2668 unsigned imm_offset = instr->const_index[0]; 2669 unsigned mask = instr->const_index[1]; 2670 unsigned header_regs = 0; 2671 fs_reg srcs[7]; 2672 srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); 2673 2674 if (indirect_offset.file != BAD_FILE) { 2675 srcs[header_regs++] = indirect_offset; 2676 } 2677 2678 if (mask == 0) 2679 break; 2680 2681 unsigned num_components = util_last_bit(mask); 2682 enum opcode opcode; 2683 2684 /* We can only pack two 64-bit components in a single message, so send 2685 * 2 messages if we have more components 2686 */ 2687 unsigned num_iterations = 1; 2688 unsigned iter_components = num_components; 2689 unsigned first_component = nir_intrinsic_component(instr); 2690 if (is_64bit) { 2691 first_component = first_component / 2; 2692 if (instr->num_components > 2) { 2693 num_iterations = 2; 2694 iter_components = 2; 2695 } 2696 } 2697 2698 mask = mask << first_component; 2699 2700 for (unsigned iter = 0; iter < num_iterations; iter++) { 2701 if (!is_64bit && mask != WRITEMASK_XYZW) { 2702 srcs[header_regs++] = brw_imm_ud(mask << 16); 2703 opcode = indirect_offset.file != BAD_FILE ? 2704 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : 2705 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 2706 } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) { 2707 /* Expand the 64-bit mask to 32-bit channels. We only handle 2708 * two channels in each iteration, so we only care about X/Y. 2709 */ 2710 unsigned mask32 = 0; 2711 if (mask & WRITEMASK_X) 2712 mask32 |= WRITEMASK_XY; 2713 if (mask & WRITEMASK_Y) 2714 mask32 |= WRITEMASK_ZW; 2715 2716 /* If the mask does not include any of the channels X or Y there 2717 * is nothing to do in this iteration. Move on to the next couple 2718 * of 64-bit channels. 2719 */ 2720 if (!mask32) { 2721 mask >>= 2; 2722 imm_offset++; 2723 continue; 2724 } 2725 2726 srcs[header_regs++] = brw_imm_ud(mask32 << 16); 2727 opcode = indirect_offset.file != BAD_FILE ? 2728 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : 2729 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 2730 } else { 2731 opcode = indirect_offset.file != BAD_FILE ? 2732 SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT : 2733 SHADER_OPCODE_URB_WRITE_SIMD8; 2734 } 2735 2736 for (unsigned i = 0; i < iter_components; i++) { 2737 if (!(mask & (1 << (i + first_component)))) 2738 continue; 2739 2740 if (!is_64bit) { 2741 srcs[header_regs + i + first_component] = offset(value, bld, i); 2742 } else { 2743 /* We need to shuffle the 64-bit data to match the layout 2744 * expected by our 32-bit URB write messages. We use a temporary 2745 * for that. 2746 */ 2747 unsigned channel = iter * 2 + i; 2748 fs_reg dest = shuffle_for_32bit_write(bld, value, channel, 1); 2749 2750 srcs[header_regs + (i + first_component) * 2] = dest; 2751 srcs[header_regs + (i + first_component) * 2 + 1] = 2752 offset(dest, bld, 1); 2753 } 2754 } 2755 2756 unsigned mlen = 2757 header_regs + (is_64bit ? 2 * iter_components : iter_components) + 2758 (is_64bit ? 2 * first_component : first_component); 2759 fs_reg payload = 2760 bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); 2761 bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs); 2762 2763 fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload); 2764 inst->offset = imm_offset; 2765 inst->mlen = mlen; 2766 2767 /* If this is a 64-bit attribute, select the next two 64-bit channels 2768 * to be handled in the next iteration. 
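    *
    * (E.g. a dvec4 store takes two iterations: the first writes .xy at
    * imm_offset, then the writemask is shifted down by two channels and
    * the second iteration writes .zw at imm_offset + 1.)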
2769 */ 2770 if (is_64bit) { 2771 mask >>= 2; 2772 imm_offset++; 2773 } 2774 } 2775 break; 2776 } 2777 2778 default: 2779 nir_emit_intrinsic(bld, instr); 2780 break; 2781 } 2782} 2783 2784void 2785fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, 2786 nir_intrinsic_instr *instr) 2787{ 2788 assert(stage == MESA_SHADER_TESS_EVAL); 2789 struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data); 2790 2791 fs_reg dest; 2792 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2793 dest = get_nir_dest(instr->dest); 2794 2795 switch (instr->intrinsic) { 2796 case nir_intrinsic_load_primitive_id: 2797 bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1))); 2798 break; 2799 case nir_intrinsic_load_tess_coord: 2800 /* gl_TessCoord is part of the payload in g1-3 */ 2801 for (unsigned i = 0; i < 3; i++) { 2802 bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0))); 2803 } 2804 break; 2805 2806 case nir_intrinsic_load_input: 2807 case nir_intrinsic_load_per_vertex_input: { 2808 fs_reg indirect_offset = get_indirect_offset(instr); 2809 unsigned imm_offset = instr->const_index[0]; 2810 unsigned first_component = nir_intrinsic_component(instr); 2811 2812 if (type_sz(dest.type) == 8) { 2813 first_component = first_component / 2; 2814 } 2815 2816 fs_inst *inst; 2817 if (indirect_offset.file == BAD_FILE) { 2818 /* Arbitrarily only push up to 32 vec4 slots worth of data, 2819 * which is 16 registers (since each holds 2 vec4 slots). 2820 */ 2821 unsigned slot_count = 1; 2822 if (type_sz(dest.type) == 8 && instr->num_components > 2) 2823 slot_count++; 2824 2825 const unsigned max_push_slots = 32; 2826 if (imm_offset + slot_count <= max_push_slots) { 2827 fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type); 2828 for (int i = 0; i < instr->num_components; i++) { 2829 unsigned comp = 16 / type_sz(dest.type) * (imm_offset % 2) + 2830 i + first_component; 2831 bld.MOV(offset(dest, bld, i), component(src, comp)); 2832 } 2833 2834 tes_prog_data->base.urb_read_length = 2835 MAX2(tes_prog_data->base.urb_read_length, 2836 DIV_ROUND_UP(imm_offset + slot_count, 2)); 2837 } else { 2838 /* Replicate the patch handle to all enabled channels */ 2839 const fs_reg srcs[] = { 2840 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD) 2841 }; 2842 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2843 bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0); 2844 2845 if (first_component != 0) { 2846 unsigned read_components = 2847 instr->num_components + first_component; 2848 fs_reg tmp = bld.vgrf(dest.type, read_components); 2849 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, 2850 patch_handle); 2851 inst->size_written = read_components * REG_SIZE; 2852 for (unsigned i = 0; i < instr->num_components; i++) { 2853 bld.MOV(offset(dest, bld, i), 2854 offset(tmp, bld, i + first_component)); 2855 } 2856 } else { 2857 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, 2858 patch_handle); 2859 inst->size_written = instr->num_components * REG_SIZE; 2860 } 2861 inst->mlen = 1; 2862 inst->offset = imm_offset; 2863 } 2864 } else { 2865 /* Indirect indexing - use per-slot offsets as well. */ 2866 2867 /* We can only read two double components with each URB read, so 2868 * we send two read messages in that case, each one loading up to 2869 * two double components. 
2870 */ 2871 unsigned num_iterations = 1; 2872 unsigned num_components = instr->num_components; 2873 fs_reg orig_dest = dest; 2874 if (type_sz(dest.type) == 8) { 2875 if (instr->num_components > 2) { 2876 num_iterations = 2; 2877 num_components = 2; 2878 } 2879 fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dest.type); 2880 dest = tmp; 2881 } 2882 2883 for (unsigned iter = 0; iter < num_iterations; iter++) { 2884 const fs_reg srcs[] = { 2885 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 2886 indirect_offset 2887 }; 2888 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2889 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2890 2891 if (first_component != 0) { 2892 unsigned read_components = 2893 num_components + first_component; 2894 fs_reg tmp = bld.vgrf(dest.type, read_components); 2895 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2896 payload); 2897 for (unsigned i = 0; i < num_components; i++) { 2898 bld.MOV(offset(dest, bld, i), 2899 offset(tmp, bld, i + first_component)); 2900 } 2901 } else { 2902 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, 2903 payload); 2904 } 2905 inst->mlen = 2; 2906 inst->offset = imm_offset; 2907 inst->size_written = (num_components + first_component) * 2908 inst->dst.component_size(inst->exec_size); 2909 2910 /* If we are reading 64-bit data using 32-bit read messages we need 2911 * build proper 64-bit data elements by shuffling the low and high 2912 * 32-bit components around like we do for other things like UBOs 2913 * or SSBOs. 2914 */ 2915 if (type_sz(dest.type) == 8) { 2916 shuffle_from_32bit_read(bld, 2917 offset(orig_dest, bld, iter * 2), 2918 retype(dest, BRW_REGISTER_TYPE_D), 2919 0, num_components); 2920 } 2921 2922 /* If we are loading double data and we need a second read message 2923 * adjust the offset 2924 */ 2925 if (num_iterations > 1) { 2926 num_components = instr->num_components - 2; 2927 imm_offset++; 2928 } 2929 } 2930 } 2931 break; 2932 } 2933 default: 2934 nir_emit_intrinsic(bld, instr); 2935 break; 2936 } 2937} 2938 2939void 2940fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, 2941 nir_intrinsic_instr *instr) 2942{ 2943 assert(stage == MESA_SHADER_GEOMETRY); 2944 fs_reg indirect_offset; 2945 2946 fs_reg dest; 2947 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2948 dest = get_nir_dest(instr->dest); 2949 2950 switch (instr->intrinsic) { 2951 case nir_intrinsic_load_primitive_id: 2952 assert(stage == MESA_SHADER_GEOMETRY); 2953 assert(brw_gs_prog_data(prog_data)->include_primitive_id); 2954 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 2955 retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD)); 2956 break; 2957 2958 case nir_intrinsic_load_input: 2959 unreachable("load_input intrinsics are invalid for the GS stage"); 2960 2961 case nir_intrinsic_load_per_vertex_input: 2962 emit_gs_input_load(dest, instr->src[0], instr->const_index[0], 2963 instr->src[1], instr->num_components, 2964 nir_intrinsic_component(instr)); 2965 break; 2966 2967 case nir_intrinsic_emit_vertex_with_counter: 2968 emit_gs_vertex(instr->src[0], instr->const_index[0]); 2969 break; 2970 2971 case nir_intrinsic_end_primitive_with_counter: 2972 emit_gs_end_primitive(instr->src[0]); 2973 break; 2974 2975 case nir_intrinsic_set_vertex_count: 2976 bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0])); 2977 break; 2978 2979 case nir_intrinsic_load_invocation_id: { 2980 fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; 2981 assert(val.file != BAD_FILE); 2982 dest.type = val.type; 2983 bld.MOV(dest, 
val); 2984 break; 2985 } 2986 2987 default: 2988 nir_emit_intrinsic(bld, instr); 2989 break; 2990 } 2991} 2992 2993/** 2994 * Fetch the current render target layer index. 2995 */ 2996static fs_reg 2997fetch_render_target_array_index(const fs_builder &bld) 2998{ 2999 if (bld.shader->devinfo->gen >= 6) { 3000 /* The render target array index is provided in the thread payload as 3001 * bits 26:16 of r0.0. 3002 */ 3003 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); 3004 bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1), 3005 brw_imm_uw(0x7ff)); 3006 return idx; 3007 } else { 3008 /* Pre-SNB we only ever render into the first layer of the framebuffer 3009 * since layered rendering is not implemented. 3010 */ 3011 return brw_imm_ud(0); 3012 } 3013} 3014 3015/** 3016 * Fake non-coherent framebuffer read implemented using TXF to fetch from the 3017 * framebuffer at the current fragment coordinates and sample index. 3018 */ 3019fs_inst * 3020fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, 3021 unsigned target) 3022{ 3023 const struct gen_device_info *devinfo = bld.shader->devinfo; 3024 3025 assert(bld.shader->stage == MESA_SHADER_FRAGMENT); 3026 const brw_wm_prog_key *wm_key = 3027 reinterpret_cast<const brw_wm_prog_key *>(key); 3028 assert(!wm_key->coherent_fb_fetch); 3029 const struct brw_wm_prog_data *wm_prog_data = 3030 brw_wm_prog_data(stage_prog_data); 3031 3032 /* Calculate the surface index relative to the start of the texture binding 3033 * table block, since that's what the texturing messages expect. 3034 */ 3035 const unsigned surface = target + 3036 wm_prog_data->binding_table.render_target_read_start - 3037 wm_prog_data->base.binding_table.texture_start; 3038 3039 brw_mark_surface_used( 3040 bld.shader->stage_prog_data, 3041 wm_prog_data->binding_table.render_target_read_start + target); 3042 3043 /* Calculate the fragment coordinates. */ 3044 const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); 3045 bld.MOV(offset(coords, bld, 0), pixel_x); 3046 bld.MOV(offset(coords, bld, 1), pixel_y); 3047 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld)); 3048 3049 /* Calculate the sample index and MCS payload when multisampling. Luckily 3050 * the MCS fetch message behaves deterministically for UMS surfaces, so it 3051 * shouldn't be necessary to recompile based on whether the framebuffer is 3052 * CMS or UMS. 3053 */ 3054 if (wm_key->multisample_fbo && 3055 nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) 3056 nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup(); 3057 3058 const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; 3059 const fs_reg mcs = wm_key->multisample_fbo ? 3060 emit_mcs_fetch(coords, 3, brw_imm_ud(surface)) : fs_reg(); 3061 3062 /* Use either a normal or a CMS texel fetch message depending on whether 3063 * the framebuffer is single or multisample. On SKL+ use the wide CMS 3064 * message just in case the framebuffer uses 16x multisampling, it should 3065 * be equivalent to the normal CMS fetch for lower multisampling modes. 3066 */ 3067 const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL : 3068 devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL : 3069 SHADER_OPCODE_TXF_CMS_LOGICAL; 3070 3071 /* Emit the instruction. 
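    *
    * The source slots a plain texel fetch does not use are left as empty
    * fs_reg()s or zero immediates; the trailing brw_imm_ud(3) and
    * brw_imm_ud(0) are the coordinate and gradient component counts.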
*/ 3072 const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(), 3073 sample, mcs, 3074 brw_imm_ud(surface), brw_imm_ud(0), 3075 fs_reg(), brw_imm_ud(3), brw_imm_ud(0) }; 3076 STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS); 3077 3078 fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); 3079 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3080 3081 return inst; 3082} 3083 3084/** 3085 * Actual coherent framebuffer read implemented using the native render target 3086 * read message. Requires SKL+. 3087 */ 3088static fs_inst * 3089emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target) 3090{ 3091 assert(bld.shader->devinfo->gen >= 9); 3092 fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst); 3093 inst->target = target; 3094 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3095 3096 return inst; 3097} 3098 3099static fs_reg 3100alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n) 3101{ 3102 if (n && regs[0].file != BAD_FILE) { 3103 return regs[0]; 3104 3105 } else { 3106 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size); 3107 3108 for (unsigned i = 0; i < n; i++) 3109 regs[i] = tmp; 3110 3111 return tmp; 3112 } 3113} 3114 3115static fs_reg 3116alloc_frag_output(fs_visitor *v, unsigned location) 3117{ 3118 assert(v->stage == MESA_SHADER_FRAGMENT); 3119 const brw_wm_prog_key *const key = 3120 reinterpret_cast<const brw_wm_prog_key *>(v->key); 3121 const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION); 3122 const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX); 3123 3124 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1)) 3125 return alloc_temporary(v->bld, 4, &v->dual_src_output, 1); 3126 3127 else if (l == FRAG_RESULT_COLOR) 3128 return alloc_temporary(v->bld, 4, v->outputs, 3129 MAX2(key->nr_color_regions, 1)); 3130 3131 else if (l == FRAG_RESULT_DEPTH) 3132 return alloc_temporary(v->bld, 1, &v->frag_depth, 1); 3133 3134 else if (l == FRAG_RESULT_STENCIL) 3135 return alloc_temporary(v->bld, 1, &v->frag_stencil, 1); 3136 3137 else if (l == FRAG_RESULT_SAMPLE_MASK) 3138 return alloc_temporary(v->bld, 1, &v->sample_mask, 1); 3139 3140 else if (l >= FRAG_RESULT_DATA0 && 3141 l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS) 3142 return alloc_temporary(v->bld, 4, 3143 &v->outputs[l - FRAG_RESULT_DATA0], 1); 3144 3145 else 3146 unreachable("Invalid location"); 3147} 3148 3149void 3150fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, 3151 nir_intrinsic_instr *instr) 3152{ 3153 assert(stage == MESA_SHADER_FRAGMENT); 3154 3155 fs_reg dest; 3156 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3157 dest = get_nir_dest(instr->dest); 3158 3159 switch (instr->intrinsic) { 3160 case nir_intrinsic_load_front_face: 3161 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), 3162 *emit_frontfacing_interpolation()); 3163 break; 3164 3165 case nir_intrinsic_load_sample_pos: { 3166 fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; 3167 assert(sample_pos.file != BAD_FILE); 3168 dest.type = sample_pos.type; 3169 bld.MOV(dest, sample_pos); 3170 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); 3171 break; 3172 } 3173 3174 case nir_intrinsic_load_layer_id: 3175 dest.type = BRW_REGISTER_TYPE_UD; 3176 bld.MOV(dest, fetch_render_target_array_index(bld)); 3177 break; 3178 3179 case nir_intrinsic_load_helper_invocation: 3180 case nir_intrinsic_load_sample_mask_in: 3181 case nir_intrinsic_load_sample_id: { 3182 gl_system_value sv = 
nir_system_value_from_intrinsic(instr->intrinsic); 3183 fs_reg val = nir_system_values[sv]; 3184 assert(val.file != BAD_FILE); 3185 dest.type = val.type; 3186 bld.MOV(dest, val); 3187 break; 3188 } 3189 3190 case nir_intrinsic_store_output: { 3191 const fs_reg src = get_nir_src(instr->src[0]); 3192 const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]); 3193 assert(const_offset && "Indirect output stores not allowed"); 3194 const unsigned location = nir_intrinsic_base(instr) + 3195 SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION); 3196 const fs_reg new_dest = retype(alloc_frag_output(this, location), 3197 src.type); 3198 3199 for (unsigned j = 0; j < instr->num_components; j++) 3200 bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j), 3201 offset(src, bld, j)); 3202 3203 break; 3204 } 3205 3206 case nir_intrinsic_load_output: { 3207 const unsigned l = GET_FIELD(nir_intrinsic_base(instr), 3208 BRW_NIR_FRAG_OUTPUT_LOCATION); 3209 assert(l >= FRAG_RESULT_DATA0); 3210 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 3211 assert(const_offset && "Indirect output loads not allowed"); 3212 const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0]; 3213 const fs_reg tmp = bld.vgrf(dest.type, 4); 3214 3215 if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch) 3216 emit_coherent_fb_read(bld, tmp, target); 3217 else 3218 emit_non_coherent_fb_read(bld, tmp, target); 3219 3220 for (unsigned j = 0; j < instr->num_components; j++) { 3221 bld.MOV(offset(dest, bld, j), 3222 offset(tmp, bld, nir_intrinsic_component(instr) + j)); 3223 } 3224 3225 break; 3226 } 3227 3228 case nir_intrinsic_discard: 3229 case nir_intrinsic_discard_if: { 3230 /* We track our discarded pixels in f0.1. By predicating on it, we can 3231 * update just the flag bits that aren't yet discarded. If there's no 3232 * condition, we emit a CMP of g0 != g0, so all currently executing 3233 * channels will get turned off. 3234 */ 3235 fs_inst *cmp; 3236 if (instr->intrinsic == nir_intrinsic_discard_if) { 3237 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]), 3238 brw_imm_d(0), BRW_CONDITIONAL_Z); 3239 } else { 3240 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), 3241 BRW_REGISTER_TYPE_UW)); 3242 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ); 3243 } 3244 cmp->predicate = BRW_PREDICATE_NORMAL; 3245 cmp->flag_subreg = 1; 3246 3247 if (devinfo->gen >= 6) { 3248 emit_discard_jump(); 3249 } 3250 3251 limit_dispatch_width(16, "Fragment discard not implemented in SIMD32 mode."); 3252 break; 3253 } 3254 3255 case nir_intrinsic_load_input: { 3256 /* load_input is only used for flat inputs */ 3257 unsigned base = nir_intrinsic_base(instr); 3258 unsigned comp = nir_intrinsic_component(instr); 3259 unsigned num_components = instr->num_components; 3260 fs_reg orig_dest = dest; 3261 enum brw_reg_type type = dest.type; 3262 3263 /* Special case fields in the VUE header */ 3264 if (base == VARYING_SLOT_LAYER) 3265 comp = 1; 3266 else if (base == VARYING_SLOT_VIEWPORT) 3267 comp = 2; 3268 3269 if (nir_dest_bit_size(instr->dest) == 64) { 3270 /* const_index is in 32-bit type size units that could not be aligned 3271 * with DF. We need to read the double vector as if it was a float 3272 * vector of twice the number of components to fetch the right data. 
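    *
    * (E.g. a flat dvec2 input is loaded as four 32-bit MOVs from the
    * interpolation registers into a temporary and then shuffled back
    * into two 64-bit components below.)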
3273 */ 3274 type = BRW_REGISTER_TYPE_F; 3275 num_components *= 2; 3276 dest = bld.vgrf(type, num_components); 3277 } 3278 3279 for (unsigned int i = 0; i < num_components; i++) { 3280 bld.MOV(offset(retype(dest, type), bld, i), 3281 retype(component(interp_reg(base, comp + i), 3), type)); 3282 } 3283 3284 if (nir_dest_bit_size(instr->dest) == 64) { 3285 shuffle_from_32bit_read(bld, orig_dest, dest, 0, 3286 instr->num_components); 3287 } 3288 break; 3289 } 3290 3291 case nir_intrinsic_load_barycentric_pixel: 3292 case nir_intrinsic_load_barycentric_centroid: 3293 case nir_intrinsic_load_barycentric_sample: 3294 /* Do nothing - load_interpolated_input handling will handle it later. */ 3295 break; 3296 3297 case nir_intrinsic_load_barycentric_at_sample: { 3298 const glsl_interp_mode interpolation = 3299 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3300 3301 nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]); 3302 3303 if (const_sample) { 3304 unsigned msg_data = const_sample->i32[0] << 4; 3305 3306 emit_pixel_interpolater_send(bld, 3307 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3308 dest, 3309 fs_reg(), /* src */ 3310 brw_imm_ud(msg_data), 3311 interpolation); 3312 } else { 3313 const fs_reg sample_src = retype(get_nir_src(instr->src[0]), 3314 BRW_REGISTER_TYPE_UD); 3315 3316 if (nir_src_is_dynamically_uniform(instr->src[0])) { 3317 const fs_reg sample_id = bld.emit_uniformize(sample_src); 3318 const fs_reg msg_data = vgrf(glsl_type::uint_type); 3319 bld.exec_all().group(1, 0) 3320 .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3321 emit_pixel_interpolater_send(bld, 3322 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3323 dest, 3324 fs_reg(), /* src */ 3325 msg_data, 3326 interpolation); 3327 } else { 3328 /* Make a loop that sends a message to the pixel interpolater 3329 * for the sample number in each live channel. If there are 3330 * multiple channels with the same sample number then these 3331 * will be handled simultaneously with a single interation of 3332 * the loop. 
3333 */ 3334 bld.emit(BRW_OPCODE_DO); 3335 3336 /* Get the next live sample number into sample_id_reg */ 3337 const fs_reg sample_id = bld.emit_uniformize(sample_src); 3338 3339 /* Set the flag register so that we can perform the send 3340 * message on all channels that have the same sample number 3341 */ 3342 bld.CMP(bld.null_reg_ud(), 3343 sample_src, sample_id, 3344 BRW_CONDITIONAL_EQ); 3345 const fs_reg msg_data = vgrf(glsl_type::uint_type); 3346 bld.exec_all().group(1, 0) 3347 .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3348 fs_inst *inst = 3349 emit_pixel_interpolater_send(bld, 3350 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3351 dest, 3352 fs_reg(), /* src */ 3353 component(msg_data, 0), 3354 interpolation); 3355 set_predicate(BRW_PREDICATE_NORMAL, inst); 3356 3357 /* Continue the loop if there are any live channels left */ 3358 set_predicate_inv(BRW_PREDICATE_NORMAL, 3359 true, /* inverse */ 3360 bld.emit(BRW_OPCODE_WHILE)); 3361 } 3362 } 3363 break; 3364 } 3365 3366 case nir_intrinsic_load_barycentric_at_offset: { 3367 const glsl_interp_mode interpolation = 3368 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3369 3370 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 3371 3372 if (const_offset) { 3373 unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf; 3374 unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf; 3375 3376 emit_pixel_interpolater_send(bld, 3377 FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, 3378 dest, 3379 fs_reg(), /* src */ 3380 brw_imm_ud(off_x | (off_y << 4)), 3381 interpolation); 3382 } else { 3383 fs_reg src = vgrf(glsl_type::ivec2_type); 3384 fs_reg offset_src = retype(get_nir_src(instr->src[0]), 3385 BRW_REGISTER_TYPE_F); 3386 for (int i = 0; i < 2; i++) { 3387 fs_reg temp = vgrf(glsl_type::float_type); 3388 bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f)); 3389 fs_reg itemp = vgrf(glsl_type::int_type); 3390 /* float to int */ 3391 bld.MOV(itemp, temp); 3392 3393 /* Clamp the upper end of the range to +7/16. 3394 * ARB_gpu_shader5 requires that we support a maximum offset 3395 * of +0.5, which isn't representable in a S0.4 value -- if 3396 * we didn't clamp it, we'd end up with -8/16, which is the 3397 * opposite of what the shader author wanted. 
3398 * 3399 * This is legal due to ARB_gpu_shader5's quantization 3400 * rules: 3401 * 3402 * "Not all values of <offset> may be supported; x and y 3403 * offsets may be rounded to fixed-point values with the 3404 * number of fraction bits given by the 3405 * implementation-dependent constant 3406 * FRAGMENT_INTERPOLATION_OFFSET_BITS" 3407 */ 3408 set_condmod(BRW_CONDITIONAL_L, 3409 bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7))); 3410 } 3411 3412 const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; 3413 emit_pixel_interpolater_send(bld, 3414 opcode, 3415 dest, 3416 src, 3417 brw_imm_ud(0u), 3418 interpolation); 3419 } 3420 break; 3421 } 3422 3423 case nir_intrinsic_load_interpolated_input: { 3424 if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) { 3425 emit_fragcoord_interpolation(dest); 3426 break; 3427 } 3428 3429 assert(instr->src[0].ssa && 3430 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic); 3431 nir_intrinsic_instr *bary_intrinsic = 3432 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); 3433 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; 3434 enum glsl_interp_mode interp_mode = 3435 (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); 3436 fs_reg dst_xy; 3437 3438 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || 3439 bary_intrin == nir_intrinsic_load_barycentric_at_sample) { 3440 /* Use the result of the PI message */ 3441 dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); 3442 } else { 3443 /* Use the delta_xy values computed from the payload */ 3444 enum brw_barycentric_mode bary = 3445 brw_barycentric_mode(interp_mode, bary_intrin); 3446 3447 dst_xy = this->delta_xy[bary]; 3448 } 3449 3450 for (unsigned int i = 0; i < instr->num_components; i++) { 3451 fs_reg interp = 3452 component(interp_reg(nir_intrinsic_base(instr), 3453 nir_intrinsic_component(instr) + i), 0); 3454 interp.type = BRW_REGISTER_TYPE_F; 3455 dest.type = BRW_REGISTER_TYPE_F; 3456 3457 if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) { 3458 fs_reg tmp = vgrf(glsl_type::float_type); 3459 bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); 3460 bld.MUL(offset(dest, bld, i), tmp, this->pixel_w); 3461 } else { 3462 bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); 3463 } 3464 } 3465 break; 3466 } 3467 3468 default: 3469 nir_emit_intrinsic(bld, instr); 3470 break; 3471 } 3472} 3473 3474static int 3475get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) 3476{ 3477 const nir_const_value *const val = nir_src_as_const_value(instr->src[src]); 3478 3479 if (val != NULL) { 3480 if (val->i32[0] == 1) 3481 return BRW_AOP_INC; 3482 else if (val->i32[0] == -1) 3483 return BRW_AOP_DEC; 3484 } 3485 3486 return BRW_AOP_ADD; 3487} 3488 3489void 3490fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, 3491 nir_intrinsic_instr *instr) 3492{ 3493 assert(stage == MESA_SHADER_COMPUTE); 3494 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); 3495 3496 fs_reg dest; 3497 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3498 dest = get_nir_dest(instr->dest); 3499 3500 switch (instr->intrinsic) { 3501 case nir_intrinsic_barrier: 3502 emit_barrier(); 3503 cs_prog_data->uses_barrier = true; 3504 break; 3505 3506 case nir_intrinsic_load_subgroup_id: 3507 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id); 3508 break; 3509 3510 case nir_intrinsic_load_local_invocation_id: 3511 case nir_intrinsic_load_work_group_id: { 3512 gl_system_value sv = 
nir_system_value_from_intrinsic(instr->intrinsic); 3513 fs_reg val = nir_system_values[sv]; 3514 assert(val.file != BAD_FILE); 3515 dest.type = val.type; 3516 for (unsigned i = 0; i < 3; i++) 3517 bld.MOV(offset(dest, bld, i), offset(val, bld, i)); 3518 break; 3519 } 3520 3521 case nir_intrinsic_load_num_work_groups: { 3522 const unsigned surface = 3523 cs_prog_data->binding_table.work_groups_start; 3524 3525 cs_prog_data->uses_num_work_groups = true; 3526 3527 fs_reg surf_index = brw_imm_ud(surface); 3528 brw_mark_surface_used(prog_data, surface); 3529 3530 /* Read the 3 GLuint components of gl_NumWorkGroups */ 3531 for (unsigned i = 0; i < 3; i++) { 3532 fs_reg read_result = 3533 emit_untyped_read(bld, surf_index, 3534 brw_imm_ud(i << 2), 3535 1 /* dims */, 1 /* size */, 3536 BRW_PREDICATE_NONE); 3537 read_result.type = dest.type; 3538 bld.MOV(dest, read_result); 3539 dest = offset(dest, bld, 1); 3540 } 3541 break; 3542 } 3543 3544 case nir_intrinsic_shared_atomic_add: 3545 nir_emit_shared_atomic(bld, get_op_for_atomic_add(instr, 1), instr); 3546 break; 3547 case nir_intrinsic_shared_atomic_imin: 3548 nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr); 3549 break; 3550 case nir_intrinsic_shared_atomic_umin: 3551 nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr); 3552 break; 3553 case nir_intrinsic_shared_atomic_imax: 3554 nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr); 3555 break; 3556 case nir_intrinsic_shared_atomic_umax: 3557 nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr); 3558 break; 3559 case nir_intrinsic_shared_atomic_and: 3560 nir_emit_shared_atomic(bld, BRW_AOP_AND, instr); 3561 break; 3562 case nir_intrinsic_shared_atomic_or: 3563 nir_emit_shared_atomic(bld, BRW_AOP_OR, instr); 3564 break; 3565 case nir_intrinsic_shared_atomic_xor: 3566 nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr); 3567 break; 3568 case nir_intrinsic_shared_atomic_exchange: 3569 nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr); 3570 break; 3571 case nir_intrinsic_shared_atomic_comp_swap: 3572 nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr); 3573 break; 3574 case nir_intrinsic_shared_atomic_fmin: 3575 nir_emit_shared_atomic_float(bld, BRW_AOP_FMIN, instr); 3576 break; 3577 case nir_intrinsic_shared_atomic_fmax: 3578 nir_emit_shared_atomic_float(bld, BRW_AOP_FMAX, instr); 3579 break; 3580 case nir_intrinsic_shared_atomic_fcomp_swap: 3581 nir_emit_shared_atomic_float(bld, BRW_AOP_FCMPWR, instr); 3582 break; 3583 3584 case nir_intrinsic_load_shared: { 3585 assert(devinfo->gen >= 7); 3586 3587 fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); 3588 3589 /* Get the offset to read from */ 3590 fs_reg offset_reg; 3591 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 3592 if (const_offset) { 3593 offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]); 3594 } else { 3595 offset_reg = vgrf(glsl_type::uint_type); 3596 bld.ADD(offset_reg, 3597 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), 3598 brw_imm_ud(instr->const_index[0])); 3599 } 3600 3601 /* Read the vector */ 3602 do_untyped_vector_read(bld, dest, surf_index, offset_reg, 3603 instr->num_components); 3604 break; 3605 } 3606 3607 case nir_intrinsic_store_shared: { 3608 assert(devinfo->gen >= 7); 3609 3610 /* Block index */ 3611 fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM); 3612 3613 /* Value */ 3614 fs_reg val_reg = get_nir_src(instr->src[0]); 3615 3616 /* Writemask */ 3617 unsigned writemask = instr->const_index[1]; 3618 3619 /* get_nir_src() retypes to integer. 
Be wary of 64-bit types though
3620 * since the untyped writes below operate in units of 32 bits, which
3621 * means that we need to write twice as many components each time.
3622 * Also, we have to shuffle 64-bit data to be in the appropriate layout
3623 * expected by our 32-bit write messages.
3624 */
3625 unsigned type_size = 4;
3626 if (nir_src_bit_size(instr->src[0]) == 64) {
3627 type_size = 8;
3628 val_reg = shuffle_for_32bit_write(bld, val_reg, 0,
3629 instr->num_components);
3630 }
3631
3632 unsigned type_slots = type_size / 4;
3633
3634 /* Combine groups of consecutive enabled channels in one write
3635 * message. We use ffs to find the first enabled channel and then ffs on
3636 * the bit-inverse, down-shifted writemask to determine the length of
3637 * the block of enabled bits.
3638 */
3639 while (writemask) {
3640 unsigned first_component = ffs(writemask) - 1;
3641 unsigned length = ffs(~(writemask >> first_component)) - 1;
3642
3643 /* We can't write more than 2 64-bit components at once. Limit the
3644 * length of the write to what we can do and let the next iteration
3645 * handle the rest.
3646 */
3647 if (type_size > 4)
3648 length = MIN2(2, length);
3649
3650 fs_reg offset_reg;
3651 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
3652 if (const_offset) {
3653 offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
3654 type_size * first_component);
3655 } else {
3656 offset_reg = vgrf(glsl_type::uint_type);
3657 bld.ADD(offset_reg,
3658 retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
3659 brw_imm_ud(instr->const_index[0] + type_size * first_component));
3660 }
3661
3662 emit_untyped_write(bld, surf_index, offset_reg,
3663 offset(val_reg, bld, first_component * type_slots),
3664 1 /* dims */, length * type_slots,
3665 BRW_PREDICATE_NONE);
3666
3667 /* Clear the bits in the writemask that we just wrote, then try
3668 * again to see if more channels are left.
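* For example, a writemask of 0b1011 first emits a 2-component write of
* components 0-1 (first_component = 0, length = 2), leaving 0b1000, and a
* second iteration then emits a 1-component write of component 3.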
3669 */ 3670 writemask &= (15 << (first_component + length)); 3671 } 3672 3673 break; 3674 } 3675 3676 default: 3677 nir_emit_intrinsic(bld, instr); 3678 break; 3679 } 3680} 3681 3682static fs_reg 3683brw_nir_reduction_op_identity(const fs_builder &bld, 3684 nir_op op, brw_reg_type type) 3685{ 3686 nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8); 3687 switch (type_sz(type)) { 3688 case 2: 3689 assert(type != BRW_REGISTER_TYPE_HF); 3690 return retype(brw_imm_uw(value.u16[0]), type); 3691 case 4: 3692 return retype(brw_imm_ud(value.u32[0]), type); 3693 case 8: 3694 if (type == BRW_REGISTER_TYPE_DF) 3695 return setup_imm_df(bld, value.f64[0]); 3696 else 3697 return retype(brw_imm_u64(value.u64[0]), type); 3698 default: 3699 unreachable("Invalid type size"); 3700 } 3701} 3702 3703static opcode 3704brw_op_for_nir_reduction_op(nir_op op) 3705{ 3706 switch (op) { 3707 case nir_op_iadd: return BRW_OPCODE_ADD; 3708 case nir_op_fadd: return BRW_OPCODE_ADD; 3709 case nir_op_imul: return BRW_OPCODE_MUL; 3710 case nir_op_fmul: return BRW_OPCODE_MUL; 3711 case nir_op_imin: return BRW_OPCODE_SEL; 3712 case nir_op_umin: return BRW_OPCODE_SEL; 3713 case nir_op_fmin: return BRW_OPCODE_SEL; 3714 case nir_op_imax: return BRW_OPCODE_SEL; 3715 case nir_op_umax: return BRW_OPCODE_SEL; 3716 case nir_op_fmax: return BRW_OPCODE_SEL; 3717 case nir_op_iand: return BRW_OPCODE_AND; 3718 case nir_op_ior: return BRW_OPCODE_OR; 3719 case nir_op_ixor: return BRW_OPCODE_XOR; 3720 default: 3721 unreachable("Invalid reduction operation"); 3722 } 3723} 3724 3725static brw_conditional_mod 3726brw_cond_mod_for_nir_reduction_op(nir_op op) 3727{ 3728 switch (op) { 3729 case nir_op_iadd: return BRW_CONDITIONAL_NONE; 3730 case nir_op_fadd: return BRW_CONDITIONAL_NONE; 3731 case nir_op_imul: return BRW_CONDITIONAL_NONE; 3732 case nir_op_fmul: return BRW_CONDITIONAL_NONE; 3733 case nir_op_imin: return BRW_CONDITIONAL_L; 3734 case nir_op_umin: return BRW_CONDITIONAL_L; 3735 case nir_op_fmin: return BRW_CONDITIONAL_L; 3736 case nir_op_imax: return BRW_CONDITIONAL_GE; 3737 case nir_op_umax: return BRW_CONDITIONAL_GE; 3738 case nir_op_fmax: return BRW_CONDITIONAL_GE; 3739 case nir_op_iand: return BRW_CONDITIONAL_NONE; 3740 case nir_op_ior: return BRW_CONDITIONAL_NONE; 3741 case nir_op_ixor: return BRW_CONDITIONAL_NONE; 3742 default: 3743 unreachable("Invalid reduction operation"); 3744 } 3745} 3746 3747fs_reg 3748fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld, 3749 nir_intrinsic_instr *instr) 3750{ 3751 fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD); 3752 3753 if (stage_prog_data->binding_table.image_start > 0) { 3754 if (image.file == BRW_IMMEDIATE_VALUE) { 3755 image.d += stage_prog_data->binding_table.image_start; 3756 } else { 3757 bld.ADD(image, image, 3758 brw_imm_d(stage_prog_data->binding_table.image_start)); 3759 } 3760 } 3761 3762 return bld.emit_uniformize(image); 3763} 3764 3765static unsigned 3766image_intrinsic_coord_components(nir_intrinsic_instr *instr) 3767{ 3768 switch (nir_intrinsic_image_dim(instr)) { 3769 case GLSL_SAMPLER_DIM_1D: 3770 return 1 + nir_intrinsic_image_array(instr); 3771 case GLSL_SAMPLER_DIM_2D: 3772 case GLSL_SAMPLER_DIM_RECT: 3773 return 2 + nir_intrinsic_image_array(instr); 3774 case GLSL_SAMPLER_DIM_3D: 3775 case GLSL_SAMPLER_DIM_CUBE: 3776 return 3; 3777 case GLSL_SAMPLER_DIM_BUF: 3778 return 1; 3779 case GLSL_SAMPLER_DIM_MS: 3780 return 2 + nir_intrinsic_image_array(instr); 3781 default: 3782 unreachable("Invalid image 
dimension"); 3783 } 3784} 3785 3786void 3787fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) 3788{ 3789 fs_reg dest; 3790 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3791 dest = get_nir_dest(instr->dest); 3792 3793 switch (instr->intrinsic) { 3794 case nir_intrinsic_image_load: 3795 case nir_intrinsic_image_store: 3796 case nir_intrinsic_image_atomic_add: 3797 case nir_intrinsic_image_atomic_min: 3798 case nir_intrinsic_image_atomic_max: 3799 case nir_intrinsic_image_atomic_and: 3800 case nir_intrinsic_image_atomic_or: 3801 case nir_intrinsic_image_atomic_xor: 3802 case nir_intrinsic_image_atomic_exchange: 3803 case nir_intrinsic_image_atomic_comp_swap: { 3804 if (stage == MESA_SHADER_FRAGMENT && 3805 instr->intrinsic != nir_intrinsic_image_load) 3806 brw_wm_prog_data(prog_data)->has_side_effects = true; 3807 3808 /* Get some metadata from the image intrinsic. */ 3809 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; 3810 const unsigned dims = image_intrinsic_coord_components(instr); 3811 const GLenum format = nir_intrinsic_format(instr); 3812 const unsigned dest_components = nir_intrinsic_dest_components(instr); 3813 3814 /* Get the arguments of the image intrinsic. */ 3815 const fs_reg image = get_nir_image_intrinsic_image(bld, instr); 3816 const fs_reg coords = retype(get_nir_src(instr->src[1]), 3817 BRW_REGISTER_TYPE_UD); 3818 fs_reg tmp; 3819 3820 /* Emit an image load, store or atomic op. */ 3821 if (instr->intrinsic == nir_intrinsic_image_load) { 3822 tmp = emit_typed_read(bld, image, coords, dims, 3823 instr->num_components); 3824 } else if (instr->intrinsic == nir_intrinsic_image_store) { 3825 const fs_reg src0 = get_nir_src(instr->src[3]); 3826 emit_typed_write(bld, image, coords, src0, dims, 3827 instr->num_components); 3828 } else { 3829 int op; 3830 unsigned num_srcs = info->num_srcs; 3831 3832 switch (instr->intrinsic) { 3833 case nir_intrinsic_image_atomic_add: 3834 assert(num_srcs == 4); 3835 3836 op = get_op_for_atomic_add(instr, 3); 3837 3838 if (op != BRW_AOP_ADD) 3839 num_srcs = 3; 3840 break; 3841 case nir_intrinsic_image_atomic_min: 3842 assert(format == GL_R32UI || format == GL_R32I); 3843 op = (format == GL_R32I) ? BRW_AOP_IMIN : BRW_AOP_UMIN; 3844 break; 3845 case nir_intrinsic_image_atomic_max: 3846 assert(format == GL_R32UI || format == GL_R32I); 3847 op = (format == GL_R32I) ? BRW_AOP_IMAX : BRW_AOP_UMAX; 3848 break; 3849 case nir_intrinsic_image_atomic_and: 3850 op = BRW_AOP_AND; 3851 break; 3852 case nir_intrinsic_image_atomic_or: 3853 op = BRW_AOP_OR; 3854 break; 3855 case nir_intrinsic_image_atomic_xor: 3856 op = BRW_AOP_XOR; 3857 break; 3858 case nir_intrinsic_image_atomic_exchange: 3859 op = BRW_AOP_MOV; 3860 break; 3861 case nir_intrinsic_image_atomic_comp_swap: 3862 op = BRW_AOP_CMPWR; 3863 break; 3864 default: 3865 unreachable("Not reachable."); 3866 } 3867 3868 const fs_reg src0 = (num_srcs >= 4 ? 3869 get_nir_src(instr->src[3]) : fs_reg()); 3870 const fs_reg src1 = (num_srcs >= 5 ? 3871 get_nir_src(instr->src[4]) : fs_reg()); 3872 3873 tmp = emit_typed_atomic(bld, image, coords, src0, src1, dims, 1, op); 3874 } 3875 3876 /* Assign the result. 
*/
3877 for (unsigned c = 0; c < dest_components; ++c) {
3878 bld.MOV(offset(retype(dest, tmp.type), bld, c),
3879 offset(tmp, bld, c));
3880 }
3881 break;
3882 }
3883
3884 case nir_intrinsic_image_size: {
3885 /* Unlike the [un]typed load and store opcodes, the TXS that this turns
3886 * into will handle the binding table index for us in the generator.
3887 */
3888 fs_reg image = retype(get_nir_src_imm(instr->src[0]),
3889 BRW_REGISTER_TYPE_UD);
3890 image = bld.emit_uniformize(image);
3891
3892 /* Since the image size is always uniform, we can just emit a SIMD8
3893 * query instruction and splat the result out.
3894 */
3895 const fs_builder ubld = bld.exec_all().group(8, 0);
3896
3897 /* The LOD also serves as the message payload */
3898 fs_reg lod = ubld.vgrf(BRW_REGISTER_TYPE_UD);
3899 ubld.MOV(lod, brw_imm_ud(0));
3900
3901 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
3902 fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE, tmp, lod, image);
3903 inst->mlen = 1;
3904 inst->size_written = 4 * REG_SIZE;
3905
3906 for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) {
3907 if (c == 2 && nir_intrinsic_image_dim(instr) == GLSL_SAMPLER_DIM_CUBE) {
3908 bld.emit(SHADER_OPCODE_INT_QUOTIENT,
3909 offset(retype(dest, tmp.type), bld, c),
3910 component(offset(tmp, ubld, c), 0), brw_imm_ud(6));
3911 } else {
3912 bld.MOV(offset(retype(dest, tmp.type), bld, c),
3913 component(offset(tmp, ubld, c), 0));
3914 }
3915 }
3916 break;
3917 }
3918
3919 case nir_intrinsic_image_load_raw_intel: {
3920 const fs_reg image = get_nir_image_intrinsic_image(bld, instr);
3921 const fs_reg addr = retype(get_nir_src(instr->src[1]),
3922 BRW_REGISTER_TYPE_UD);
3923
3924 fs_reg tmp = emit_untyped_read(bld, image, addr, 1,
3925 instr->num_components);
3926
3927 for (unsigned c = 0; c < instr->num_components; ++c) {
3928 bld.MOV(offset(retype(dest, tmp.type), bld, c),
3929 offset(tmp, bld, c));
3930 }
3931 break;
3932 }
3933
3934 case nir_intrinsic_image_store_raw_intel: {
3935 const fs_reg image = get_nir_image_intrinsic_image(bld, instr);
3936 const fs_reg addr = retype(get_nir_src(instr->src[1]),
3937 BRW_REGISTER_TYPE_UD);
3938 const fs_reg data = retype(get_nir_src(instr->src[2]),
3939 BRW_REGISTER_TYPE_UD);
3940
3941 brw_wm_prog_data(prog_data)->has_side_effects = true;
3942
3943 emit_untyped_write(bld, image, addr, data, 1,
3944 instr->num_components);
3945 break;
3946 }
3947
3948 case nir_intrinsic_group_memory_barrier:
3949 case nir_intrinsic_memory_barrier_shared:
3950 case nir_intrinsic_memory_barrier_atomic_counter:
3951 case nir_intrinsic_memory_barrier_buffer:
3952 case nir_intrinsic_memory_barrier_image:
3953 case nir_intrinsic_memory_barrier: {
3954 const fs_builder ubld = bld.group(8, 0);
3955 const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
3956 ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
3957 ->size_written = 2 * REG_SIZE;
3958 break;
3959 }
3960
3961 case nir_intrinsic_shader_clock: {
3962 /* We cannot do anything if there is an event, so ignore it for now */
3963 const fs_reg shader_clock = get_timestamp(bld);
3964 const fs_reg srcs[] = { component(shader_clock, 0),
3965 component(shader_clock, 1) };
3966 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
3967 break;
3968 }
3969
3970 case nir_intrinsic_image_samples:
3971 /* The driver does not support multi-sampled images.
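* A query of the sample count therefore always returns 1 here.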
*/
3972 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
3973 break;
3974
3975 case nir_intrinsic_load_uniform: {
3976 /* Offsets are in bytes but they should always be aligned to
3977 * the type size
3978 */
3979 assert(instr->const_index[0] % 4 == 0 ||
3980 instr->const_index[0] % type_sz(dest.type) == 0);
3981
3982 fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
3983
3984 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
3985 if (const_offset) {
3986 assert(const_offset->u32[0] % type_sz(dest.type) == 0);
3987 /* For 16-bit types we add the modulo of const_index[0] to the
3988 * offset so we can access elements that are not 32-bit aligned.
3989 */
3990 src.offset = const_offset->u32[0] + instr->const_index[0] % 4;
3991
3992 for (unsigned j = 0; j < instr->num_components; j++) {
3993 bld.MOV(offset(dest, bld, j), offset(src, bld, j));
3994 }
3995 } else {
3996 fs_reg indirect = retype(get_nir_src(instr->src[0]),
3997 BRW_REGISTER_TYPE_UD);
3998
3999 /* We need to pass a size to the MOV_INDIRECT but we don't want it to
4000 * go past the end of the uniform. In order to keep the n'th
4001 * component from running past, we subtract off the size of all but
4002 * one component of the vector.
4003 */
4004 assert(instr->const_index[1] >=
4005 instr->num_components * (int) type_sz(dest.type));
4006 unsigned read_size = instr->const_index[1] -
4007 (instr->num_components - 1) * type_sz(dest.type);
4008
4009 bool supports_64bit_indirects =
4010 !devinfo->is_cherryview && !gen_device_info_is_9lp(devinfo);
4011
4012 if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
4013 for (unsigned j = 0; j < instr->num_components; j++) {
4014 bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4015 offset(dest, bld, j), offset(src, bld, j),
4016 indirect, brw_imm_ud(read_size));
4017 }
4018 } else {
4019 const unsigned num_mov_indirects =
4020 type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
4021 /* We read a little less per MOV INDIRECT, as they are now
4022 * 32-bit ones instead of 64-bit. Adjust read_size accordingly.
4023 */
4024 const unsigned read_size_32bit = read_size -
4025 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
4026 for (unsigned j = 0; j < instr->num_components; j++) {
4027 for (unsigned i = 0; i < num_mov_indirects; i++) {
4028 bld.emit(SHADER_OPCODE_MOV_INDIRECT,
4029 subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
4030 subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
4031 indirect, brw_imm_ud(read_size_32bit));
4032 }
4033 }
4034 }
4035 }
4036 break;
4037 }
4038
4039 case nir_intrinsic_load_ubo: {
4040 nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
4041 fs_reg surf_index;
4042
4043 if (const_index) {
4044 const unsigned index = stage_prog_data->binding_table.ubo_start +
4045 const_index->u32[0];
4046 surf_index = brw_imm_ud(index);
4047 brw_mark_surface_used(prog_data, index);
4048 } else {
4049 /* The block index is not a constant. Evaluate the index expression
4050 * per-channel and add the base UBO index; we have to select a value
4051 * from any live channel.
4052 */
4053 surf_index = vgrf(glsl_type::uint_type);
4054 bld.ADD(surf_index, get_nir_src(instr->src[0]),
4055 brw_imm_ud(stage_prog_data->binding_table.ubo_start));
4056 surf_index = bld.emit_uniformize(surf_index);
4057
4058 /* Assume this may touch any UBO. It would be nice to provide
4059 * a tighter bound, but the array information is already lowered away.
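* (Marking the highest possible UBO surface index keeps the whole
* binding-table range conservatively flagged as used.)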
4060 */
4061 brw_mark_surface_used(prog_data,
4062 stage_prog_data->binding_table.ubo_start +
4063 nir->info.num_ubos - 1);
4064 }
4065
4066 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4067 if (const_offset == NULL) {
4068 fs_reg base_offset = retype(get_nir_src(instr->src[1]),
4069 BRW_REGISTER_TYPE_UD);
4070
4071 for (int i = 0; i < instr->num_components; i++)
4072 VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
4073 base_offset, i * type_sz(dest.type));
4074 } else {
4075 /* Even if we are loading doubles, a pull constant load will load
4076 * a 32-bit vec4, so we should only reserve vgrf space for that. If we
4077 * need to load a full dvec4 we will have to emit 2 loads. This is
4078 * similar to demote_pull_constants(), except that in that case we
4079 * see individual accesses to each component of the vector and then
4080 * we let CSE deal with duplicate loads. Here we see a vector access
4081 * and we have to split it if necessary.
4082 */
4083 const unsigned type_size = type_sz(dest.type);
4084
4085 /* See if we've selected this as a push constant candidate */
4086 if (const_index) {
4087 const unsigned ubo_block = const_index->u32[0];
4088 const unsigned offset_256b = const_offset->u32[0] / 32;
4089
4090 fs_reg push_reg;
4091 for (int i = 0; i < 4; i++) {
4092 const struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
4093 if (range->block == ubo_block &&
4094 offset_256b >= range->start &&
4095 offset_256b < range->start + range->length) {
4096
4097 push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type);
4098 push_reg.offset = const_offset->u32[0] - 32 * range->start;
4099 break;
4100 }
4101 }
4102
4103 if (push_reg.file != BAD_FILE) {
4104 for (unsigned i = 0; i < instr->num_components; i++) {
4105 bld.MOV(offset(dest, bld, i),
4106 byte_offset(push_reg, i * type_size));
4107 }
4108 break;
4109 }
4110 }
4111
4112 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
4113 const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
4114 const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4115
4116 for (unsigned c = 0; c < instr->num_components;) {
4117 const unsigned base = const_offset->u32[0] + c * type_size;
4118 /* Number of usable components in the next block-aligned load. */
4119 const unsigned count = MIN2(instr->num_components - c,
4120 (block_sz - base % block_sz) / type_size);
4121
4122 ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
4123 packed_consts, surf_index,
4124 brw_imm_ud(base & ~(block_sz - 1)));
4125
4126 const fs_reg consts =
4127 retype(byte_offset(packed_consts, base & (block_sz - 1)),
4128 dest.type);
4129
4130 for (unsigned d = 0; d < count; d++)
4131 bld.MOV(offset(dest, bld, c + d), component(consts, d));
4132
4133 c += count;
4134 }
4135 }
4136 break;
4137 }
4138
4139 case nir_intrinsic_load_ssbo: {
4140 assert(devinfo->gen >= 7);
4141
4142 nir_const_value *const_uniform_block =
4143 nir_src_as_const_value(instr->src[0]);
4144
4145 fs_reg surf_index;
4146 if (const_uniform_block) {
4147 unsigned index = stage_prog_data->binding_table.ssbo_start +
4148 const_uniform_block->u32[0];
4149 surf_index = brw_imm_ud(index);
4150 brw_mark_surface_used(prog_data, index);
4151 } else {
4152 surf_index = vgrf(glsl_type::uint_type);
4153 bld.ADD(surf_index, get_nir_src(instr->src[0]),
4154 brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4155
4156 /* Assume this may touch any SSBO. It would be nice to provide
4157 * a tighter bound, but the array information is already lowered away.
4158 */
4159 brw_mark_surface_used(prog_data,
4160 stage_prog_data->binding_table.ssbo_start +
4161 nir->info.num_ssbos - 1);
4162 }
4163
4164 fs_reg offset_reg;
4165 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4166 if (const_offset) {
4167 offset_reg = brw_imm_ud(const_offset->u32[0]);
4168 } else {
4169 offset_reg = retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD);
4170 }
4171
4172 /* Read the vector */
4173 do_untyped_vector_read(bld, dest, surf_index, offset_reg,
4174 instr->num_components);
4175
4176 break;
4177 }
4178
4179 case nir_intrinsic_store_ssbo: {
4180 assert(devinfo->gen >= 7);
4181
4182 if (stage == MESA_SHADER_FRAGMENT)
4183 brw_wm_prog_data(prog_data)->has_side_effects = true;
4184
4185 /* Block index */
4186 fs_reg surf_index;
4187 nir_const_value *const_uniform_block =
4188 nir_src_as_const_value(instr->src[1]);
4189 if (const_uniform_block) {
4190 unsigned index = stage_prog_data->binding_table.ssbo_start +
4191 const_uniform_block->u32[0];
4192 surf_index = brw_imm_ud(index);
4193 brw_mark_surface_used(prog_data, index);
4194 } else {
4195 surf_index = vgrf(glsl_type::uint_type);
4196 bld.ADD(surf_index, get_nir_src(instr->src[1]),
4197 brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
4198
4199 brw_mark_surface_used(prog_data,
4200 stage_prog_data->binding_table.ssbo_start +
4201 nir->info.num_ssbos - 1);
4202 }
4203
4204 /* Value */
4205 fs_reg val_reg = get_nir_src(instr->src[0]);
4206
4207 /* Writemask */
4208 unsigned writemask = instr->const_index[0];
4209
4210 /* get_nir_src() retypes to integer. Be wary of 64-bit types though
4211 * since the untyped writes below operate in units of 32 bits, which
4212 * means that we need to write twice as many components each time.
4213 * Also, we have to shuffle 64-bit data to be in the appropriate layout
4214 * expected by our 32-bit write messages.
4215 */
4216 unsigned bit_size = nir_src_bit_size(instr->src[0]);
4217 unsigned type_size = bit_size / 8;
4218
4219 /* Combine groups of consecutive enabled channels in one write
4220 * message. We use ffs to find the first enabled channel and then ffs on
4221 * the bit-inverse, down-shifted writemask to determine the num_components
4222 * of the block of enabled bits.
4223 */
4224 while (writemask) {
4225 unsigned first_component = ffs(writemask) - 1;
4226 unsigned num_components = ffs(~(writemask >> first_component)) - 1;
4227 fs_reg write_src = offset(val_reg, bld, first_component);
4228
4229 nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
4230
4231 if (type_size > 4) {
4232 /* We can't write more than 2 64-bit components at once. Limit
4233 * the num_components of the write to what we can do and let the next
4234 * iteration handle the rest.
4235 */
4236 num_components = MIN2(2, num_components);
4237 write_src = shuffle_for_32bit_write(bld, write_src, 0,
4238 num_components);
4239 } else if (type_size < 4) {
4240 /* For 16-bit types we pack two consecutive values into a 32-bit
4241 * word and use an untyped write message. For single values or values
4242 * that are not 32-bit aligned we need to use byte-scattered writes
4243 * because untyped writes work on 32-bit components with 32-bit
4244 * alignment. byte_scattered_write messages only support one
4245 * 16-bit component at a time. As VK_KHR_relaxed_block_layout
4246 * could be enabled, we cannot guarantee that non-constant offsets
4247 * are 32-bit aligned for 16-bit types. For example, an array of
4248 * 16-bit vec3s with an array element stride of 6.
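* (With a stride of 6, the second array element starts at byte offset 6,
* which is not 32-bit aligned.)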
4249 *
4250 * In the case of 32-bit aligned constant offsets, if there is
4251 * a 3-component vector we submit one 32-bit untyped-write
4252 * message (the first two components) and one byte-scattered
4253 * write message (the last component).
4254 */
4255
4256 if (!const_offset || ((const_offset->u32[0] +
4257 type_size * first_component) % 4)) {
4258 /* If we use a .yz writemask we also need to emit 2
4259 * byte-scattered write messages because the y component is
4260 * not 32-bit aligned.
4261 */
4262 num_components = 1;
4263 } else if (num_components * type_size > 4 &&
4264 (num_components * type_size % 4)) {
4265 /* If the pending component size is not a multiple of 4 bytes
4266 * we leave the unaligned components for subsequent emits of
4267 * length == 1 with byte_scattered_write.
4268 */
4269 num_components -= (num_components * type_size % 4) / type_size;
4270 } else if (num_components * type_size < 4) {
4271 num_components = 1;
4272 }
4273 /* For num_components == 1 we are also shuffling the component
4274 * because byte-scattered writes of 16-bit values need them to be
4275 * dword-aligned. Shuffling only one component would be the same as
4276 * striding it.
4277 */
4278 write_src = shuffle_for_32bit_write(bld, write_src, 0,
4279 num_components);
4280 }
4281
4282 fs_reg offset_reg;
4283
4284 if (const_offset) {
4285 offset_reg = brw_imm_ud(const_offset->u32[0] +
4286 type_size * first_component);
4287 } else {
4288 offset_reg = vgrf(glsl_type::uint_type);
4289 bld.ADD(offset_reg,
4290 retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
4291 brw_imm_ud(type_size * first_component));
4292 }
4293
4294 if (type_size < 4 && num_components == 1) {
4295 /* Untyped Surface messages have a fixed 32-bit size, so we need
4296 * to rely on byte-scattered writes in order to write 16-bit elements.
4297 * The byte_scattered_write message requires every written 16-bit
4298 * value to be 32-bit aligned (stride=2).
4299 */
4300 emit_byte_scattered_write(bld, surf_index, offset_reg,
4301 write_src,
4302 1 /* dims */,
4303 bit_size,
4304 BRW_PREDICATE_NONE);
4305 } else {
4306 assert(num_components * type_size <= 16);
4307 assert((num_components * type_size) % 4 == 0);
4308 assert(offset_reg.file != BRW_IMMEDIATE_VALUE ||
4309 offset_reg.ud % 4 == 0);
4310 unsigned num_slots = (num_components * type_size) / 4;
4311
4312 emit_untyped_write(bld, surf_index, offset_reg,
4313 write_src,
4314 1 /* dims */, num_slots,
4315 BRW_PREDICATE_NONE);
4316 }
4317
4318 /* Clear the bits in the writemask that we just wrote, then try
4319 * again to see if more channels are left.
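* For example, a dvec3 store with writemask 0b111 becomes one 2-component
* 64-bit write followed by a second iteration that writes the remaining
* component.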
4320 */
4321 writemask &= (15 << (first_component + num_components));
4322 }
4323 break;
4324 }
4325
4326 case nir_intrinsic_store_output: {
4327 fs_reg src = get_nir_src(instr->src[0]);
4328
4329 nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
4330 assert(const_offset && "Indirect output stores not allowed");
4331
4332 unsigned num_components = instr->num_components;
4333 unsigned first_component = nir_intrinsic_component(instr);
4334 if (nir_src_bit_size(instr->src[0]) == 64) {
4335 src = shuffle_for_32bit_write(bld, src, 0, num_components);
4336 num_components *= 2;
4337 }
4338
4339 fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
4340 4 * const_offset->u32[0]), src.type);
4341 for (unsigned j = 0; j < num_components; j++) {
4342 bld.MOV(offset(new_dest, bld, j + first_component),
4343 offset(src, bld, j));
4344 }
4345 break;
4346 }
4347
4348 case nir_intrinsic_ssbo_atomic_add:
4349 nir_emit_ssbo_atomic(bld, get_op_for_atomic_add(instr, 2), instr);
4350 break;
4351 case nir_intrinsic_ssbo_atomic_imin:
4352 nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
4353 break;
4354 case nir_intrinsic_ssbo_atomic_umin:
4355 nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
4356 break;
4357 case nir_intrinsic_ssbo_atomic_imax:
4358 nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
4359 break;
4360 case nir_intrinsic_ssbo_atomic_umax:
4361 nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
4362 break;
4363 case nir_intrinsic_ssbo_atomic_and:
4364 nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
4365 break;
4366 case nir_intrinsic_ssbo_atomic_or:
4367 nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
4368 break;
4369 case nir_intrinsic_ssbo_atomic_xor:
4370 nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
4371 break;
4372 case nir_intrinsic_ssbo_atomic_exchange:
4373 nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
4374 break;
4375 case nir_intrinsic_ssbo_atomic_comp_swap:
4376 nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
4377 break;
4378 case nir_intrinsic_ssbo_atomic_fmin:
4379 nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMIN, instr);
4380 break;
4381 case nir_intrinsic_ssbo_atomic_fmax:
4382 nir_emit_ssbo_atomic_float(bld, BRW_AOP_FMAX, instr);
4383 break;
4384 case nir_intrinsic_ssbo_atomic_fcomp_swap:
4385 nir_emit_ssbo_atomic_float(bld, BRW_AOP_FCMPWR, instr);
4386 break;
4387
4388 case nir_intrinsic_get_buffer_size: {
4389 nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
4390 unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
4391
4392 /* A resinfo sampler message is used to get the buffer size. The
4393 * SIMD8 writeback message consists of four registers and the SIMD16
4394 * writeback message consists of eight destination registers (two per
4395 * component). Because we are only interested in the first channel of
4396 * the first returned component, where resinfo returns the buffer size
4397 * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
4398 * the dispatch width.
4399 */
4400 const fs_builder ubld = bld.exec_all().group(8, 0);
4401 fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4402 fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
4403
4404 /* Set LOD = 0 */
4405 ubld.MOV(src_payload, brw_imm_d(0));
4406
4407 const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
4408 fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload,
4409 src_payload, brw_imm_ud(index));
4410 inst->header_size = 0;
4411 inst->mlen = 1;
4412 inst->size_written = 4 * REG_SIZE;
4413
4414 /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
4415 *
4416 * "Out-of-bounds checking is always performed at a DWord granularity. If
4417 * any part of the DWord is out-of-bounds then the whole DWord is
4418 * considered out-of-bounds."
4419 *
4420 * This implies that types with size smaller than 4 bytes need to be
4421 * padded if they don't complete the last dword of the buffer. But as we
4422 * need to maintain the original size we need to reverse the padding
4423 * calculation to return the correct size to know the number of elements
4424 * of an unsized array. As we stored the needed padding for the buffer in
4425 * the last two bits of the surface size, we calculate the original
4426 * buffer_size here by reversing the surface_size calculation:
4427 *
4428 * surface_size = isl_align(buffer_size, 4) +
4429 * (isl_align(buffer_size, 4) - buffer_size)
4430 *
4431 * buffer_size = (surface_size & ~3) - (surface_size & 3)
4432 */
4433
4434 fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4435 fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4436 fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);
4437
4438 ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
4439 ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
4440 ubld.ADD(buffer_size, size_aligned4, negate(size_padding));
4441
4442 bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
4443
4444 brw_mark_surface_used(prog_data, index);
4445 break;
4446 }
4447
4448 case nir_intrinsic_load_subgroup_invocation:
4449 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
4450 nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
4451 break;
4452
4453 case nir_intrinsic_load_subgroup_eq_mask:
4454 case nir_intrinsic_load_subgroup_ge_mask:
4455 case nir_intrinsic_load_subgroup_gt_mask:
4456 case nir_intrinsic_load_subgroup_le_mask:
4457 case nir_intrinsic_load_subgroup_lt_mask:
4458 unreachable("not reached");
4459
4460 case nir_intrinsic_vote_any: {
4461 const fs_builder ubld = bld.exec_all().group(1, 0);
4462
4463 /* The any/all predicates do not consider channel enables. To prevent
4464 * dead channels from affecting the result, we initialize the flag
4465 * with the identity value for the logical operation.
4466 */
4467 if (dispatch_width == 32) {
4468 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4469 ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4470 brw_imm_ud(0));
4471 } else {
4472 ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
4473 }
4474 bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4475
4476 /* For some reason, the any/all predicates don't work properly with
4477 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
4478 * doesn't read the correct subset of the flag register and you end up
4479 * getting garbage in the second half. Work around this by using a pair
4480 * of 1-wide MOVs and scattering the result.
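* (res1 starts out as 0 and a predicated 1-wide MOV sets it to -1 only if
* the ANY/ALL predicate passes; the scalar result is then broadcast to the
* destination.)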
4481 */
4482 fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4483 ubld.MOV(res1, brw_imm_d(0));
4484 set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ANY8H :
4485 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
4486 BRW_PREDICATE_ALIGN1_ANY32H,
4487 ubld.MOV(res1, brw_imm_d(-1)));
4488
4489 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4490 break;
4491 }
4492 case nir_intrinsic_vote_all: {
4493 const fs_builder ubld = bld.exec_all().group(1, 0);
4494
4495 /* The any/all predicates do not consider channel enables. To prevent
4496 * dead channels from affecting the result, we initialize the flag
4497 * with the identity value for the logical operation.
4498 */
4499 if (dispatch_width == 32) {
4500 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4501 ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4502 brw_imm_ud(0xffffffff));
4503 } else {
4504 ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4505 }
4506 bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0), BRW_CONDITIONAL_NZ);
4507
4508 /* For some reason, the any/all predicates don't work properly with
4509 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
4510 * doesn't read the correct subset of the flag register and you end up
4511 * getting garbage in the second half. Work around this by using a pair
4512 * of 1-wide MOVs and scattering the result.
4513 */
4514 fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4515 ubld.MOV(res1, brw_imm_d(0));
4516 set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H :
4517 dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
4518 BRW_PREDICATE_ALIGN1_ALL32H,
4519 ubld.MOV(res1, brw_imm_d(-1)));
4520
4521 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
4522 break;
4523 }
4524 case nir_intrinsic_vote_feq:
4525 case nir_intrinsic_vote_ieq: {
4526 fs_reg value = get_nir_src(instr->src[0]);
4527 if (instr->intrinsic == nir_intrinsic_vote_feq) {
4528 const unsigned bit_size = nir_src_bit_size(instr->src[0]);
4529 value.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
4530 }
4531
4532 fs_reg uniformized = bld.emit_uniformize(value);
4533 const fs_builder ubld = bld.exec_all().group(1, 0);
4534
4535 /* The any/all predicates do not consider channel enables. To prevent
4536 * dead channels from affecting the result, we initialize the flag
4537 * with the identity value for the logical operation.
4538 */
4539 if (dispatch_width == 32) {
4540 /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
4541 ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
4542 brw_imm_ud(0xffffffff));
4543 } else {
4544 ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
4545 }
4546 bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);
4547
4548 /* For some reason, the any/all predicates don't work properly with
4549 * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
4550 * doesn't read the correct subset of the flag register and you end up
4551 * getting garbage in the second half. Work around this by using a pair
4552 * of 1-wide MOVs and scattering the result.
4553 */
4554 fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
4555 ubld.MOV(res1, brw_imm_d(0));
4556 set_predicate(dispatch_width == 8 ? BRW_PREDICATE_ALIGN1_ALL8H :
4557 dispatch_width == 16 ?
BRW_PREDICATE_ALIGN1_ALL16H : 4558 BRW_PREDICATE_ALIGN1_ALL32H, 4559 ubld.MOV(res1, brw_imm_d(-1))); 4560 4561 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0)); 4562 break; 4563 } 4564 4565 case nir_intrinsic_ballot: { 4566 const fs_reg value = retype(get_nir_src(instr->src[0]), 4567 BRW_REGISTER_TYPE_UD); 4568 struct brw_reg flag = brw_flag_reg(0, 0); 4569 /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well 4570 * as f0.0. This is a problem for fragment programs as we currently use 4571 * f0.1 for discards. Fortunately, we don't support SIMD32 fragment 4572 * programs yet so this isn't a problem. When we do, something will 4573 * have to change. 4574 */ 4575 if (dispatch_width == 32) 4576 flag.type = BRW_REGISTER_TYPE_UD; 4577 4578 bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u)); 4579 bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); 4580 4581 if (instr->dest.ssa.bit_size > 32) { 4582 dest.type = BRW_REGISTER_TYPE_UQ; 4583 } else { 4584 dest.type = BRW_REGISTER_TYPE_UD; 4585 } 4586 bld.MOV(dest, flag); 4587 break; 4588 } 4589 4590 case nir_intrinsic_read_invocation: { 4591 const fs_reg value = get_nir_src(instr->src[0]); 4592 const fs_reg invocation = get_nir_src(instr->src[1]); 4593 fs_reg tmp = bld.vgrf(value.type); 4594 4595 bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value, 4596 bld.emit_uniformize(invocation)); 4597 4598 bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0))); 4599 break; 4600 } 4601 4602 case nir_intrinsic_read_first_invocation: { 4603 const fs_reg value = get_nir_src(instr->src[0]); 4604 bld.MOV(retype(dest, value.type), bld.emit_uniformize(value)); 4605 break; 4606 } 4607 4608 case nir_intrinsic_shuffle: { 4609 const fs_reg value = get_nir_src(instr->src[0]); 4610 const fs_reg index = get_nir_src(instr->src[1]); 4611 4612 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index); 4613 break; 4614 } 4615 4616 case nir_intrinsic_first_invocation: { 4617 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4618 bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp); 4619 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 4620 fs_reg(component(tmp, 0))); 4621 break; 4622 } 4623 4624 case nir_intrinsic_quad_broadcast: { 4625 const fs_reg value = get_nir_src(instr->src[0]); 4626 nir_const_value *index = nir_src_as_const_value(instr->src[1]); 4627 assert(nir_src_bit_size(instr->src[1]) == 32); 4628 4629 bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type), 4630 value, brw_imm_ud(index->u32[0]), brw_imm_ud(4)); 4631 break; 4632 } 4633 4634 case nir_intrinsic_quad_swap_horizontal: { 4635 const fs_reg value = get_nir_src(instr->src[0]); 4636 const fs_reg tmp = bld.vgrf(value.type); 4637 const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0); 4638 4639 const fs_reg src_left = horiz_stride(value, 2); 4640 const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2); 4641 const fs_reg tmp_left = horiz_stride(tmp, 2); 4642 const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2); 4643 4644 /* From the Cherryview PRM Vol. 7, "Register Region Restrictiosn": 4645 * 4646 * "When source or destination datatype is 64b or operation is 4647 * integer DWord multiply, regioning in Align1 must follow 4648 * these rules: 4649 * 4650 * [...] 4651 * 4652 * 3. Source and Destination offset must be the same, except 4653 * the case of scalar source." 4654 * 4655 * In order to work around this, we have to emit two 32-bit MOVs instead 4656 * of a single 64-bit MOV to do the shuffle. 
4657 */ 4658 if (type_sz(value.type) > 4 && 4659 (devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) { 4660 ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 0), 4661 subscript(src_right, BRW_REGISTER_TYPE_D, 0)); 4662 ubld.MOV(subscript(tmp_left, BRW_REGISTER_TYPE_D, 1), 4663 subscript(src_right, BRW_REGISTER_TYPE_D, 1)); 4664 ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 0), 4665 subscript(src_left, BRW_REGISTER_TYPE_D, 0)); 4666 ubld.MOV(subscript(tmp_right, BRW_REGISTER_TYPE_D, 1), 4667 subscript(src_left, BRW_REGISTER_TYPE_D, 1)); 4668 } else { 4669 ubld.MOV(tmp_left, src_right); 4670 ubld.MOV(tmp_right, src_left); 4671 } 4672 bld.MOV(retype(dest, value.type), tmp); 4673 break; 4674 } 4675 4676 case nir_intrinsic_quad_swap_vertical: { 4677 const fs_reg value = get_nir_src(instr->src[0]); 4678 if (nir_src_bit_size(instr->src[0]) == 32) { 4679 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ 4680 const fs_reg tmp = bld.vgrf(value.type); 4681 const fs_builder ubld = bld.exec_all(); 4682 ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 4683 brw_imm_ud(BRW_SWIZZLE4(2,3,0,1))); 4684 bld.MOV(retype(dest, value.type), tmp); 4685 } else { 4686 /* For larger data types, we have to either emit dispatch_width many 4687 * MOVs or else fall back to doing indirects. 4688 */ 4689 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 4690 bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 4691 brw_imm_w(0x2)); 4692 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); 4693 } 4694 break; 4695 } 4696 4697 case nir_intrinsic_quad_swap_diagonal: { 4698 const fs_reg value = get_nir_src(instr->src[0]); 4699 if (nir_src_bit_size(instr->src[0]) == 32) { 4700 /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */ 4701 const fs_reg tmp = bld.vgrf(value.type); 4702 const fs_builder ubld = bld.exec_all(); 4703 ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value, 4704 brw_imm_ud(BRW_SWIZZLE4(3,2,1,0))); 4705 bld.MOV(retype(dest, value.type), tmp); 4706 } else { 4707 /* For larger data types, we have to either emit dispatch_width many 4708 * MOVs or else fall back to doing indirects. 4709 */ 4710 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 4711 bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 4712 brw_imm_w(0x3)); 4713 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx); 4714 } 4715 break; 4716 } 4717 4718 case nir_intrinsic_reduce: { 4719 fs_reg src = get_nir_src(instr->src[0]); 4720 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); 4721 unsigned cluster_size = nir_intrinsic_cluster_size(instr); 4722 if (cluster_size == 0 || cluster_size > dispatch_width) 4723 cluster_size = dispatch_width; 4724 4725 /* Figure out the source type */ 4726 src.type = brw_type_for_nir_type(devinfo, 4727 (nir_alu_type)(nir_op_infos[redop].input_types[0] | 4728 nir_src_bit_size(instr->src[0]))); 4729 4730 fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); 4731 opcode brw_op = brw_op_for_nir_reduction_op(redop); 4732 brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); 4733 4734 /* Set up a register for all of our scratching around and initialize it 4735 * to reduction operation's identity value. 
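* (SEL_EXEC picks the source value for live channels and the identity for
* dead ones, so inactive channels cannot perturb the scan.)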
4736 */ 4737 fs_reg scan = bld.vgrf(src.type); 4738 bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); 4739 4740 bld.emit_scan(brw_op, scan, cluster_size, cond_mod); 4741 4742 dest.type = src.type; 4743 if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) { 4744 /* In this case, CLUSTER_BROADCAST instruction isn't needed because 4745 * the distance between clusters is at least 2 GRFs. In this case, 4746 * we don't need the weird striding of the CLUSTER_BROADCAST 4747 * instruction and can just do regular MOVs. 4748 */ 4749 assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0); 4750 const unsigned groups = 4751 (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2); 4752 const unsigned group_size = dispatch_width / groups; 4753 for (unsigned i = 0; i < groups; i++) { 4754 const unsigned cluster = (i * group_size) / cluster_size; 4755 const unsigned comp = cluster * cluster_size + (cluster_size - 1); 4756 bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size), 4757 component(scan, comp)); 4758 } 4759 } else { 4760 bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan, 4761 brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size)); 4762 } 4763 break; 4764 } 4765 4766 case nir_intrinsic_inclusive_scan: 4767 case nir_intrinsic_exclusive_scan: { 4768 fs_reg src = get_nir_src(instr->src[0]); 4769 nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr); 4770 4771 /* Figure out the source type */ 4772 src.type = brw_type_for_nir_type(devinfo, 4773 (nir_alu_type)(nir_op_infos[redop].input_types[0] | 4774 nir_src_bit_size(instr->src[0]))); 4775 4776 fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type); 4777 opcode brw_op = brw_op_for_nir_reduction_op(redop); 4778 brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop); 4779 4780 /* Set up a register for all of our scratching around and initialize it 4781 * to reduction operation's identity value. 4782 */ 4783 fs_reg scan = bld.vgrf(src.type); 4784 const fs_builder allbld = bld.exec_all(); 4785 allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity); 4786 4787 if (instr->intrinsic == nir_intrinsic_exclusive_scan) { 4788 /* Exclusive scan is a bit harder because we have to do an annoying 4789 * shift of the contents before we can begin. To make things worse, 4790 * we can't do this with a normal stride; we have to use indirects. 
4791 */ 4792 fs_reg shifted = bld.vgrf(src.type); 4793 fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W); 4794 allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION], 4795 brw_imm_w(-1)); 4796 allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx); 4797 allbld.group(1, 0).MOV(component(shifted, 0), identity); 4798 scan = shifted; 4799 } 4800 4801 bld.emit_scan(brw_op, scan, dispatch_width, cond_mod); 4802 4803 bld.MOV(retype(dest, src.type), scan); 4804 break; 4805 } 4806 4807 case nir_intrinsic_begin_invocation_interlock: { 4808 const fs_builder ubld = bld.group(8, 0); 4809 const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2); 4810 4811 ubld.emit(SHADER_OPCODE_INTERLOCK, tmp)->size_written = 2 * 4812 REG_SIZE; 4813 4814 break; 4815 } 4816 4817 case nir_intrinsic_end_invocation_interlock: { 4818 /* We don't need to do anything here */ 4819 break; 4820 } 4821 4822 default: 4823 unreachable("unknown intrinsic"); 4824 } 4825} 4826 4827void 4828fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, 4829 int op, nir_intrinsic_instr *instr) 4830{ 4831 if (stage == MESA_SHADER_FRAGMENT) 4832 brw_wm_prog_data(prog_data)->has_side_effects = true; 4833 4834 fs_reg dest; 4835 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 4836 dest = get_nir_dest(instr->dest); 4837 4838 fs_reg surface; 4839 nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]); 4840 if (const_surface) { 4841 unsigned surf_index = stage_prog_data->binding_table.ssbo_start + 4842 const_surface->u32[0]; 4843 surface = brw_imm_ud(surf_index); 4844 brw_mark_surface_used(prog_data, surf_index); 4845 } else { 4846 surface = vgrf(glsl_type::uint_type); 4847 bld.ADD(surface, get_nir_src(instr->src[0]), 4848 brw_imm_ud(stage_prog_data->binding_table.ssbo_start)); 4849 4850 /* Assume this may touch any SSBO. This is the same we do for other 4851 * UBO/SSBO accesses with non-constant surface. 4852 */ 4853 brw_mark_surface_used(prog_data, 4854 stage_prog_data->binding_table.ssbo_start + 4855 nir->info.num_ssbos - 1); 4856 } 4857 4858 fs_reg offset = get_nir_src(instr->src[1]); 4859 fs_reg data1; 4860 if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 4861 data1 = get_nir_src(instr->src[2]); 4862 fs_reg data2; 4863 if (op == BRW_AOP_CMPWR) 4864 data2 = get_nir_src(instr->src[3]); 4865 4866 /* Emit the actual atomic operation */ 4867 4868 fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset, 4869 data1, data2, 4870 1 /* dims */, 1 /* rsize */, 4871 op, 4872 BRW_PREDICATE_NONE); 4873 dest.type = atomic_result.type; 4874 bld.MOV(dest, atomic_result); 4875} 4876 4877void 4878fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld, 4879 int op, nir_intrinsic_instr *instr) 4880{ 4881 if (stage == MESA_SHADER_FRAGMENT) 4882 brw_wm_prog_data(prog_data)->has_side_effects = true; 4883 4884 fs_reg dest; 4885 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 4886 dest = get_nir_dest(instr->dest); 4887 4888 fs_reg surface; 4889 nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]); 4890 if (const_surface) { 4891 unsigned surf_index = stage_prog_data->binding_table.ssbo_start + 4892 const_surface->u32[0]; 4893 surface = brw_imm_ud(surf_index); 4894 brw_mark_surface_used(prog_data, surf_index); 4895 } else { 4896 surface = vgrf(glsl_type::uint_type); 4897 bld.ADD(surface, get_nir_src(instr->src[0]), 4898 brw_imm_ud(stage_prog_data->binding_table.ssbo_start)); 4899 4900 /* Assume this may touch any SSBO. 
This is the same we do for other 4901 * UBO/SSBO accesses with non-constant surface. 4902 */ 4903 brw_mark_surface_used(prog_data, 4904 stage_prog_data->binding_table.ssbo_start + 4905 nir->info.num_ssbos - 1); 4906 } 4907 4908 fs_reg offset = get_nir_src(instr->src[1]); 4909 fs_reg data1 = get_nir_src(instr->src[2]); 4910 fs_reg data2; 4911 if (op == BRW_AOP_FCMPWR) 4912 data2 = get_nir_src(instr->src[3]); 4913 4914 /* Emit the actual atomic operation */ 4915 4916 fs_reg atomic_result = emit_untyped_atomic_float(bld, surface, offset, 4917 data1, data2, 4918 1 /* dims */, 1 /* rsize */, 4919 op, 4920 BRW_PREDICATE_NONE); 4921 dest.type = atomic_result.type; 4922 bld.MOV(dest, atomic_result); 4923} 4924 4925void 4926fs_visitor::nir_emit_shared_atomic(const fs_builder &bld, 4927 int op, nir_intrinsic_instr *instr) 4928{ 4929 fs_reg dest; 4930 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 4931 dest = get_nir_dest(instr->dest); 4932 4933 fs_reg surface = brw_imm_ud(GEN7_BTI_SLM); 4934 fs_reg offset; 4935 fs_reg data1; 4936 if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 4937 data1 = get_nir_src(instr->src[1]); 4938 fs_reg data2; 4939 if (op == BRW_AOP_CMPWR) 4940 data2 = get_nir_src(instr->src[2]); 4941 4942 /* Get the offset */ 4943 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 4944 if (const_offset) { 4945 offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]); 4946 } else { 4947 offset = vgrf(glsl_type::uint_type); 4948 bld.ADD(offset, 4949 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), 4950 brw_imm_ud(instr->const_index[0])); 4951 } 4952 4953 /* Emit the actual atomic operation operation */ 4954 4955 fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset, 4956 data1, data2, 4957 1 /* dims */, 1 /* rsize */, 4958 op, 4959 BRW_PREDICATE_NONE); 4960 dest.type = atomic_result.type; 4961 bld.MOV(dest, atomic_result); 4962} 4963 4964void 4965fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld, 4966 int op, nir_intrinsic_instr *instr) 4967{ 4968 fs_reg dest; 4969 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 4970 dest = get_nir_dest(instr->dest); 4971 4972 fs_reg surface = brw_imm_ud(GEN7_BTI_SLM); 4973 fs_reg offset; 4974 fs_reg data1 = get_nir_src(instr->src[1]); 4975 fs_reg data2; 4976 if (op == BRW_AOP_FCMPWR) 4977 data2 = get_nir_src(instr->src[2]); 4978 4979 /* Get the offset */ 4980 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 4981 if (const_offset) { 4982 offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]); 4983 } else { 4984 offset = vgrf(glsl_type::uint_type); 4985 bld.ADD(offset, 4986 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD), 4987 brw_imm_ud(instr->const_index[0])); 4988 } 4989 4990 /* Emit the actual atomic operation operation */ 4991 4992 fs_reg atomic_result = emit_untyped_atomic_float(bld, surface, offset, 4993 data1, data2, 4994 1 /* dims */, 1 /* rsize */, 4995 op, 4996 BRW_PREDICATE_NONE); 4997 dest.type = atomic_result.type; 4998 bld.MOV(dest, atomic_result); 4999} 5000 5001void 5002fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) 5003{ 5004 unsigned texture = instr->texture_index; 5005 unsigned sampler = instr->sampler_index; 5006 5007 fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 5008 5009 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture); 5010 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler); 5011 5012 int lod_components = 0; 5013 5014 /* The hardware requires a LOD for buffer textures */ 5015 if 
(instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) 5016 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0); 5017 5018 uint32_t header_bits = 0; 5019 for (unsigned i = 0; i < instr->num_srcs; i++) { 5020 fs_reg src = get_nir_src(instr->src[i].src); 5021 switch (instr->src[i].src_type) { 5022 case nir_tex_src_bias: 5023 srcs[TEX_LOGICAL_SRC_LOD] = 5024 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 5025 break; 5026 case nir_tex_src_comparator: 5027 srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F); 5028 break; 5029 case nir_tex_src_coord: 5030 switch (instr->op) { 5031 case nir_texop_txf: 5032 case nir_texop_txf_ms: 5033 case nir_texop_txf_ms_mcs: 5034 case nir_texop_samples_identical: 5035 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D); 5036 break; 5037 default: 5038 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F); 5039 break; 5040 } 5041 break; 5042 case nir_tex_src_ddx: 5043 srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F); 5044 lod_components = nir_tex_instr_src_size(instr, i); 5045 break; 5046 case nir_tex_src_ddy: 5047 srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F); 5048 break; 5049 case nir_tex_src_lod: 5050 switch (instr->op) { 5051 case nir_texop_txs: 5052 srcs[TEX_LOGICAL_SRC_LOD] = 5053 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD); 5054 break; 5055 case nir_texop_txf: 5056 srcs[TEX_LOGICAL_SRC_LOD] = 5057 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D); 5058 break; 5059 default: 5060 srcs[TEX_LOGICAL_SRC_LOD] = 5061 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 5062 break; 5063 } 5064 break; 5065 case nir_tex_src_ms_index: 5066 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD); 5067 break; 5068 5069 case nir_tex_src_offset: { 5070 nir_const_value *const_offset = 5071 nir_src_as_const_value(instr->src[i].src); 5072 unsigned offset_bits = 0; 5073 if (const_offset && 5074 brw_texture_offset(const_offset->i32, 5075 nir_tex_instr_src_size(instr, i), 5076 &offset_bits)) { 5077 header_bits |= offset_bits; 5078 } else { 5079 srcs[TEX_LOGICAL_SRC_TG4_OFFSET] = 5080 retype(src, BRW_REGISTER_TYPE_D); 5081 } 5082 break; 5083 } 5084 5085 case nir_tex_src_projector: 5086 unreachable("should be lowered"); 5087 5088 case nir_tex_src_texture_offset: { 5089 /* Figure out the highest possible texture index and mark it as used */ 5090 uint32_t max_used = texture + instr->texture_array_size - 1; 5091 if (instr->op == nir_texop_tg4 && devinfo->gen < 8) { 5092 max_used += stage_prog_data->binding_table.gather_texture_start; 5093 } else { 5094 max_used += stage_prog_data->binding_table.texture_start; 5095 } 5096 brw_mark_surface_used(prog_data, max_used); 5097 5098 /* Emit code to evaluate the actual indexing expression */ 5099 fs_reg tmp = vgrf(glsl_type::uint_type); 5100 bld.ADD(tmp, src, brw_imm_ud(texture)); 5101 srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp); 5102 break; 5103 } 5104 5105 case nir_tex_src_sampler_offset: { 5106 /* Emit code to evaluate the actual indexing expression */ 5107 fs_reg tmp = vgrf(glsl_type::uint_type); 5108 bld.ADD(tmp, src, brw_imm_ud(sampler)); 5109 srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp); 5110 break; 5111 } 5112 5113 case nir_tex_src_ms_mcs: 5114 assert(instr->op == nir_texop_txf_ms); 5115 srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D); 5116 break; 5117 5118 case nir_tex_src_plane: { 5119 nir_const_value *const_plane = 5120 nir_src_as_const_value(instr->src[i].src); 
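/* Planar (e.g. YUV) formats use one binding table entry per plane; rebase
 * the texture index from the regular texture section of the binding table
 * to the section for the requested plane.
 */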
         const uint32_t plane = const_plane->u32[0];
         const uint32_t texture_index =
            instr->texture_index +
            stage_prog_data->binding_table.plane_start[plane] -
            stage_prog_data->binding_table.texture_start;

         srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
         break;
      }

      default:
         unreachable("unknown texture source");
      }
   }

   if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
       (instr->op == nir_texop_txf_ms ||
        instr->op == nir_texop_samples_identical)) {
      if (devinfo->gen >= 7 &&
          key_tex->compressed_multisample_layout_mask & (1 << texture)) {
         srcs[TEX_LOGICAL_SRC_MCS] =
            emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
                           instr->coord_components,
                           srcs[TEX_LOGICAL_SRC_SURFACE]);
      } else {
         srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
      }
   }

   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);

   enum opcode opcode;
   switch (instr->op) {
   case nir_texop_tex:
      opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
                SHADER_OPCODE_TXL_LOGICAL);
      break;
   case nir_texop_txb:
      opcode = FS_OPCODE_TXB_LOGICAL;
      break;
   case nir_texop_txl:
      opcode = SHADER_OPCODE_TXL_LOGICAL;
      break;
   case nir_texop_txd:
      opcode = SHADER_OPCODE_TXD_LOGICAL;
      break;
   case nir_texop_txf:
      opcode = SHADER_OPCODE_TXF_LOGICAL;
      break;
   case nir_texop_txf_ms:
      if ((key_tex->msaa_16 & (1 << sampler)))
         opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
      else
         opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
      break;
   case nir_texop_txf_ms_mcs:
      opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
      break;
   case nir_texop_query_levels:
   case nir_texop_txs:
      opcode = SHADER_OPCODE_TXS_LOGICAL;
      break;
   case nir_texop_lod:
      opcode = SHADER_OPCODE_LOD_LOGICAL;
      break;
   case nir_texop_tg4:
      if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
         opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
      else
         opcode = SHADER_OPCODE_TG4_LOGICAL;
      break;
   case nir_texop_texture_samples:
      opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
      break;
   case nir_texop_samples_identical: {
      fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);

      /* If mcs is an immediate value, it means there is no MCS.  In that case
       * just return false.
       */
      if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
         bld.MOV(dst, brw_imm_ud(0u));
      } else if ((key_tex->msaa_16 & (1 << sampler))) {
         fs_reg tmp = vgrf(glsl_type::uint_type);
         bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
                offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
         bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
      } else {
         bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
                 BRW_CONDITIONAL_EQ);
      }
      return;
   }
   default:
      unreachable("unknown texture opcode");
   }

   if (instr->op == nir_texop_tg4) {
      if (instr->component == 1 &&
          key_tex->gather_channel_quirk_mask & (1 << texture)) {
         /* gather4 sampler is broken for green channel on RG32F --
          * we must ask for blue instead.
5224 */ 5225 header_bits |= 2 << 16; 5226 } else { 5227 header_bits |= instr->component << 16; 5228 } 5229 } 5230 5231 fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4); 5232 fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); 5233 inst->offset = header_bits; 5234 5235 const unsigned dest_size = nir_tex_instr_dest_size(instr); 5236 if (devinfo->gen >= 9 && 5237 instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) { 5238 unsigned write_mask = instr->dest.is_ssa ? 5239 nir_ssa_def_components_read(&instr->dest.ssa): 5240 (1 << dest_size) - 1; 5241 assert(write_mask != 0); /* dead code should have been eliminated */ 5242 inst->size_written = util_last_bit(write_mask) * 5243 inst->dst.component_size(inst->exec_size); 5244 } else { 5245 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 5246 } 5247 5248 if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE) 5249 inst->shadow_compare = true; 5250 5251 if (instr->op == nir_texop_tg4 && devinfo->gen == 6) 5252 emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst); 5253 5254 fs_reg nir_dest[4]; 5255 for (unsigned i = 0; i < dest_size; i++) 5256 nir_dest[i] = offset(dst, bld, i); 5257 5258 if (instr->op == nir_texop_query_levels) { 5259 /* # levels is in .w */ 5260 nir_dest[0] = offset(dst, bld, 3); 5261 } else if (instr->op == nir_texop_txs && 5262 dest_size >= 3 && devinfo->gen < 7) { 5263 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */ 5264 fs_reg depth = offset(dst, bld, 2); 5265 nir_dest[2] = vgrf(glsl_type::int_type); 5266 bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE); 5267 } 5268 5269 bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0); 5270} 5271 5272void 5273fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr) 5274{ 5275 switch (instr->type) { 5276 case nir_jump_break: 5277 bld.emit(BRW_OPCODE_BREAK); 5278 break; 5279 case nir_jump_continue: 5280 bld.emit(BRW_OPCODE_CONTINUE); 5281 break; 5282 case nir_jump_return: 5283 default: 5284 unreachable("unknown jump"); 5285 } 5286} 5287 5288/* 5289 * This helper takes a source register and un/shuffles it into the destination 5290 * register. 5291 * 5292 * If source type size is smaller than destination type size the operation 5293 * needed is a component shuffle. The opposite case would be an unshuffle. If 5294 * source/destination type size is equal a shuffle is done that would be 5295 * equivalent to a simple MOV. 5296 * 5297 * For example, if source is a 16-bit type and destination is 32-bit. A 3 5298 * components .xyz 16-bit vector on SIMD8 would be. 5299 * 5300 * |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8| 5301 * |z1|z2|z3|z4|z5|z6|z7|z8| | | | | | | | | 5302 * 5303 * This helper will return the following 2 32-bit components with the 16-bit 5304 * values shuffled: 5305 * 5306 * |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8| 5307 * |z1 |z2 |z3 |z4 |z5 |z6 |z7 |z8 | 5308 * 5309 * For unshuffle, the example would be the opposite, a 64-bit type source 5310 * and a 32-bit destination. 

void
fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
{
   switch (instr->type) {
   case nir_jump_break:
      bld.emit(BRW_OPCODE_BREAK);
      break;
   case nir_jump_continue:
      bld.emit(BRW_OPCODE_CONTINUE);
      break;
   case nir_jump_return:
   default:
      unreachable("unknown jump");
   }
}

/*
 * This helper takes a source register and un/shuffles it into the destination
 * register.
 *
 * If the source type size is smaller than the destination type size, the
 * operation needed is a component shuffle.  The opposite case is an
 * unshuffle.  If the source and destination type sizes are equal, the shuffle
 * is equivalent to a simple MOV.
 *
 * For example, if the source is a 16-bit type and the destination is 32-bit,
 * a 3-component .xyz 16-bit vector on SIMD8 would be:
 *
 *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
 *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
 *
 * This helper will return the following 2 32-bit components with the 16-bit
 * values shuffled:
 *
 *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
 *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
 *
 * For an unshuffle, the example would be the opposite: a 64-bit source and a
 * 32-bit destination.  A 2-component .xy 64-bit vector on SIMD8 would be:
 *
 *    | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
 *    | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
 *    | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
 *    | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
 *
 * The returned result would be the following 4 32-bit components unshuffled:
 *
 *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
 *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
 *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
 *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
 *
 * - The source and destination registers must not overlap.
 * - Components are counted in units of the smaller type between the source
 *   and destination, because we are un/shuffling the smaller components
 *   from/into the bigger ones.
 * - The first_component parameter allows skipping source components.
 */
void
shuffle_src_to_dst(const fs_builder &bld,
                   const fs_reg &dst,
                   const fs_reg &src,
                   uint32_t first_component,
                   uint32_t components)
{
   if (type_sz(src.type) == type_sz(dst.type)) {
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() * components,
         offset(src, bld, first_component),
         type_sz(src.type) * bld.dispatch_width() * components));
      for (unsigned i = 0; i < components; i++) {
         bld.MOV(retype(offset(dst, bld, i), src.type),
                 offset(src, bld, i + first_component));
      }
   } else if (type_sz(src.type) < type_sz(dst.type)) {
      /* Source is shuffled into destination */
      unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() *
         DIV_ROUND_UP(components, size_ratio),
         offset(src, bld, first_component),
         type_sz(src.type) * bld.dispatch_width() * components));

      brw_reg_type shuffle_type =
         brw_reg_type_from_bit_size(8 * type_sz(src.type),
                                    BRW_REGISTER_TYPE_D);
      for (unsigned i = 0; i < components; i++) {
         fs_reg shuffle_component_i =
            subscript(offset(dst, bld, i / size_ratio),
                      shuffle_type, i % size_ratio);
         bld.MOV(shuffle_component_i,
                 retype(offset(src, bld, i + first_component), shuffle_type));
      }
   } else {
      /* Source is unshuffled into destination */
      unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() * components,
         offset(src, bld, first_component / size_ratio),
         type_sz(src.type) * bld.dispatch_width() *
         DIV_ROUND_UP(components + (first_component % size_ratio),
                      size_ratio)));

      brw_reg_type shuffle_type =
         brw_reg_type_from_bit_size(8 * type_sz(dst.type),
                                    BRW_REGISTER_TYPE_D);
      for (unsigned i = 0; i < components; i++) {
         fs_reg shuffle_component_i =
            subscript(offset(src, bld, (first_component + i) / size_ratio),
                      shuffle_type, (first_component + i) % size_ratio);
         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
                 shuffle_component_i);
      }
   }
}
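
/* A minimal illustrative sketch (the helper name is hypothetical; "src_df" is
 * assumed to be a DF-typed VGRF with at least two components): the unshuffle
 * case from the comment above, packing a 2-component 64-bit vector into four
 * dwords so it can be written with 32-bit messages.
 */
static inline fs_reg
example_unshuffle_df_to_dwords(const fs_builder &bld, const fs_reg &src_df)
{
   assert(type_sz(src_df.type) == 8);

   /* Two 64-bit components become four 32-bit components. */
   fs_reg dwords = bld.vgrf(BRW_REGISTER_TYPE_D, 4);

   /* Components are counted in units of the smaller (32-bit) type. */
   shuffle_src_to_dst(bld, dwords, src_df, 0, 4);

   return dwords;
}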

void
shuffle_from_32bit_read(const fs_builder &bld,
                        const fs_reg &dst,
                        const fs_reg &src,
                        uint32_t first_component,
                        uint32_t components)
{
   assert(type_sz(src.type) == 4);

   /* This function takes components in units of the destination type while
    * shuffle_src_to_dst takes components in units of the smallest type.
    */
   if (type_sz(dst.type) > 4) {
      assert(type_sz(dst.type) == 8);
      first_component *= 2;
      components *= 2;
   }

   shuffle_src_to_dst(bld, dst, src, first_component, components);
}

fs_reg
shuffle_for_32bit_write(const fs_builder &bld,
                        const fs_reg &src,
                        uint32_t first_component,
                        uint32_t components)
{
   fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_D,
                         DIV_ROUND_UP(components * type_sz(src.type), 4));

   /* This function takes components in units of the source type while
    * shuffle_src_to_dst takes components in units of the smallest type.
    */
   if (type_sz(src.type) > 4) {
      assert(type_sz(src.type) == 8);
      first_component *= 2;
      components *= 2;
   }

   shuffle_src_to_dst(bld, dst, src, first_component, components);

   return dst;
}

fs_reg
setup_imm_df(const fs_builder &bld, double v)
{
   const struct gen_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->gen >= 7);

   if (devinfo->gen >= 8)
      return brw_imm_df(v);

   /* gen7.5 does not support DF immediates directly, but the DIM instruction
    * allows setting the 64-bit immediate value.
    */
   if (devinfo->is_haswell) {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
      ubld.DIM(dst, brw_imm_df(v));
      return component(dst, 0);
   }

   /* gen7 does not support DF immediates, so we generate a 64-bit constant by
    * writing the low 32 bits of the constant to suboffset 0 of a VGRF and
    * the high 32 bits to suboffset 4, and then applying a stride of 0.
    *
    * Alternatively, we could also produce a normal VGRF (without stride 0)
    * by writing to all the channels in the VGRF; however, that would hit the
    * gen7 bug where we have to split writes that span more than 1 register
    * into instructions with a width of 4 (otherwise the write to the second
    * register written runs into an execmask hardware bug), which isn't very
    * nice.
    */
   union {
      double d;
      struct {
         uint32_t i1;
         uint32_t i2;
      };
   } di;

   di.d = v;

   const fs_builder ubld = bld.exec_all().group(1, 0);
   const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   ubld.MOV(tmp, brw_imm_ud(di.i1));
   ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));

   return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
}

fs_reg
setup_imm_b(const fs_builder &bld, int8_t v)
{
   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
   bld.MOV(tmp, brw_imm_w(v));
   return tmp;
}

fs_reg
setup_imm_ub(const fs_builder &bld, uint8_t v)
{
   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
   bld.MOV(tmp, brw_imm_uw(v));
   return tmp;
}
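
/* A minimal illustrative sketch (the function name is hypothetical) of how
 * setup_imm_df() is meant to be used: a DF constant such as 1.0
 * (0x3FF0000000000000) is materialized on gen7 as the two dwords 0x00000000
 * (low) and 0x3FF00000 (high) written with a stride of 0, while gen8+ can use
 * a plain DF immediate.
 */
static inline void
example_mov_df_one(const fs_builder &bld, const fs_reg &dst_df)
{
   assert(type_sz(dst_df.type) == 8);

   /* On gen8+ this boils down to a MOV of brw_imm_df(1.0); on gen7/gen7.5 the
    * helper above builds the constant with DIM or a pair of UD writes.
    */
   bld.MOV(dst_df, setup_imm_df(bld, 1.0));
}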