brw_fs_nir.cpp revision 7ec681f3
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "compiler/glsl/ir.h"
#include "brw_fs.h"
#include "brw_nir.h"
#include "brw_rt.h"
#include "brw_eu.h"
#include "nir_search_helpers.h"
#include "util/u_math.h"
#include "util/bitscan.h"

using namespace brw;

void
fs_visitor::emit_nir_code()
{
   emit_shader_float_controls_execution_mode();

   /* emit the arrays used for inputs and outputs - load/store intrinsics will
    * be converted to reads/writes of these arrays
    */
   nir_setup_outputs();
   nir_setup_uniforms();
   nir_emit_system_values();
   last_scratch = ALIGN(nir->scratch_size, 4) * dispatch_width;

   nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir));

   bld.emit(SHADER_OPCODE_HALT_TARGET);
}

void
fs_visitor::nir_setup_outputs()
{
   if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
      return;

   unsigned vec4s[VARYING_SLOT_TESS_MAX] = { 0, };

   /* Calculate the size of output registers in a separate pass, before
    * allocating them. With ARB_enhanced_layouts, multiple output variables
    * may occupy the same slot, but have different type sizes.
    */
   nir_foreach_shader_out_variable(var, nir) {
      const int loc = var->data.driver_location;
      const unsigned var_vec4s =
         var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
                           : type_size_vec4(var->type, true);
      vec4s[loc] = MAX2(vec4s[loc], var_vec4s);
   }

   for (unsigned loc = 0; loc < ARRAY_SIZE(vec4s);) {
      if (vec4s[loc] == 0) {
         loc++;
         continue;
      }

      unsigned reg_size = vec4s[loc];

      /* Check if there are any ranges that start within this range and extend
       * past it. If so, include them in this allocation.
       */
      for (unsigned i = 1; i < reg_size; i++) {
         assert(i + loc < ARRAY_SIZE(vec4s));
         reg_size = MAX2(vec4s[i + loc] + i, reg_size);
      }
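
      /* For example, if slot loc occupies two vec4s and slot loc+1 occupies
       * three, the loop above grows reg_size from 2 to MAX2(3 + 1, 2) = 4,
       * so both outputs end up sharing a single four-slot register.
       */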
      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * reg_size);
      for (unsigned i = 0; i < reg_size; i++) {
         assert(loc + i < ARRAY_SIZE(outputs));
         outputs[loc + i] = offset(reg, bld, 4 * i);
      }

      loc += reg_size;
   }
}

void
fs_visitor::nir_setup_uniforms()
{
   /* Only the first compile gets to set up uniforms. */
   if (push_constant_loc) {
      assert(pull_constant_loc);
      return;
   }

   uniforms = nir->num_uniforms / 4;

   if ((stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL) &&
       devinfo->verx10 < 125) {
      /* Add uniforms for builtins after regular NIR uniforms. */
      assert(uniforms == prog_data->nr_params);

      uint32_t *param;
      if (nir->info.workgroup_size_variable &&
          compiler->lower_variable_group_size) {
         param = brw_stage_prog_data_add_params(prog_data, 3);
         for (unsigned i = 0; i < 3; i++) {
            param[i] = (BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X + i);
            group_size[i] = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
         }
      }

      /* Subgroup ID must be the last uniform on the list. This will make it
       * easier later to split between cross-thread and per-thread uniforms.
       */
      param = brw_stage_prog_data_add_params(prog_data, 1);
      *param = BRW_PARAM_BUILTIN_SUBGROUP_ID;
      subgroup_id = fs_reg(UNIFORM, uniforms++, BRW_REGISTER_TYPE_UD);
   }
}

static bool
emit_system_values_block(nir_block *block, fs_visitor *v)
{
   fs_reg *reg;

   nir_foreach_instr(instr, block) {
      if (instr->type != nir_instr_type_intrinsic)
         continue;

      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
      switch (intrin->intrinsic) {
      case nir_intrinsic_load_vertex_id:
      case nir_intrinsic_load_base_vertex:
         unreachable("should be lowered by nir_lower_system_values().");

      case nir_intrinsic_load_vertex_id_zero_base:
      case nir_intrinsic_load_is_indexed_draw:
      case nir_intrinsic_load_first_vertex:
      case nir_intrinsic_load_instance_id:
      case nir_intrinsic_load_base_instance:
      case nir_intrinsic_load_draw_id:
         unreachable("should be lowered by brw_nir_lower_vs_inputs().");

      case nir_intrinsic_load_invocation_id:
         if (v->stage == MESA_SHADER_TESS_CTRL)
            break;
         assert(v->stage == MESA_SHADER_GEOMETRY);
         reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
         if (reg->file == BAD_FILE) {
            const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
            fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
            fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
            abld.SHR(iid, g1, brw_imm_ud(27u));
            *reg = iid;
         }
         break;

      case nir_intrinsic_load_sample_pos:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_samplepos_setup();
         break;

      case nir_intrinsic_load_sample_id:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_sampleid_setup();
         break;

      case nir_intrinsic_load_sample_mask_in:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         assert(v->devinfo->ver >= 7);
         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_samplemaskin_setup();
         break;

      case nir_intrinsic_load_workgroup_id:
         assert(v->stage == MESA_SHADER_COMPUTE ||
                v->stage == MESA_SHADER_KERNEL);
         reg = &v->nir_system_values[SYSTEM_VALUE_WORKGROUP_ID];
         if (reg->file == BAD_FILE)
            *reg = *v->emit_cs_work_group_id_setup();
         break;

      case nir_intrinsic_load_helper_invocation:
         assert(v->stage == MESA_SHADER_FRAGMENT);
         reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
         if (reg->file == BAD_FILE) {
            const fs_builder abld =
               v->bld.annotate("gl_HelperInvocation", NULL);

            /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the
             * pixel mask is in g1.7 of the thread payload.
             *
             * We move the per-channel pixel enable bit to the low bit of each
             * channel by shifting the byte containing the pixel mask by the
             * vector immediate 0x76543210UV.
             *
             * The region of <1,8,0> reads only 1 byte (the pixel masks for
             * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
             * masks for 2 and 3) in SIMD16.
             */
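            /* For example, a SIMD8 pixel mask of 0b10110101 in g1.7 is
             * broadcast to every channel by the <1,8,0> region; after the
             * per-channel shift, channel i holds mask >> i, so bit 0 of each
             * channel is that channel's own enable bit.
             */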
207 v->bld.annotate("gl_HelperInvocation", NULL); 208 209 /* On Gfx6+ (gl_HelperInvocation is only exposed on Gfx7+) the 210 * pixel mask is in g1.7 of the thread payload. 211 * 212 * We move the per-channel pixel enable bit to the low bit of each 213 * channel by shifting the byte containing the pixel mask by the 214 * vector immediate 0x76543210UV. 215 * 216 * The region of <1,8,0> reads only 1 byte (the pixel masks for 217 * subspans 0 and 1) in SIMD8 and an additional byte (the pixel 218 * masks for 2 and 3) in SIMD16. 219 */ 220 fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1); 221 222 for (unsigned i = 0; i < DIV_ROUND_UP(v->dispatch_width, 16); i++) { 223 const fs_builder hbld = abld.group(MIN2(16, v->dispatch_width), i); 224 hbld.SHR(offset(shifted, hbld, i), 225 stride(retype(brw_vec1_grf(1 + i, 7), 226 BRW_REGISTER_TYPE_UB), 227 1, 8, 0), 228 brw_imm_v(0x76543210)); 229 } 230 231 /* A set bit in the pixel mask means the channel is enabled, but 232 * that is the opposite of gl_HelperInvocation so we need to invert 233 * the mask. 234 * 235 * The negate source-modifier bit of logical instructions on Gfx8+ 236 * performs 1's complement negation, so we can use that instead of 237 * a NOT instruction. 238 */ 239 fs_reg inverted = negate(shifted); 240 if (v->devinfo->ver < 8) { 241 inverted = abld.vgrf(BRW_REGISTER_TYPE_UW); 242 abld.NOT(inverted, shifted); 243 } 244 245 /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing 246 * with 1 and negating. 247 */ 248 fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1); 249 abld.AND(anded, inverted, brw_imm_uw(1)); 250 251 fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1); 252 abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D))); 253 *reg = dst; 254 } 255 break; 256 257 case nir_intrinsic_load_frag_shading_rate: 258 reg = &v->nir_system_values[SYSTEM_VALUE_FRAG_SHADING_RATE]; 259 if (reg->file == BAD_FILE) 260 *reg = *v->emit_shading_rate_setup(); 261 break; 262 263 default: 264 break; 265 } 266 } 267 268 return true; 269} 270 271void 272fs_visitor::nir_emit_system_values() 273{ 274 nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX); 275 for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) { 276 nir_system_values[i] = fs_reg(); 277 } 278 279 /* Always emit SUBGROUP_INVOCATION. Dead code will clean it up if we 280 * never end up using it. 281 */ 282 { 283 const fs_builder abld = bld.annotate("gl_SubgroupInvocation", NULL); 284 fs_reg ® = nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; 285 reg = abld.vgrf(BRW_REGISTER_TYPE_UW); 286 287 const fs_builder allbld8 = abld.group(8, 0).exec_all(); 288 allbld8.MOV(reg, brw_imm_v(0x76543210)); 289 if (dispatch_width > 8) 290 allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u)); 291 if (dispatch_width > 16) { 292 const fs_builder allbld16 = abld.group(16, 0).exec_all(); 293 allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u)); 294 } 295 } 296 297 nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir); 298 nir_foreach_block(block, impl) 299 emit_system_values_block(block, this); 300} 301 302void 303fs_visitor::nir_emit_impl(nir_function_impl *impl) 304{ 305 nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc); 306 for (unsigned i = 0; i < impl->reg_alloc; i++) { 307 nir_locals[i] = fs_reg(); 308 } 309 310 foreach_list_typed(nir_register, reg, node, &impl->registers) { 311 unsigned array_elems = 312 reg->num_array_elems == 0 ? 
      const fs_builder allbld8 = abld.group(8, 0).exec_all();
      allbld8.MOV(reg, brw_imm_v(0x76543210));
      if (dispatch_width > 8)
         allbld8.ADD(byte_offset(reg, 16), reg, brw_imm_uw(8u));
      if (dispatch_width > 16) {
         const fs_builder allbld16 = abld.group(16, 0).exec_all();
         allbld16.ADD(byte_offset(reg, 32), reg, brw_imm_uw(16u));
      }
   }

   nir_function_impl *impl = nir_shader_get_entrypoint((nir_shader *)nir);
   nir_foreach_block(block, impl)
      emit_system_values_block(block, this);
}

void
fs_visitor::nir_emit_impl(nir_function_impl *impl)
{
   nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
   for (unsigned i = 0; i < impl->reg_alloc; i++) {
      nir_locals[i] = fs_reg();
   }

   foreach_list_typed(nir_register, reg, node, &impl->registers) {
      unsigned array_elems =
         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
      unsigned size = array_elems * reg->num_components;
      const brw_reg_type reg_type = reg->bit_size == 8 ? BRW_REGISTER_TYPE_B :
         brw_reg_type_from_bit_size(reg->bit_size, BRW_REGISTER_TYPE_F);
      nir_locals[reg->index] = bld.vgrf(reg_type, size);
   }

   nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
                             impl->ssa_alloc);

   nir_emit_cf_list(&impl->body);
}

void
fs_visitor::nir_emit_cf_list(exec_list *list)
{
   exec_list_validate(list);
   foreach_list_typed(nir_cf_node, node, node, list) {
      switch (node->type) {
      case nir_cf_node_if:
         nir_emit_if(nir_cf_node_as_if(node));
         break;

      case nir_cf_node_loop:
         nir_emit_loop(nir_cf_node_as_loop(node));
         break;

      case nir_cf_node_block:
         nir_emit_block(nir_cf_node_as_block(node));
         break;

      default:
         unreachable("Invalid CFG node block");
      }
   }
}

void
fs_visitor::nir_emit_if(nir_if *if_stmt)
{
   bool invert;
   fs_reg cond_reg;

   /* If the condition has the form !other_condition, use other_condition as
    * the source, but invert the predicate on the if instruction.
    */
   nir_alu_instr *cond = nir_src_as_alu_instr(if_stmt->condition);
   if (cond != NULL && cond->op == nir_op_inot) {
      invert = true;
      cond_reg = get_nir_src(cond->src[0].src);
      cond_reg = offset(cond_reg, bld, cond->src[0].swizzle[0]);
   } else {
      invert = false;
      cond_reg = get_nir_src(if_stmt->condition);
   }

   /* first, put the condition into f0 */
   fs_inst *inst = bld.MOV(bld.null_reg_d(),
                           retype(cond_reg, BRW_REGISTER_TYPE_D));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;

   bld.IF(BRW_PREDICATE_NORMAL)->predicate_inverse = invert;

   nir_emit_cf_list(&if_stmt->then_list);

   if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) {
      bld.emit(BRW_OPCODE_ELSE);
      nir_emit_cf_list(&if_stmt->else_list);
   }

   bld.emit(BRW_OPCODE_ENDIF);

   if (devinfo->ver < 7)
      limit_dispatch_width(16, "Non-uniform control flow unsupported "
                           "in SIMD32 mode.");
}

void
fs_visitor::nir_emit_loop(nir_loop *loop)
{
   bld.emit(BRW_OPCODE_DO);

   nir_emit_cf_list(&loop->body);

   bld.emit(BRW_OPCODE_WHILE);

   if (devinfo->ver < 7)
      limit_dispatch_width(16, "Non-uniform control flow unsupported "
                           "in SIMD32 mode.");
}

void
fs_visitor::nir_emit_block(nir_block *block)
{
   nir_foreach_instr(instr, block) {
      nir_emit_instr(instr);
   }
}
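
/**
 * Emit FS IR for a single NIR instruction, dispatching intrinsics to the
 * handler for the current shader stage.
 */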
void
fs_visitor::nir_emit_instr(nir_instr *instr)
{
   const fs_builder abld = bld.annotate(NULL, instr);

   switch (instr->type) {
   case nir_instr_type_alu:
      nir_emit_alu(abld, nir_instr_as_alu(instr), true);
      break;

   case nir_instr_type_deref:
      unreachable("All derefs should've been lowered");
      break;

   case nir_instr_type_intrinsic:
      switch (stage) {
      case MESA_SHADER_VERTEX:
         nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_TESS_CTRL:
         nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_TESS_EVAL:
         nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_GEOMETRY:
         nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_FRAGMENT:
         nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_COMPUTE:
      case MESA_SHADER_KERNEL:
         nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      case MESA_SHADER_RAYGEN:
      case MESA_SHADER_ANY_HIT:
      case MESA_SHADER_CLOSEST_HIT:
      case MESA_SHADER_MISS:
      case MESA_SHADER_INTERSECTION:
      case MESA_SHADER_CALLABLE:
         nir_emit_bs_intrinsic(abld, nir_instr_as_intrinsic(instr));
         break;
      default:
         unreachable("unsupported shader stage");
      }
      break;

   case nir_instr_type_tex:
      nir_emit_texture(abld, nir_instr_as_tex(instr));
      break;

   case nir_instr_type_load_const:
      nir_emit_load_const(abld, nir_instr_as_load_const(instr));
      break;

   case nir_instr_type_ssa_undef:
      /* We create a new VGRF for undefs on every use (by handling
       * them in get_nir_src()), rather than for each definition.
       * This helps register coalescing eliminate MOVs from undef.
       */
      break;

   case nir_instr_type_jump:
      nir_emit_jump(abld, nir_instr_as_jump(instr));
      break;

   default:
      unreachable("unknown instruction type");
   }
}

/**
 * Recognizes a parent instruction of nir_op_extract_* and changes the type to
 * match instr.
 */
bool
fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
                                      const fs_reg &result)
{
   if (!instr->src[0].src.is_ssa ||
       !instr->src[0].src.ssa->parent_instr)
      return false;

   if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *src0 =
      nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);

   if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 &&
       src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16)
      return false;

   unsigned element = nir_src_as_uint(src0->src[1].src);

   /* Element type to extract. */
   const brw_reg_type type = brw_int_type(
      src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16 ? 2 : 1,
      src0->op == nir_op_extract_i16 || src0->op == nir_op_extract_i8);

   fs_reg op0 = get_nir_src(src0->src[0].src);
   op0.type = brw_type_for_nir_type(devinfo,
      (nir_alu_type)(nir_op_infos[src0->op].input_types[0] |
                     nir_src_bit_size(src0->src[0].src)));
   op0 = offset(op0, bld, src0->src[0].swizzle[0]);

   bld.MOV(result, subscript(op0, type, element));
   return true;
}
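
/**
 * Recognize bcsel(gl_FrontFacing, a, -a) with |a| == 1.0 and emit it as a
 * pair of bit operations on the payload bit that encodes front-facing,
 * instead of a CMP and a predicated SEL.
 */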
bool
fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
                                         const fs_reg &result)
{
   nir_intrinsic_instr *src0 = nir_src_as_intrinsic(instr->src[0].src);
   if (src0 == NULL || src0->intrinsic != nir_intrinsic_load_front_face)
      return false;

   if (!nir_src_is_const(instr->src[1].src) ||
       !nir_src_is_const(instr->src[2].src))
      return false;

   const float value1 = nir_src_as_float(instr->src[1].src);
   const float value2 = nir_src_as_float(instr->src[2].src);
   if (fabsf(value1) != 1.0f || fabsf(value2) != 1.0f)
      return false;

   /* nir_opt_algebraic should have gotten rid of bcsel(b, a, a) */
   assert(value1 == -value2);

   fs_reg tmp = vgrf(glsl_type::int_type);

   if (devinfo->ver >= 12) {
      /* Bit 15 of g1.1 is 0 if the polygon is front facing. */
      fs_reg g1 = fs_reg(retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_W));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp.1<2>W  g1.1<0,1,0>W  0x00003f80W
       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
       *
       * and negate g1.1<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
       */
      if (value1 == -1.0f)
         g1.negate = true;

      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
             g1, brw_imm_uw(0x3f80));
   } else if (devinfo->ver >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
       *
       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1 == -1.0f) {
         g0.negate = true;
      }

      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
             g0, brw_imm_uw(0x3f80));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));

      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
       *
       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
       *
       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
       *
       * This negation looks like it's safe in practice, because bits 0:4 will
       * surely be TRIANGLES
       */

      if (value1 == -1.0f) {
         g1_6.negate = true;
      }

      bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
   }
   bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));

   return true;
}

static void
emit_find_msb_using_lzd(const fs_builder &bld,
                        const fs_reg &result,
                        const fs_reg &src,
                        bool is_signed)
{
   fs_inst *inst;
   fs_reg temp = src;

   if (is_signed) {
      /* LZD of an absolute value source almost always does the right
       * thing. There are a few problem values:
       *
       * * 0x80000000. Since abs(0x80000000) == 0x80000000, LZD returns
       *   0. However, findMSB(int(0x80000000)) == 30.
       *
       * * 0xffffffff. Since abs(0xffffffff) == 1, LZD returns
       *   31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
       *
       *       For a value of zero or negative one, -1 will be returned.
       *
       * * Negative powers of two. LZD(abs(-(1<<x))) returns x, but
       *   findMSB(-(1<<x)) should return x-1.
       *
       * For all negative number cases, including 0x80000000 and
       * 0xffffffff, the correct value is obtained from LZD if instead of
       * negating the (already negative) value the logical-not is used. A
       * conditional logical-not can be achieved in two instructions.
       */
      temp = bld.vgrf(BRW_REGISTER_TYPE_D);

      bld.ASR(temp, src, brw_imm_d(31));
      bld.XOR(temp, temp, src);
   }
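
   /* Worked example: findMSB(-4). -4 is 0xfffffffc, the ASR fills temp with
    * 0xffffffff, and the XOR yields 0x00000003. LZD(3) is 30, and
    * 31 - 30 = 1: the position of the highest 0 bit, as GLSL requires for
    * negative inputs.
    */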
   bld.LZD(retype(result, BRW_REGISTER_TYPE_UD),
           retype(temp, BRW_REGISTER_TYPE_UD));

   /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
    * from the LSB side. Subtract the result from 31 to convert the MSB
    * count into an LSB count. If no bits are set, LZD will return 32.
    * 31-32 = -1, which is exactly what findMSB() is supposed to return.
    */
   inst = bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
   inst->src[0].negate = true;
}

static brw_rnd_mode
brw_rnd_mode_from_nir_op(const nir_op op)
{
   switch (op) {
   case nir_op_f2f16_rtz:
      return BRW_RND_MODE_RTZ;
   case nir_op_f2f16_rtne:
      return BRW_RND_MODE_RTNE;
   default:
      unreachable("Operation doesn't support rounding mode");
   }
}

static brw_rnd_mode
brw_rnd_mode_from_execution_mode(unsigned execution_mode)
{
   if (nir_has_any_rounding_mode_rtne(execution_mode))
      return BRW_RND_MODE_RTNE;
   if (nir_has_any_rounding_mode_rtz(execution_mode))
      return BRW_RND_MODE_RTZ;
   return BRW_RND_MODE_UNSPECIFIED;
}

fs_reg
fs_visitor::prepare_alu_destination_and_sources(const fs_builder &bld,
                                                nir_alu_instr *instr,
                                                fs_reg *op,
                                                bool need_dest)
{
   fs_reg result =
      need_dest ? get_nir_dest(instr->dest.dest) : bld.null_reg_ud();

   result.type = brw_type_for_nir_type(devinfo,
      (nir_alu_type)(nir_op_infos[instr->op].output_type |
                     nir_dest_bit_size(instr->dest.dest)));

   assert(!instr->dest.saturate);

   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      /* We don't lower to source modifiers so they should not exist. */
      assert(!instr->src[i].abs);
      assert(!instr->src[i].negate);

      op[i] = get_nir_src(instr->src[i].src);
      op[i].type = brw_type_for_nir_type(devinfo,
         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
                        nir_src_bit_size(instr->src[i].src)));
   }

   /* Move and vecN instructions may still be vectorized. Return the raw,
    * vectorized source and destination so that fs_visitor::nir_emit_alu can
    * handle it. Other callers should not have to handle these kinds of
    * instructions.
    */
   switch (instr->op) {
   case nir_op_mov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec8:
   case nir_op_vec16:
      return result;
   default:
      break;
   }

   /* At this point, we have dealt with any instruction that operates on
    * more than a single channel. Therefore, we can just adjust the source
    * and destination registers for that channel and emit the instruction.
    */
   unsigned channel = 0;
   if (nir_op_infos[instr->op].output_size == 0) {
      /* Since NIR is doing the scalarizing for us, we should only ever see
       * vectorized operations with a single channel.
       */
      assert(util_bitcount(instr->dest.write_mask) == 1);
      channel = ffs(instr->dest.write_mask) - 1;

      result = offset(result, bld, channel);
   }

   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
   }

   return result;
}
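
/**
 * Prepare the sources of a logical instruction whose operands may be inot.
 *
 * An inot source is folded into a source negate, which logical instructions
 * on Gfx8+ implement as a 1's complement; any other source has its source
 * modifiers resolved away.
 */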
void
fs_visitor::resolve_inot_sources(const fs_builder &bld, nir_alu_instr *instr,
                                 fs_reg *op)
{
   for (unsigned i = 0; i < 2; i++) {
      nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[i].src);

      if (inot_instr != NULL && inot_instr->op == nir_op_inot) {
         /* The source of the inot is now the source of instr. */
         prepare_alu_destination_and_sources(bld, inot_instr, &op[i], false);

         assert(!op[i].negate);
         op[i].negate = true;
      } else {
         op[i] = resolve_source_modifiers(op[i]);
      }
   }
}

bool
fs_visitor::try_emit_b2fi_of_inot(const fs_builder &bld,
                                  fs_reg result,
                                  nir_alu_instr *instr)
{
   if (devinfo->ver < 6 || devinfo->ver >= 12)
      return false;

   nir_alu_instr *inot_instr = nir_src_as_alu_instr(instr->src[0].src);

   if (inot_instr == NULL || inot_instr->op != nir_op_inot)
      return false;

   /* HF is also possible as a destination on BDW+. For nir_op_b2i, the set
    * of valid size-changing combinations is a bit more complex.
    *
    * The source restriction is just because I was lazy about generating the
    * constant below.
    */
   if (nir_dest_bit_size(instr->dest.dest) != 32 ||
       nir_src_bit_size(inot_instr->src[0].src) != 32)
      return false;

   /* b2[fi](inot(a)) maps a=0 => 1, a=-1 => 0. Since a can only be 0 or -1,
    * this is float(1 + a).
    */
   fs_reg op;

   prepare_alu_destination_and_sources(bld, inot_instr, &op, false);

   /* Ignore the saturate modifier, if there is one. The result of the
    * arithmetic can only be 0 or 1, so the clamping will do nothing anyway.
    */
   bld.ADD(result, op, brw_imm_d(1));

   return true;
}

/**
 * Emit code for nir_op_fsign possibly fused with a nir_op_fmul
 *
 * If \c instr is not the \c nir_op_fsign, then \c fsign_src is the index of
 * the source of \c instr that is a \c nir_op_fsign.
 */
void
fs_visitor::emit_fsign(const fs_builder &bld, const nir_alu_instr *instr,
                       fs_reg result, fs_reg *op, unsigned fsign_src)
{
   fs_inst *inst;

   assert(instr->op == nir_op_fsign || instr->op == nir_op_fmul);
   assert(fsign_src < nir_op_infos[instr->op].num_inputs);

   if (instr->op != nir_op_fsign) {
      const nir_alu_instr *const fsign_instr =
         nir_src_as_alu_instr(instr->src[fsign_src].src);

      /* op[fsign_src] has the nominal result of the fsign, and op[1 -
       * fsign_src] has the other multiply source. This must be rearranged so
       * that op[0] is the source of the fsign and op[1] is the other multiply
       * source.
       */
      if (fsign_src != 0)
         op[1] = op[0];

      op[0] = get_nir_src(fsign_instr->src[0].src);

      const nir_alu_type t =
         (nir_alu_type)(nir_op_infos[instr->op].input_types[0] |
                        nir_src_bit_size(fsign_instr->src[0].src));

      op[0].type = brw_type_for_nir_type(devinfo, t);

      unsigned channel = 0;
      if (nir_op_infos[instr->op].output_size == 0) {
         /* Since NIR is doing the scalarizing for us, we should only ever see
          * vectorized operations with a single channel.
          */
         assert(util_bitcount(instr->dest.write_mask) == 1);
         channel = ffs(instr->dest.write_mask) - 1;
      }

      op[0] = offset(op[0], bld, fsign_instr->src[0].swizzle[channel]);
   }

   if (type_sz(op[0].type) == 2) {
      /* AND(val, 0x8000) gives the sign bit.
       *
       * Predicated OR ORs 1.0 (0x3c00) with the sign bit if val is not zero.
       */
      fs_reg zero = retype(brw_imm_uw(0), BRW_REGISTER_TYPE_HF);
      bld.CMP(bld.null_reg_f(), op[0], zero, BRW_CONDITIONAL_NZ);

      op[0].type = BRW_REGISTER_TYPE_UW;
      result.type = BRW_REGISTER_TYPE_UW;
      bld.AND(result, op[0], brw_imm_uw(0x8000u));

      if (instr->op == nir_op_fsign)
         inst = bld.OR(result, result, brw_imm_uw(0x3c00u));
      else {
         /* Use XOR here to get the result sign correct. */
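         /* e.g. fsign(-2.0hf) * 3.0hf: result holds the sign bit 0x8000,
          * and 0x8000 ^ 0x4200 (3.0hf) == 0xc200, which is -3.0hf.
          */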
         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UW));
      }

      inst->predicate = BRW_PREDICATE_NORMAL;
   } else if (type_sz(op[0].type) == 4) {
      /* AND(val, 0x80000000) gives the sign bit.
       *
       * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
       * zero.
       */
      bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);

      op[0].type = BRW_REGISTER_TYPE_UD;
      result.type = BRW_REGISTER_TYPE_UD;
      bld.AND(result, op[0], brw_imm_ud(0x80000000u));

      if (instr->op == nir_op_fsign)
         inst = bld.OR(result, result, brw_imm_ud(0x3f800000u));
      else {
         /* Use XOR here to get the result sign correct. */
         inst = bld.XOR(result, result, retype(op[1], BRW_REGISTER_TYPE_UD));
      }

      inst->predicate = BRW_PREDICATE_NORMAL;
   } else {
      /* For doubles we do the same but we need to consider:
       *
       * - 2-src instructions can't operate with 64-bit immediates
       * - The sign is encoded in the high 32-bit of each DF
       * - We need to produce a DF result.
       */

      fs_reg zero = vgrf(glsl_type::double_type);
      bld.MOV(zero, setup_imm_df(bld, 0.0));
      bld.CMP(bld.null_reg_df(), op[0], zero, BRW_CONDITIONAL_NZ);

      bld.MOV(result, zero);

      fs_reg r = subscript(result, BRW_REGISTER_TYPE_UD, 1);
      bld.AND(r, subscript(op[0], BRW_REGISTER_TYPE_UD, 1),
              brw_imm_ud(0x80000000u));

      if (instr->op == nir_op_fsign) {
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.OR(r, r, brw_imm_ud(0x3ff00000u)));
      } else {
         /* This could be done better in some cases. If the scale is an
          * immediate with the low 32-bits all 0, emitting a separate XOR and
          * OR would allow an algebraic optimization to remove the OR. There
          * are currently zero instances of fsign(double(x))*IMM in shader-db
          * or any test suite, so it is hard to care at this time.
          */
         fs_reg result_int64 = retype(result, BRW_REGISTER_TYPE_UQ);
         inst = bld.XOR(result_int64, result_int64,
                        retype(op[1], BRW_REGISTER_TYPE_UQ));
      }
   }
}

/**
 * Determine whether sources of a nir_op_fmul can be fused with a nir_op_fsign
 *
 * Checks the operands of a \c nir_op_fmul to determine whether or not
 * \c emit_fsign could fuse the multiplication with the \c sign() calculation.
 *
 * \param instr  The multiplication instruction
 *
 * \param fsign_src  The source of \c instr that may or may not be a
 *                   \c nir_op_fsign
 */
static bool
can_fuse_fmul_fsign(nir_alu_instr *instr, unsigned fsign_src)
{
   assert(instr->op == nir_op_fmul);

   nir_alu_instr *const fsign_instr =
      nir_src_as_alu_instr(instr->src[fsign_src].src);

   /* Rules:
    *
    * 1. instr->src[fsign_src] must be a nir_op_fsign.
    * 2. The nir_op_fsign can only be used by this multiplication.
    * 3. The source that is the nir_op_fsign does not have source modifiers.
    *    \c emit_fsign only examines the source modifiers of the source of
    *    the \c nir_op_fsign.
    *
    * The nir_op_fsign must also not have the saturate modifier, but steps
    * have already been taken (in nir_opt_algebraic) to ensure that.
    */
   return fsign_instr != NULL && fsign_instr->op == nir_op_fsign &&
          is_used_once(fsign_instr);
}
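
/**
 * Emit FS IR for a NIR ALU instruction.
 *
 * NIR has already been scalarized, so apart from moves and vecN operations
 * only a single channel of the destination is written.
 */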
void
fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr,
                         bool need_dest)
{
   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
   fs_inst *inst;
   unsigned execution_mode =
      bld.shader->nir->info.float_controls_execution_mode;

   fs_reg op[NIR_MAX_VEC_COMPONENTS];
   fs_reg result = prepare_alu_destination_and_sources(bld, instr, op, need_dest);

#ifndef NDEBUG
   /* Everything except raw moves, some type conversions, iabs, and ineg
    * should have 8-bit sources lowered by nir_lower_bit_size in
    * brw_preprocess_nir or by brw_nir_lower_conversions in
    * brw_postprocess_nir.
    */
   switch (instr->op) {
   case nir_op_mov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec8:
   case nir_op_vec16:
   case nir_op_i2f16:
   case nir_op_i2f32:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_u2f16:
   case nir_op_u2f32:
   case nir_op_u2u16:
   case nir_op_u2u32:
   case nir_op_iabs:
   case nir_op_ineg:
   case nir_op_pack_32_4x8_split:
      break;

   default:
      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
         assert(type_sz(op[i].type) > 1);
      }
   }
#endif

   switch (instr->op) {
   case nir_op_mov:
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
   case nir_op_vec8:
   case nir_op_vec16: {
      fs_reg temp = result;
      bool need_extra_copy = false;
      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
         if (!instr->src[i].src.is_ssa &&
             instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
            need_extra_copy = true;
            temp = bld.vgrf(result.type, 4);
            break;
         }
      }

      for (unsigned i = 0; i < 4; i++) {
         if (!(instr->dest.write_mask & (1 << i)))
            continue;

         if (instr->op == nir_op_mov) {
            bld.MOV(offset(temp, bld, i),
                    offset(op[0], bld, instr->src[0].swizzle[i]));
         } else {
            bld.MOV(offset(temp, bld, i),
                    offset(op[i], bld, instr->src[i].swizzle[0]));
         }
      }

      /* In this case the source and destination registers were the same,
       * so we need to insert an extra set of moves in order to deal with
       * any swizzling.
       */
      if (need_extra_copy) {
         for (unsigned i = 0; i < 4; i++) {
            if (!(instr->dest.write_mask & (1 << i)))
               continue;

            bld.MOV(offset(result, bld, i), offset(temp, bld, i));
         }
      }
      return;
   }

   case nir_op_i2f32:
   case nir_op_u2f32:
      if (optimize_extract_to_float(instr, result))
         return;
      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_f2f16_rtne:
   case nir_op_f2f16_rtz:
   case nir_op_f2f16: {
      brw_rnd_mode rnd = BRW_RND_MODE_UNSPECIFIED;

      if (nir_op_f2f16 == instr->op)
         rnd = brw_rnd_mode_from_execution_mode(execution_mode);
      else
         rnd = brw_rnd_mode_from_nir_op(instr->op);

      if (BRW_RND_MODE_UNSPECIFIED != rnd)
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), brw_imm_d(rnd));

      /* In theory, it would be better to use BRW_OPCODE_F32TO16. Depending
       * on the HW gen it is either a special hw opcode or just a MOV, and
       * brw_F32TO16 (at brw_eu_emit) would do the work of choosing. But
       * using that opcode would require supporting it in several
       * optimizations and lowerings. Since HF support currently exists only
       * on gfx8+, it is better to emit the MOV directly, and to switch to
       * BRW_OPCODE_F32TO16 if/when HF support is brought up on gfx7.
       */
      assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
      inst = bld.MOV(result, op[0]);
      break;
   }

   case nir_op_b2i8:
   case nir_op_b2i16:
   case nir_op_b2i32:
   case nir_op_b2i64:
   case nir_op_b2f16:
   case nir_op_b2f32:
   case nir_op_b2f64:
      if (try_emit_b2fi_of_inot(bld, result, instr))
         break;
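      /* NIR booleans are 0/-1, so flipping the negate on the D-typed source
       * turns them into the 0/1 that the conversion MOV below expects.
       */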
      op[0].type = BRW_REGISTER_TYPE_D;
      op[0].negate = !op[0].negate;
      FALLTHROUGH;
   case nir_op_i2f64:
   case nir_op_i2i64:
   case nir_op_u2f64:
   case nir_op_u2u64:
   case nir_op_f2f64:
   case nir_op_f2i64:
   case nir_op_f2u64:
   case nir_op_i2i32:
   case nir_op_u2u32:
   case nir_op_f2i32:
   case nir_op_f2u32:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_f2i16:
   case nir_op_f2u16:
   case nir_op_f2i8:
   case nir_op_f2u8:
      if (result.type == BRW_REGISTER_TYPE_B ||
          result.type == BRW_REGISTER_TYPE_UB ||
          result.type == BRW_REGISTER_TYPE_HF)
         assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */

      if (op[0].type == BRW_REGISTER_TYPE_B ||
          op[0].type == BRW_REGISTER_TYPE_UB ||
          op[0].type == BRW_REGISTER_TYPE_HF)
         assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */

      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_i2i8:
   case nir_op_u2u8:
      assert(type_sz(op[0].type) < 8); /* brw_nir_lower_conversions */
      FALLTHROUGH;
   case nir_op_i2i16:
   case nir_op_u2u16: {
      /* Emit better code for u2u8(extract_u8(a, b)) and similar patterns.
       * Emitting the instructions one by one results in two MOV instructions
       * that won't be propagated. By handling both instructions here, a
       * single MOV is emitted.
       */
      nir_alu_instr *extract_instr = nir_src_as_alu_instr(instr->src[0].src);
      if (extract_instr != NULL) {
         if (extract_instr->op == nir_op_extract_u8 ||
             extract_instr->op == nir_op_extract_i8) {
            prepare_alu_destination_and_sources(bld, extract_instr, op, false);

            const unsigned byte = nir_src_as_uint(extract_instr->src[1].src);
            const brw_reg_type type =
               brw_int_type(1, extract_instr->op == nir_op_extract_i8);

            op[0] = subscript(op[0], type, byte);
         } else if (extract_instr->op == nir_op_extract_u16 ||
                    extract_instr->op == nir_op_extract_i16) {
            prepare_alu_destination_and_sources(bld, extract_instr, op, false);

            const unsigned word = nir_src_as_uint(extract_instr->src[1].src);
            const brw_reg_type type =
               brw_int_type(2, extract_instr->op == nir_op_extract_i16);

            op[0] = subscript(op[0], type, word);
         }
      }

      inst = bld.MOV(result, op[0]);
      break;
   }

   case nir_op_fsat:
      inst = bld.MOV(result, op[0]);
      inst->saturate = true;
      break;

   case nir_op_fneg:
   case nir_op_ineg:
      op[0].negate = true;
      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_fabs:
   case nir_op_iabs:
      op[0].negate = false;
      op[0].abs = true;
      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_f2f32:
      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
         brw_rnd_mode rnd =
            brw_rnd_mode_from_execution_mode(execution_mode);
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
                  brw_imm_d(rnd));
      }

      if (op[0].type == BRW_REGISTER_TYPE_HF)
         assert(type_sz(result.type) < 8); /* brw_nir_lower_conversions */

      inst = bld.MOV(result, op[0]);
      break;

   case nir_op_fsign:
      emit_fsign(bld, instr, result, op, 0);
      break;

   case nir_op_frcp:
      inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
      break;

   case nir_op_fexp2:
      inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
      break;

   case nir_op_flog2:
      inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
      break;

   case nir_op_fsin:
      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
      break;

   case nir_op_fcos:
      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
      break;

   case nir_op_fddx:
      if (fs_key->high_quality_derivatives) {
         inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
      } else {
         inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
      }
      break;
   case nir_op_fddx_fine:
      inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
      break;
   case nir_op_fddx_coarse:
      inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
      break;
   case nir_op_fddy:
      if (fs_key->high_quality_derivatives) {
         inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
      } else {
         inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
      }
      break;
   case nir_op_fddy_fine:
      inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
      break;
   case nir_op_fddy_coarse:
      inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
      break;

   case nir_op_fadd:
      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
         brw_rnd_mode rnd =
            brw_rnd_mode_from_execution_mode(execution_mode);
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
                  brw_imm_d(rnd));
      }
      FALLTHROUGH;
   case nir_op_iadd:
      inst = bld.ADD(result, op[0], op[1]);
      break;

   case nir_op_iadd3:
      inst = bld.ADD3(result, op[0], op[1], op[2]);
      break;

   case nir_op_iadd_sat:
   case nir_op_uadd_sat:
      inst = bld.ADD(result, op[0], op[1]);
      inst->saturate = true;
      break;

   case nir_op_isub_sat:
      bld.emit(SHADER_OPCODE_ISUB_SAT, result, op[0], op[1]);
      break;

   case nir_op_usub_sat:
      bld.emit(SHADER_OPCODE_USUB_SAT, result, op[0], op[1]);
      break;

   case nir_op_irhadd:
   case nir_op_urhadd:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      inst = bld.AVG(result, op[0], op[1]);
      break;

   case nir_op_ihadd:
   case nir_op_uhadd: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      fs_reg tmp = bld.vgrf(result.type);

      if (devinfo->ver >= 8) {
         op[0] = resolve_source_modifiers(op[0]);
         op[1] = resolve_source_modifiers(op[1]);
      }

      /* AVG(x, y) - ((x ^ y) & 1) */
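      /* AVG rounds up: it computes (x + y + 1) >> 1 without overflowing.
       * Subtracting (x ^ y) & 1 -- the carry-in that makes an odd sum round
       * up -- yields the floored (x + y) >> 1 that [iu]hadd requires.
       */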
      bld.XOR(tmp, op[0], op[1]);
      bld.AND(tmp, tmp, retype(brw_imm_ud(1), result.type));
      bld.AVG(result, op[0], op[1]);
      inst = bld.ADD(result, result, tmp);
      inst->src[1].negate = true;
      break;
   }

   case nir_op_fmul:
      for (unsigned i = 0; i < 2; i++) {
         if (can_fuse_fmul_fsign(instr, i)) {
            emit_fsign(bld, instr, result, op, i);
            return;
         }
      }

      /* We emit the rounding mode after the previous fsign optimization since
       * it won't result in a MUL, but will try to negate the value by other
       * means.
       */
      if (nir_has_any_rounding_mode_enabled(execution_mode)) {
         brw_rnd_mode rnd =
            brw_rnd_mode_from_execution_mode(execution_mode);
         bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(),
                  brw_imm_d(rnd));
      }

      inst = bld.MUL(result, op[0], op[1]);
      break;

   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      bld.MUL(result, op[0], op[1]);
      break;

   case nir_op_imul_32x16:
   case nir_op_umul_32x16: {
      const bool ud = instr->op == nir_op_umul_32x16;

      assert(nir_dest_bit_size(instr->dest.dest) == 32);

      /* Before Gfx7, the order of the 32-bit source and the 16-bit source was
       * swapped. The extension isn't enabled on those platforms, so don't
       * pretend to support the differences.
       */
      assert(devinfo->ver >= 7);

      if (op[1].file == IMM)
         op[1] = ud ? brw_imm_uw(op[1].ud) : brw_imm_w(op[1].d);
      else {
         const enum brw_reg_type word_type =
            ud ? BRW_REGISTER_TYPE_UW : BRW_REGISTER_TYPE_W;

         op[1] = subscript(op[1], word_type, 0);
      }

      const enum brw_reg_type dword_type =
         ud ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;

      bld.MUL(result, retype(op[0], dword_type), op[1]);
      break;
   }

   case nir_op_imul:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.MUL(result, op[0], op[1]);
      break;

   case nir_op_imul_high:
   case nir_op_umul_high:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
      break;

   case nir_op_idiv:
   case nir_op_udiv:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
      break;

   case nir_op_uadd_carry:
      unreachable("Should have been lowered by carry_to_arith().");

   case nir_op_usub_borrow:
      unreachable("Should have been lowered by borrow_to_arith().");

   case nir_op_umod:
   case nir_op_irem:
      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
       * appears that our hardware just does the right thing for signed
       * remainder.
       */
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
      break;

   case nir_op_imod: {
      /* Get a regular C-style remainder. If a % b == 0, set the predicate. */
      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);

      /* Math instructions don't support conditional mod */
      inst = bld.MOV(bld.null_reg_d(), result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;

      /* Now, we need to determine if signs of the sources are different.
       * When we XOR the sources, the top bit is 0 if they are the same and 1
       * if they are different. We can then use a conditional modifier to
       * turn that into a predicate. This leads us to an XOR.l instruction.
       *
       * Technically, according to the PRM, you're not allowed to use .l on a
       * XOR instruction. However, empirical experiments and Curro's reading
       * of the simulator source both indicate that it's safe.
       */
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
      inst = bld.XOR(tmp, op[0], op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->conditional_mod = BRW_CONDITIONAL_L;

      /* If the result of the initial remainder operation is non-zero and the
       * two sources have different signs, add in a copy of op[1] to get the
       * final integer modulus value.
       */
      inst = bld.ADD(result, result, op[1]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }

   case nir_op_flt32:
   case nir_op_fge32:
   case nir_op_feq32:
   case nir_op_fneu32: {
      fs_reg dest = result;

      const uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
      if (bit_size != 32)
         dest = bld.vgrf(op[0].type, 1);

      bld.CMP(dest, op[0], op[1], brw_cmod_for_nir_comparison(instr->op));

      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
      } else if (bit_size < 32) {
         /* When we convert the result to 32-bit we need to be careful and do
          * it as a signed conversion to get sign extension (for 32-bit true)
          */
         const brw_reg_type src_type =
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);

         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
      }
      break;
   }

   case nir_op_ilt32:
   case nir_op_ult32:
   case nir_op_ige32:
   case nir_op_uge32:
   case nir_op_ieq32:
   case nir_op_ine32: {
      fs_reg dest = result;

      const uint32_t bit_size = type_sz(op[0].type) * 8;
      if (bit_size != 32)
         dest = bld.vgrf(op[0].type, 1);

      bld.CMP(dest, op[0], op[1],
              brw_cmod_for_nir_comparison(instr->op));

      if (bit_size > 32) {
         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
      } else if (bit_size < 32) {
         /* When we convert the result to 32-bit we need to be careful and do
          * it as a signed conversion to get sign extension (for 32-bit true)
          */
         const brw_reg_type src_type =
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_D);

         bld.MOV(retype(result, BRW_REGISTER_TYPE_D), retype(dest, src_type));
      }
      break;
   }

   case nir_op_inot:
      if (devinfo->ver >= 8) {
         nir_alu_instr *inot_src_instr = nir_src_as_alu_instr(instr->src[0].src);

         if (inot_src_instr != NULL &&
             (inot_src_instr->op == nir_op_ior ||
              inot_src_instr->op == nir_op_ixor ||
              inot_src_instr->op == nir_op_iand)) {
            /* The sources of the source logical instruction are now the
             * sources of the instruction that will be generated.
             */
            prepare_alu_destination_and_sources(bld, inot_src_instr, op, false);
            resolve_inot_sources(bld, inot_src_instr, op);

            /* Smash all of the sources and destination to be signed. This
             * doesn't matter for the operation of the instruction, but cmod
             * propagation fails on unsigned sources with negation (due to
             * fs_inst::can_do_cmod returning false).
             */
            result.type =
               brw_type_for_nir_type(devinfo,
                                     (nir_alu_type)(nir_type_int |
                                                    nir_dest_bit_size(instr->dest.dest)));
            op[0].type =
               brw_type_for_nir_type(devinfo,
                                     (nir_alu_type)(nir_type_int |
                                                    nir_src_bit_size(inot_src_instr->src[0].src)));
            op[1].type =
               brw_type_for_nir_type(devinfo,
                                     (nir_alu_type)(nir_type_int |
                                                    nir_src_bit_size(inot_src_instr->src[1].src)));

            /* For XOR, only invert one of the sources. Arbitrarily choose
             * the first source.
             */
            op[0].negate = !op[0].negate;
            if (inot_src_instr->op != nir_op_ixor)
               op[1].negate = !op[1].negate;
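
            /* De Morgan: ~(a | b) == ~a & ~b and ~(a & b) == ~a | ~b, so
             * both sources were inverted above; for XOR, ~(a ^ b) == ~a ^ b,
             * so only the first source was.
             */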
            switch (inot_src_instr->op) {
            case nir_op_ior:
               bld.AND(result, op[0], op[1]);
               return;

            case nir_op_iand:
               bld.OR(result, op[0], op[1]);
               return;

            case nir_op_ixor:
               bld.XOR(result, op[0], op[1]);
               return;

            default:
               unreachable("impossible opcode");
            }
         }
         op[0] = resolve_source_modifiers(op[0]);
      }
      bld.NOT(result, op[0]);
      break;
   case nir_op_ixor:
      if (devinfo->ver >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.XOR(result, op[0], op[1]);
      break;
   case nir_op_ior:
      if (devinfo->ver >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.OR(result, op[0], op[1]);
      break;
   case nir_op_iand:
      if (devinfo->ver >= 8) {
         resolve_inot_sources(bld, instr, op);
      }
      bld.AND(result, op[0], op[1]);
      break;

   case nir_op_fdot2:
   case nir_op_fdot3:
   case nir_op_fdot4:
   case nir_op_b32all_fequal2:
   case nir_op_b32all_iequal2:
   case nir_op_b32all_fequal3:
   case nir_op_b32all_iequal3:
   case nir_op_b32all_fequal4:
   case nir_op_b32all_iequal4:
   case nir_op_b32any_fnequal2:
   case nir_op_b32any_inequal2:
   case nir_op_b32any_fnequal3:
   case nir_op_b32any_inequal3:
   case nir_op_b32any_fnequal4:
   case nir_op_b32any_inequal4:
      unreachable("Lowered by nir_lower_alu_reductions");

   case nir_op_ldexp:
      unreachable("not reached: should be handled by ldexp_to_arith()");

   case nir_op_fsqrt:
      inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
      break;

   case nir_op_frsq:
      inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
      break;

   case nir_op_i2b32:
   case nir_op_f2b32: {
      uint32_t bit_size = nir_src_bit_size(instr->src[0].src);
      if (bit_size == 64) {
         /* two-argument instructions can't take 64-bit immediates */
         fs_reg zero;
         fs_reg tmp;

         if (instr->op == nir_op_f2b32) {
            zero = vgrf(glsl_type::double_type);
            tmp = vgrf(glsl_type::double_type);
            bld.MOV(zero, setup_imm_df(bld, 0.0));
         } else {
            zero = vgrf(glsl_type::int64_t_type);
            tmp = vgrf(glsl_type::int64_t_type);
            bld.MOV(zero, brw_imm_q(0));
         }

         /* A SIMD16 execution needs to be split in two instructions, so use
          * a vgrf instead of the flag register as dst so instruction splitting
          * works
          */
         bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
         bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
      } else {
         fs_reg zero;
         if (bit_size == 32) {
            zero = instr->op == nir_op_f2b32 ? brw_imm_f(0.0f) : brw_imm_d(0);
         } else {
            assert(bit_size == 16);
            zero = instr->op == nir_op_f2b32 ?
               retype(brw_imm_w(0), BRW_REGISTER_TYPE_HF) : brw_imm_w(0);
         }
         bld.CMP(result, op[0], zero, BRW_CONDITIONAL_NZ);
      }
      break;
   }

   case nir_op_ftrunc:
      inst = bld.RNDZ(result, op[0]);
      if (devinfo->ver < 6) {
         set_condmod(BRW_CONDITIONAL_R, inst);
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.ADD(result, result, brw_imm_f(1.0f)));
         inst = bld.MOV(result, result); /* for potential saturation */
      }
      break;

   case nir_op_fceil: {
      op[0].negate = !op[0].negate;
      fs_reg temp = vgrf(glsl_type::float_type);
      bld.RNDD(temp, op[0]);
      temp.negate = true;
      inst = bld.MOV(result, temp);
      break;
   }
   case nir_op_ffloor:
      inst = bld.RNDD(result, op[0]);
      break;
   case nir_op_ffract:
      inst = bld.FRC(result, op[0]);
      break;
   case nir_op_fround_even:
      inst = bld.RNDE(result, op[0]);
      if (devinfo->ver < 6) {
         set_condmod(BRW_CONDITIONAL_R, inst);
         set_predicate(BRW_PREDICATE_NORMAL,
                       bld.ADD(result, result, brw_imm_f(1.0f)));
         inst = bld.MOV(result, result); /* for potential saturation */
      }
      break;

   case nir_op_fquantize2f16: {
      fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
      fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
      fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);

      /* The destination stride must be at least as big as the source stride. */
      tmp16.type = BRW_REGISTER_TYPE_W;
      tmp16.stride = 2;

      /* Check for denormal */
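      /* 2^-14 is the smallest normal half-float, so any input with a smaller
       * magnitude would quantize to an f16 denormal; the predicated SEL
       * below flushes those to an appropriately signed zero.
       */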
      fs_reg abs_src0 = op[0];
      abs_src0.abs = true;
      bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
              BRW_CONDITIONAL_L);
      /* Get the appropriately signed zero */
      bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
              retype(op[0], BRW_REGISTER_TYPE_UD),
              brw_imm_ud(0x80000000));
      /* Do the actual F32 -> F16 -> F32 conversion */
      bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
      bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
      /* Select that or zero based on normal status */
      inst = bld.SEL(result, zero, tmp32);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }

   case nir_op_imin:
   case nir_op_umin:
   case nir_op_fmin:
      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
      break;

   case nir_op_imax:
   case nir_op_umax:
   case nir_op_fmax:
      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
      break;

   case nir_op_pack_snorm_2x16:
   case nir_op_pack_snorm_4x8:
   case nir_op_pack_unorm_2x16:
   case nir_op_pack_unorm_4x8:
   case nir_op_unpack_snorm_2x16:
   case nir_op_unpack_snorm_4x8:
   case nir_op_unpack_unorm_2x16:
   case nir_op_unpack_unorm_4x8:
   case nir_op_unpack_half_2x16:
   case nir_op_pack_half_2x16:
      unreachable("not reached: should be handled by lower_packing_builtins");

   case nir_op_unpack_half_2x16_split_x_flush_to_zero:
      assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
      FALLTHROUGH;
   case nir_op_unpack_half_2x16_split_x:
      inst = bld.emit(BRW_OPCODE_F16TO32, result,
                      subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
      break;

   case nir_op_unpack_half_2x16_split_y_flush_to_zero:
      assert(FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 & execution_mode);
      FALLTHROUGH;
   case nir_op_unpack_half_2x16_split_y:
      inst = bld.emit(BRW_OPCODE_F16TO32, result,
                      subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
      break;

   case nir_op_pack_64_2x32_split:
   case nir_op_pack_32_2x16_split:
      bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
      break;

   case nir_op_pack_32_4x8_split:
      bld.emit(FS_OPCODE_PACK, result, op, 4);
      break;

   case nir_op_unpack_64_2x32_split_x:
   case nir_op_unpack_64_2x32_split_y: {
      if (instr->op == nir_op_unpack_64_2x32_split_x)
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
      else
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
      break;
   }

   case nir_op_unpack_32_2x16_split_x:
   case nir_op_unpack_32_2x16_split_y: {
      if (instr->op == nir_op_unpack_32_2x16_split_x)
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 0));
      else
         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UW, 1));
      break;
   }

   case nir_op_fpow:
      inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
      break;

   case nir_op_bitfield_reverse:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFREV(result, op[0]);
      break;

   case nir_op_bit_count:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.CBIT(result, op[0]);
      break;

   case nir_op_ufind_msb: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      emit_find_msb_using_lzd(bld, result, op[0], false);
      break;
   }

   case nir_op_uclz:
      assert(nir_dest_bit_size(instr->dest.dest) == 32);
      bld.LZD(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
      break;

   case nir_op_ifind_msb: {
      assert(nir_dest_bit_size(instr->dest.dest) < 64);

      if (devinfo->ver < 7) {
         emit_find_msb_using_lzd(bld, result, op[0], true);
      } else {
         bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);

         /* FBH counts from the MSB side, while GLSL's findMSB() wants the
          * count from the LSB side. If FBH didn't return an error
          * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
          * count into an LSB count.
          */
         bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);

         inst = bld.ADD(result, result, brw_imm_d(31));
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->src[0].negate = true;
      }
      break;
   }

   case nir_op_find_lsb:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);

      if (devinfo->ver < 7) {
         fs_reg temp = vgrf(glsl_type::int_type);

         /* (x & -x) generates a value that consists of only the LSB of x.
          * For all powers of 2, findMSB(y) == findLSB(y).
          */
         fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
         fs_reg negated_src = src;

         /* One must be negated, and the other must be non-negated. It
          * doesn't matter which is which.
          */
         negated_src.negate = true;
         src.negate = false;

         bld.AND(temp, src, negated_src);
         emit_find_msb_using_lzd(bld, result, temp, false);
      } else {
         bld.FBL(result, op[0]);
      }
      break;

   case nir_op_ubitfield_extract:
   case nir_op_ibitfield_extract:
      unreachable("should have been lowered");
   case nir_op_ubfe:
   case nir_op_ibfe:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFE(result, op[2], op[1], op[0]);
      break;
   case nir_op_bfm:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFI1(result, op[0], op[1]);
      break;
   case nir_op_bfi:
      assert(nir_dest_bit_size(instr->dest.dest) < 64);
      bld.BFI2(result, op[0], op[1], op[2]);
      break;

   case nir_op_bitfield_insert:
      unreachable("not reached: should have been lowered");

   /* For all shift operations:
    *
    * Gen4 - Gen7: After application of source modifiers, the low 5 bits of
    * src1 are used as an unsigned value for the shift count.
    *
    * Gen8: As with earlier platforms, but for Q and UQ types on src0, the
    * low 6 bits of src1 are used.
    *
    * Gen9+: The low bits of src1 matching the size of src0 (e.g., 4-bits for
    * W or UW src0).
    *
    * The implication is that the following instruction will produce a
    * different result on Gen9+ than on previous platforms:
    *
    *    shr(8)    g4<1>UW    g12<8,8,1>UW    0x0010UW
    *
    * where Gen9+ will shift by zero, and earlier platforms will shift by 16.
    *
    * This does not seem to be the case. Experimentally, it has been
    * determined that shifts of 16-bit values on Gen8 behave properly. Shifts
    * of 8-bit values on both Gen8 and Gen9 do not. Gen11+ lowers 8-bit
    * values, so those platforms were not tested. No features expose access
    * to 8- or 16-bit types on Gen7 or earlier, so those platforms were not
    * tested either. See
    * https://gitlab.freedesktop.org/mesa/crucible/-/merge_requests/76.
    *
    * This is part of the reason 8-bit values are lowered to 16-bit on all
    * platforms.
    */
   case nir_op_ishl:
      bld.SHL(result, op[0], op[1]);
      break;
   case nir_op_ishr:
      bld.ASR(result, op[0], op[1]);
      break;
   case nir_op_ushr:
      bld.SHR(result, op[0], op[1]);
      break;
1865 */ 1866 case nir_op_ishl: 1867 bld.SHL(result, op[0], op[1]); 1868 break; 1869 case nir_op_ishr: 1870 bld.ASR(result, op[0], op[1]); 1871 break; 1872 case nir_op_ushr: 1873 bld.SHR(result, op[0], op[1]); 1874 break; 1875 1876 case nir_op_urol: 1877 bld.ROL(result, op[0], op[1]); 1878 break; 1879 case nir_op_uror: 1880 bld.ROR(result, op[0], op[1]); 1881 break; 1882 1883 case nir_op_pack_half_2x16_split: 1884 bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]); 1885 break; 1886 1887 case nir_op_sdot_4x8_iadd: 1888 case nir_op_sdot_4x8_iadd_sat: 1889 inst = bld.DP4A(result, 1890 retype(op[2], BRW_REGISTER_TYPE_D), 1891 retype(op[0], BRW_REGISTER_TYPE_D), 1892 retype(op[1], BRW_REGISTER_TYPE_D)); 1893 1894 if (instr->op == nir_op_sdot_4x8_iadd_sat) 1895 inst->saturate = true; 1896 break; 1897 1898 case nir_op_udot_4x8_uadd: 1899 case nir_op_udot_4x8_uadd_sat: 1900 inst = bld.DP4A(result, 1901 retype(op[2], BRW_REGISTER_TYPE_UD), 1902 retype(op[0], BRW_REGISTER_TYPE_UD), 1903 retype(op[1], BRW_REGISTER_TYPE_UD)); 1904 1905 if (instr->op == nir_op_udot_4x8_uadd_sat) 1906 inst->saturate = true; 1907 break; 1908 1909 case nir_op_sudot_4x8_iadd: 1910 case nir_op_sudot_4x8_iadd_sat: 1911 inst = bld.DP4A(result, 1912 retype(op[2], BRW_REGISTER_TYPE_D), 1913 retype(op[0], BRW_REGISTER_TYPE_D), 1914 retype(op[1], BRW_REGISTER_TYPE_UD)); 1915 1916 if (instr->op == nir_op_sudot_4x8_iadd_sat) 1917 inst->saturate = true; 1918 break; 1919 1920 case nir_op_ffma: 1921 if (nir_has_any_rounding_mode_enabled(execution_mode)) { 1922 brw_rnd_mode rnd = 1923 brw_rnd_mode_from_execution_mode(execution_mode); 1924 bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), 1925 brw_imm_d(rnd)); 1926 } 1927 1928 inst = bld.MAD(result, op[2], op[1], op[0]); 1929 break; 1930 1931 case nir_op_flrp: 1932 if (nir_has_any_rounding_mode_enabled(execution_mode)) { 1933 brw_rnd_mode rnd = 1934 brw_rnd_mode_from_execution_mode(execution_mode); 1935 bld.emit(SHADER_OPCODE_RND_MODE, bld.null_reg_ud(), 1936 brw_imm_d(rnd)); 1937 } 1938 1939 inst = bld.LRP(result, op[0], op[1], op[2]); 1940 break; 1941 1942 case nir_op_b32csel: 1943 if (optimize_frontfacing_ternary(instr, result)) 1944 return; 1945 1946 bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ); 1947 inst = bld.SEL(result, op[1], op[2]); 1948 inst->predicate = BRW_PREDICATE_NORMAL; 1949 break; 1950 1951 case nir_op_extract_u8: 1952 case nir_op_extract_i8: { 1953 unsigned byte = nir_src_as_uint(instr->src[1].src); 1954 1955 /* The PRMs say: 1956 * 1957 * BDW+ 1958 * There is no direct conversion from B/UB to Q/UQ or Q/UQ to B/UB. 1959 * Use two instructions and a word or DWord intermediate integer type. 1960 */ 1961 if (nir_dest_bit_size(instr->dest.dest) == 64) { 1962 const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); 1963 1964 if (instr->op == nir_op_extract_i8) { 1965 /* If we need to sign extend, extract to a word first */ 1966 fs_reg w_temp = bld.vgrf(BRW_REGISTER_TYPE_W); 1967 bld.MOV(w_temp, subscript(op[0], type, byte)); 1968 bld.MOV(result, w_temp); 1969 } else if (byte & 1) { 1970 /* Extract the high byte from the word containing the desired byte 1971 * offset. 
1972 */ 1973 bld.SHR(result, 1974 subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), 1975 brw_imm_uw(8)); 1976 } else { 1977 /* Otherwise use an AND with 0xff and a word type */ 1978 bld.AND(result, 1979 subscript(op[0], BRW_REGISTER_TYPE_UW, byte / 2), 1980 brw_imm_uw(0xff)); 1981 } 1982 } else { 1983 const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8); 1984 bld.MOV(result, subscript(op[0], type, byte)); 1985 } 1986 break; 1987 } 1988 1989 case nir_op_extract_u16: 1990 case nir_op_extract_i16: { 1991 const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16); 1992 unsigned word = nir_src_as_uint(instr->src[1].src); 1993 bld.MOV(result, subscript(op[0], type, word)); 1994 break; 1995 } 1996 1997 default: 1998 unreachable("unhandled instruction"); 1999 } 2000 2001 /* If we need to do a boolean resolve, replace the result with -(x & 1) 2002 * to sign extend the low bit to 0/~0 2003 */ 2004 if (devinfo->ver <= 5 && 2005 !result.is_null() && 2006 (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { 2007 fs_reg masked = vgrf(glsl_type::int_type); 2008 bld.AND(masked, result, brw_imm_d(1)); 2009 masked.negate = true; 2010 bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked); 2011 } 2012} 2013 2014void 2015fs_visitor::nir_emit_load_const(const fs_builder &bld, 2016 nir_load_const_instr *instr) 2017{ 2018 const brw_reg_type reg_type = 2019 brw_reg_type_from_bit_size(instr->def.bit_size, BRW_REGISTER_TYPE_D); 2020 fs_reg reg = bld.vgrf(reg_type, instr->def.num_components); 2021 2022 switch (instr->def.bit_size) { 2023 case 8: 2024 for (unsigned i = 0; i < instr->def.num_components; i++) 2025 bld.MOV(offset(reg, bld, i), setup_imm_b(bld, instr->value[i].i8)); 2026 break; 2027 2028 case 16: 2029 for (unsigned i = 0; i < instr->def.num_components; i++) 2030 bld.MOV(offset(reg, bld, i), brw_imm_w(instr->value[i].i16)); 2031 break; 2032 2033 case 32: 2034 for (unsigned i = 0; i < instr->def.num_components; i++) 2035 bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value[i].i32)); 2036 break; 2037 2038 case 64: 2039 assert(devinfo->ver >= 7); 2040 if (devinfo->ver == 7) { 2041 /* We don't get 64-bit integer types until gfx8 */ 2042 for (unsigned i = 0; i < instr->def.num_components; i++) { 2043 bld.MOV(retype(offset(reg, bld, i), BRW_REGISTER_TYPE_DF), 2044 setup_imm_df(bld, instr->value[i].f64)); 2045 } 2046 } else { 2047 for (unsigned i = 0; i < instr->def.num_components; i++) 2048 bld.MOV(offset(reg, bld, i), brw_imm_q(instr->value[i].i64)); 2049 } 2050 break; 2051 2052 default: 2053 unreachable("Invalid bit size"); 2054 } 2055 2056 nir_ssa_values[instr->def.index] = reg; 2057} 2058 2059fs_reg 2060fs_visitor::get_nir_src(const nir_src &src) 2061{ 2062 fs_reg reg; 2063 if (src.is_ssa) { 2064 if (nir_src_is_undef(src)) { 2065 const brw_reg_type reg_type = 2066 brw_reg_type_from_bit_size(src.ssa->bit_size, BRW_REGISTER_TYPE_D); 2067 reg = bld.vgrf(reg_type, src.ssa->num_components); 2068 } else { 2069 reg = nir_ssa_values[src.ssa->index]; 2070 } 2071 } else { 2072 /* We don't handle indirects on locals */ 2073 assert(src.reg.indirect == NULL); 2074 reg = offset(nir_locals[src.reg.reg->index], bld, 2075 src.reg.base_offset * src.reg.reg->num_components); 2076 } 2077 2078 if (nir_src_bit_size(src) == 64 && devinfo->ver == 7) { 2079 /* The only 64-bit type available on gfx7 is DF, so use that. 
*/ 2080 reg.type = BRW_REGISTER_TYPE_DF; 2081 } else { 2082 /* To avoid floating-point denorm flushing problems, set the type by 2083 * default to an integer type - instructions that need floating point 2084 * semantics will set this to F if they need to 2085 */ 2086 reg.type = brw_reg_type_from_bit_size(nir_src_bit_size(src), 2087 BRW_REGISTER_TYPE_D); 2088 } 2089 2090 return reg; 2091} 2092 2093/** 2094 * Return an IMM for constants; otherwise call get_nir_src() as normal. 2095 * 2096 * This function should not be called on any value which may be 64 bits. 2097 * We could theoretically support 64-bit on gfx8+ but we choose not to 2098 * because it wouldn't work in general (no gfx7 support) and there are 2099 * enough restrictions in 64-bit immediates that you can't take the return 2100 * value and treat it the same as the result of get_nir_src(). 2101 */ 2102fs_reg 2103fs_visitor::get_nir_src_imm(const nir_src &src) 2104{ 2105 assert(nir_src_bit_size(src) == 32); 2106 return nir_src_is_const(src) ? 2107 fs_reg(brw_imm_d(nir_src_as_int(src))) : get_nir_src(src); 2108} 2109 2110fs_reg 2111fs_visitor::get_nir_dest(const nir_dest &dest) 2112{ 2113 if (dest.is_ssa) { 2114 const brw_reg_type reg_type = 2115 brw_reg_type_from_bit_size(dest.ssa.bit_size, 2116 dest.ssa.bit_size == 8 ? 2117 BRW_REGISTER_TYPE_D : 2118 BRW_REGISTER_TYPE_F); 2119 nir_ssa_values[dest.ssa.index] = 2120 bld.vgrf(reg_type, dest.ssa.num_components); 2121 bld.UNDEF(nir_ssa_values[dest.ssa.index]); 2122 return nir_ssa_values[dest.ssa.index]; 2123 } else { 2124 /* We don't handle indirects on locals */ 2125 assert(dest.reg.indirect == NULL); 2126 return offset(nir_locals[dest.reg.reg->index], bld, 2127 dest.reg.base_offset * dest.reg.reg->num_components); 2128 } 2129} 2130 2131void 2132fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst, 2133 unsigned wr_mask) 2134{ 2135 for (unsigned i = 0; i < 4; i++) { 2136 if (!((wr_mask >> i) & 1)) 2137 continue; 2138 2139 fs_inst *new_inst = new(mem_ctx) fs_inst(inst); 2140 new_inst->dst = offset(new_inst->dst, bld, i); 2141 for (unsigned j = 0; j < new_inst->sources; j++) 2142 if (new_inst->src[j].file == VGRF) 2143 new_inst->src[j] = offset(new_inst->src[j], bld, i); 2144 2145 bld.emit(new_inst); 2146 } 2147} 2148 2149static fs_inst * 2150emit_pixel_interpolater_send(const fs_builder &bld, 2151 enum opcode opcode, 2152 const fs_reg &dst, 2153 const fs_reg &src, 2154 const fs_reg &desc, 2155 glsl_interp_mode interpolation) 2156{ 2157 struct brw_wm_prog_data *wm_prog_data = 2158 brw_wm_prog_data(bld.shader->stage_prog_data); 2159 2160 fs_inst *inst = bld.emit(opcode, dst, src, desc); 2161 /* 2 floats per slot returned */ 2162 inst->size_written = 2 * dst.component_size(inst->exec_size); 2163 inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE; 2164 2165 wm_prog_data->pulls_bary = true; 2166 2167 return inst; 2168} 2169 2170/** 2171 * Computes 1 << x, given a D/UD register containing some value x. 
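 *
 * Editor's example: x == 5 yields 32. The hardware SHL only honors the
 * low five bits of x, so the result is effectively 1 << (x % 32), which
 * is exactly the behavior the control-data helpers below rely on.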
2172 */ 2173static fs_reg 2174intexp2(const fs_builder &bld, const fs_reg &x) 2175{ 2176 assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D); 2177 2178 fs_reg result = bld.vgrf(x.type, 1); 2179 fs_reg one = bld.vgrf(x.type, 1); 2180 2181 bld.MOV(one, retype(brw_imm_d(1), one.type)); 2182 bld.SHL(result, one, x); 2183 return result; 2184} 2185 2186void 2187fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src) 2188{ 2189 assert(stage == MESA_SHADER_GEOMETRY); 2190 2191 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2192 2193 if (gs_compile->control_data_header_size_bits == 0) 2194 return; 2195 2196 /* We can only do EndPrimitive() functionality when the control data 2197 * consists of cut bits. Fortunately, the only time it isn't is when the 2198 * output type is points, in which case EndPrimitive() is a no-op. 2199 */ 2200 if (gs_prog_data->control_data_format != 2201 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { 2202 return; 2203 } 2204 2205 /* Cut bits use one bit per vertex. */ 2206 assert(gs_compile->control_data_bits_per_vertex == 1); 2207 2208 fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 2209 vertex_count.type = BRW_REGISTER_TYPE_UD; 2210 2211 /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting 2212 * vertex n, 0 otherwise. So all we need to do here is mark bit 2213 * (vertex_count - 1) % 32 in the cut_bits register to indicate that 2214 * EndPrimitive() was called after emitting vertex (vertex_count - 1); 2215 * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. 2216 * 2217 * Note that if EndPrimitive() is called before emitting any vertices, this 2218 * will cause us to set bit 31 of the control_data_bits register to 1. 2219 * That's fine because: 2220 * 2221 * - If max_vertices < 32, then vertex number 31 (zero-based) will never be 2222 * output, so the hardware will ignore cut bit 31. 2223 * 2224 * - If max_vertices == 32, then vertex number 31 is guaranteed to be the 2225 * last vertex, so setting cut bit 31 has no effect (since the primitive 2226 * is automatically ended when the GS terminates). 2227 * 2228 * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the 2229 * control_data_bits register to 0 when the first vertex is emitted. 2230 */ 2231 2232 const fs_builder abld = bld.annotate("end primitive"); 2233 2234 /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ 2235 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2236 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 2237 fs_reg mask = intexp2(abld, prev_count); 2238 /* Note: we're relying on the fact that the GEN SHL instruction only pays 2239 * attention to the lower 5 bits of its second source argument, so on this 2240 * architecture, 1 << (vertex_count - 1) is equivalent to 1 << 2241 * ((vertex_count - 1) % 32). 2242 */ 2243 abld.OR(this->control_data_bits, this->control_data_bits, mask); 2244} 2245 2246void 2247fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count) 2248{ 2249 assert(stage == MESA_SHADER_GEOMETRY); 2250 assert(gs_compile->control_data_bits_per_vertex != 0); 2251 2252 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2253 2254 const fs_builder abld = bld.annotate("emit control data bits"); 2255 const fs_builder fwa_bld = bld.exec_all(); 2256 2257 /* We use a single UD register to accumulate control data bits (32 bits 2258 * for each of the SIMD8 channels). So we need to write a DWord (32 bits) 2259 * at a time. 
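 * (Editor's illustration: a GS limited to 96 output vertices with one
 * cut bit per vertex accumulates 96 bits of control data, i.e. up to
 * three separate DWord writes over the life of the thread.)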
2260 * 2261 * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets. 2262 * We have select a 128-bit group via the Global and Per-Slot Offsets, then 2263 * use the Channel Mask phase to enable/disable which DWord within that 2264 * group to write. (Remember, different SIMD8 channels may have emitted 2265 * different numbers of vertices, so we may need per-slot offsets.) 2266 * 2267 * Channel masking presents an annoying problem: we may have to replicate 2268 * the data up to 4 times: 2269 * 2270 * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data. 2271 * 2272 * To avoid penalizing shaders that emit a small number of vertices, we 2273 * can avoid these sometimes: if the size of the control data header is 2274 * <= 128 bits, then there is only 1 OWord. All SIMD8 channels will land 2275 * land in the same 128-bit group, so we can skip per-slot offsets. 2276 * 2277 * Similarly, if the control data header is <= 32 bits, there is only one 2278 * DWord, so we can skip channel masks. 2279 */ 2280 enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; 2281 2282 fs_reg channel_mask, per_slot_offset; 2283 2284 if (gs_compile->control_data_header_size_bits > 32) { 2285 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 2286 channel_mask = vgrf(glsl_type::uint_type); 2287 } 2288 2289 if (gs_compile->control_data_header_size_bits > 128) { 2290 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT; 2291 per_slot_offset = vgrf(glsl_type::uint_type); 2292 } 2293 2294 /* Figure out which DWord we're trying to write to using the formula: 2295 * 2296 * dword_index = (vertex_count - 1) * bits_per_vertex / 32 2297 * 2298 * Since bits_per_vertex is a power of two, and is known at compile 2299 * time, this can be optimized to: 2300 * 2301 * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) 2302 */ 2303 if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) { 2304 fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2305 fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2306 abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu)); 2307 unsigned log2_bits_per_vertex = 2308 util_last_bit(gs_compile->control_data_bits_per_vertex); 2309 abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex)); 2310 2311 if (per_slot_offset.file != BAD_FILE) { 2312 /* Set the per-slot offset to dword_index / 4, so that we'll write to 2313 * the appropriate OWord within the control data header. 2314 */ 2315 abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u)); 2316 } 2317 2318 /* Set the channel masks to 1 << (dword_index % 4), so that we'll 2319 * write to the appropriate DWORD within the OWORD. 2320 */ 2321 fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2322 fwa_bld.AND(channel, dword_index, brw_imm_ud(3u)); 2323 channel_mask = intexp2(fwa_bld, channel); 2324 /* Then the channel masks need to be in bits 23:16. */ 2325 fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u)); 2326 } 2327 2328 /* Store the control data bits in the message payload and send it. 
*/ 2329 unsigned mlen = 2; 2330 if (channel_mask.file != BAD_FILE) 2331 mlen += 4; /* channel masks, plus 3 extra copies of the data */ 2332 if (per_slot_offset.file != BAD_FILE) 2333 mlen++; 2334 2335 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); 2336 fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen); 2337 unsigned i = 0; 2338 sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 2339 if (per_slot_offset.file != BAD_FILE) 2340 sources[i++] = per_slot_offset; 2341 if (channel_mask.file != BAD_FILE) 2342 sources[i++] = channel_mask; 2343 while (i < mlen) { 2344 sources[i++] = this->control_data_bits; 2345 } 2346 2347 abld.LOAD_PAYLOAD(payload, sources, mlen, mlen); 2348 fs_inst *inst = abld.emit(opcode, reg_undef, payload); 2349 inst->mlen = mlen; 2350 /* We need to increment Global Offset by 256-bits to make room for 2351 * Broadwell's extra "Vertex Count" payload at the beginning of the 2352 * URB entry. Since this is an OWord message, Global Offset is counted 2353 * in 128-bit units, so we must set it to 2. 2354 */ 2355 if (gs_prog_data->static_vertex_count == -1) 2356 inst->offset = 2; 2357} 2358 2359void 2360fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count, 2361 unsigned stream_id) 2362{ 2363 /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ 2364 2365 /* Note: we are calling this *before* increasing vertex_count, so 2366 * this->vertex_count == vertex_count - 1 in the formula above. 2367 */ 2368 2369 /* Stream mode uses 2 bits per vertex */ 2370 assert(gs_compile->control_data_bits_per_vertex == 2); 2371 2372 /* Must be a valid stream */ 2373 assert(stream_id < MAX_VERTEX_STREAMS); 2374 2375 /* Control data bits are initialized to 0 so we don't have to set any 2376 * bits when sending vertices to stream 0. 2377 */ 2378 if (stream_id == 0) 2379 return; 2380 2381 const fs_builder abld = bld.annotate("set stream control data bits", NULL); 2382 2383 /* reg::sid = stream_id */ 2384 fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2385 abld.MOV(sid, brw_imm_ud(stream_id)); 2386 2387 /* reg:shift_count = 2 * (vertex_count - 1) */ 2388 fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2389 abld.SHL(shift_count, vertex_count, brw_imm_ud(1u)); 2390 2391 /* Note: we're relying on the fact that the GEN SHL instruction only pays 2392 * attention to the lower 5 bits of its second source argument, so on this 2393 * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to 2394 * stream_id << ((2 * (vertex_count - 1)) % 32). 2395 */ 2396 fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2397 abld.SHL(mask, sid, shift_count); 2398 abld.OR(this->control_data_bits, this->control_data_bits, mask); 2399} 2400 2401void 2402fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src, 2403 unsigned stream_id) 2404{ 2405 assert(stage == MESA_SHADER_GEOMETRY); 2406 2407 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2408 2409 fs_reg vertex_count = get_nir_src(vertex_count_nir_src); 2410 vertex_count.type = BRW_REGISTER_TYPE_UD; 2411 2412 /* Haswell and later hardware ignores the "Render Stream Select" bits 2413 * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, 2414 * and instead sends all primitives down the pipeline for rasterization. 2415 * If the SOL stage is enabled, "Render Stream Select" is honored and 2416 * primitives bound to non-zero streams are discarded after stream output. 
2417 * 2418 * Since the only purpose of primives sent to non-zero streams is to 2419 * be recorded by transform feedback, we can simply discard all geometry 2420 * bound to these streams when transform feedback is disabled. 2421 */ 2422 if (stream_id > 0 && !nir->info.has_transform_feedback_varyings) 2423 return; 2424 2425 /* If we're outputting 32 control data bits or less, then we can wait 2426 * until the shader is over to output them all. Otherwise we need to 2427 * output them as we go. Now is the time to do it, since we're about to 2428 * output the vertex_count'th vertex, so it's guaranteed that the 2429 * control data bits associated with the (vertex_count - 1)th vertex are 2430 * correct. 2431 */ 2432 if (gs_compile->control_data_header_size_bits > 32) { 2433 const fs_builder abld = 2434 bld.annotate("emit vertex: emit control data bits"); 2435 2436 /* Only emit control data bits if we've finished accumulating a batch 2437 * of 32 bits. This is the case when: 2438 * 2439 * (vertex_count * bits_per_vertex) % 32 == 0 2440 * 2441 * (in other words, when the last 5 bits of vertex_count * 2442 * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some 2443 * integer n (which is always the case, since bits_per_vertex is 2444 * always 1 or 2), this is equivalent to requiring that the last 5-n 2445 * bits of vertex_count are 0: 2446 * 2447 * vertex_count & (2^(5-n) - 1) == 0 2448 * 2449 * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is 2450 * equivalent to: 2451 * 2452 * vertex_count & (32 / bits_per_vertex - 1) == 0 2453 * 2454 * TODO: If vertex_count is an immediate, we could do some of this math 2455 * at compile time... 2456 */ 2457 fs_inst *inst = 2458 abld.AND(bld.null_reg_d(), vertex_count, 2459 brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u)); 2460 inst->conditional_mod = BRW_CONDITIONAL_Z; 2461 2462 abld.IF(BRW_PREDICATE_NORMAL); 2463 /* If vertex_count is 0, then no control data bits have been 2464 * accumulated yet, so we can skip emitting them. 2465 */ 2466 abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u), 2467 BRW_CONDITIONAL_NEQ); 2468 abld.IF(BRW_PREDICATE_NORMAL); 2469 emit_gs_control_data_bits(vertex_count); 2470 abld.emit(BRW_OPCODE_ENDIF); 2471 2472 /* Reset control_data_bits to 0 so we can start accumulating a new 2473 * batch. 2474 * 2475 * Note: in the case where vertex_count == 0, this neutralizes the 2476 * effect of any call to EndPrimitive() that the shader may have 2477 * made before outputting its first vertex. 2478 */ 2479 inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u)); 2480 inst->force_writemask_all = true; 2481 abld.emit(BRW_OPCODE_ENDIF); 2482 } 2483 2484 emit_urb_writes(vertex_count); 2485 2486 /* In stream mode we have to set control data bits for all vertices 2487 * unless we have disabled control data bits completely (which we do 2488 * do for GL_POINTS outputs that don't use streams). 
2489 */ 2490 if (gs_compile->control_data_header_size_bits > 0 && 2491 gs_prog_data->control_data_format == 2492 GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { 2493 set_gs_stream_control_data_bits(vertex_count, stream_id); 2494 } 2495} 2496 2497void 2498fs_visitor::emit_gs_input_load(const fs_reg &dst, 2499 const nir_src &vertex_src, 2500 unsigned base_offset, 2501 const nir_src &offset_src, 2502 unsigned num_components, 2503 unsigned first_component) 2504{ 2505 assert(type_sz(dst.type) == 4); 2506 struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data); 2507 const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8; 2508 2509 /* TODO: figure out push input layout for invocations == 1 */ 2510 if (gs_prog_data->invocations == 1 && 2511 nir_src_is_const(offset_src) && nir_src_is_const(vertex_src) && 2512 4 * (base_offset + nir_src_as_uint(offset_src)) < push_reg_count) { 2513 int imm_offset = (base_offset + nir_src_as_uint(offset_src)) * 4 + 2514 nir_src_as_uint(vertex_src) * push_reg_count; 2515 for (unsigned i = 0; i < num_components; i++) { 2516 bld.MOV(offset(dst, bld, i), 2517 fs_reg(ATTR, imm_offset + i + first_component, dst.type)); 2518 } 2519 return; 2520 } 2521 2522 /* Resort to the pull model. Ensure the VUE handles are provided. */ 2523 assert(gs_prog_data->base.include_vue_handles); 2524 2525 unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2; 2526 fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2527 2528 if (gs_prog_data->invocations == 1) { 2529 if (nir_src_is_const(vertex_src)) { 2530 /* The vertex index is constant; just select the proper URB handle. */ 2531 icp_handle = 2532 retype(brw_vec8_grf(first_icp_handle + nir_src_as_uint(vertex_src), 0), 2533 BRW_REGISTER_TYPE_UD); 2534 } else { 2535 /* The vertex index is non-constant. We need to use indirect 2536 * addressing to fetch the proper URB handle. 2537 * 2538 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0> 2539 * indicating that channel <n> should read the handle from 2540 * DWord <n>. We convert that to bytes by multiplying by 4. 2541 * 2542 * Next, we convert the vertex index to bytes by multiplying 2543 * by 32 (shifting by 5), and add the two together. This is 2544 * the final indirect byte offset. 2545 */ 2546 fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1); 2547 fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2548 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2549 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2550 2551 /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */ 2552 bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210))); 2553 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */ 2554 bld.SHL(channel_offsets, sequence, brw_imm_ud(2u)); 2555 /* Convert vertex_index to bytes (multiply by 32) */ 2556 bld.SHL(vertex_offset_bytes, 2557 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2558 brw_imm_ud(5u)); 2559 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets); 2560 2561 /* Use first_icp_handle as the base offset. There is one register 2562 * of URB handles per vertex, so inform the register allocator that 2563 * we might read up to nir->info.gs.vertices_in registers. 
2564 */ 2565 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2566 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2567 fs_reg(icp_offset_bytes), 2568 brw_imm_ud(nir->info.gs.vertices_in * REG_SIZE)); 2569 } 2570 } else { 2571 assert(gs_prog_data->invocations > 1); 2572 2573 if (nir_src_is_const(vertex_src)) { 2574 unsigned vertex = nir_src_as_uint(vertex_src); 2575 assert(devinfo->ver >= 9 || vertex <= 5); 2576 bld.MOV(icp_handle, 2577 retype(brw_vec1_grf(first_icp_handle + vertex / 8, vertex % 8), 2578 BRW_REGISTER_TYPE_UD)); 2579 } else { 2580 /* The vertex index is non-constant. We need to use indirect 2581 * addressing to fetch the proper URB handle. 2582 * 2583 */ 2584 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2585 2586 /* Convert vertex_index to bytes (multiply by 4) */ 2587 bld.SHL(icp_offset_bytes, 2588 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2589 brw_imm_ud(2u)); 2590 2591 /* Use first_icp_handle as the base offset. There is one DWord 2592 * of URB handles per vertex, so inform the register allocator that 2593 * we might read up to ceil(nir->info.gs.vertices_in / 8) registers. 2594 */ 2595 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2596 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2597 fs_reg(icp_offset_bytes), 2598 brw_imm_ud(DIV_ROUND_UP(nir->info.gs.vertices_in, 8) * 2599 REG_SIZE)); 2600 } 2601 } 2602 2603 fs_inst *inst; 2604 fs_reg indirect_offset = get_nir_src(offset_src); 2605 2606 if (nir_src_is_const(offset_src)) { 2607 /* Constant indexing - use global offset. */ 2608 if (first_component != 0) { 2609 unsigned read_components = num_components + first_component; 2610 fs_reg tmp = bld.vgrf(dst.type, read_components); 2611 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); 2612 inst->size_written = read_components * 2613 tmp.component_size(inst->exec_size); 2614 for (unsigned i = 0; i < num_components; i++) { 2615 bld.MOV(offset(dst, bld, i), 2616 offset(tmp, bld, i + first_component)); 2617 } 2618 } else { 2619 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); 2620 inst->size_written = num_components * 2621 dst.component_size(inst->exec_size); 2622 } 2623 inst->offset = base_offset + nir_src_as_uint(offset_src); 2624 inst->mlen = 1; 2625 } else { 2626 /* Indirect indexing - use per-slot offsets as well. */ 2627 const fs_reg srcs[] = { icp_handle, indirect_offset }; 2628 unsigned read_components = num_components + first_component; 2629 fs_reg tmp = bld.vgrf(dst.type, read_components); 2630 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2631 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2632 if (first_component != 0) { 2633 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2634 payload); 2635 inst->size_written = read_components * 2636 tmp.component_size(inst->exec_size); 2637 for (unsigned i = 0; i < num_components; i++) { 2638 bld.MOV(offset(dst, bld, i), 2639 offset(tmp, bld, i + first_component)); 2640 } 2641 } else { 2642 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, payload); 2643 inst->size_written = num_components * 2644 dst.component_size(inst->exec_size); 2645 } 2646 inst->offset = base_offset; 2647 inst->mlen = 2; 2648 } 2649} 2650 2651fs_reg 2652fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr) 2653{ 2654 nir_src *offset_src = nir_get_io_offset_src(instr); 2655 2656 if (nir_src_is_const(*offset_src)) { 2657 /* The only constant offset we should find is 0. 
brw_nir.c's 2658 * add_const_offset_to_base() will fold other constant offsets 2659 * into instr->const_index[0]. 2660 */ 2661 assert(nir_src_as_uint(*offset_src) == 0); 2662 return fs_reg(); 2663 } 2664 2665 return get_nir_src(*offset_src); 2666} 2667 2668void 2669fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld, 2670 nir_intrinsic_instr *instr) 2671{ 2672 assert(stage == MESA_SHADER_VERTEX); 2673 2674 fs_reg dest; 2675 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2676 dest = get_nir_dest(instr->dest); 2677 2678 switch (instr->intrinsic) { 2679 case nir_intrinsic_load_vertex_id: 2680 case nir_intrinsic_load_base_vertex: 2681 unreachable("should be lowered by nir_lower_system_values()"); 2682 2683 case nir_intrinsic_load_input: { 2684 assert(nir_dest_bit_size(instr->dest) == 32); 2685 fs_reg src = fs_reg(ATTR, nir_intrinsic_base(instr) * 4, dest.type); 2686 src = offset(src, bld, nir_intrinsic_component(instr)); 2687 src = offset(src, bld, nir_src_as_uint(instr->src[0])); 2688 2689 for (unsigned i = 0; i < instr->num_components; i++) 2690 bld.MOV(offset(dest, bld, i), offset(src, bld, i)); 2691 break; 2692 } 2693 2694 case nir_intrinsic_load_vertex_id_zero_base: 2695 case nir_intrinsic_load_instance_id: 2696 case nir_intrinsic_load_base_instance: 2697 case nir_intrinsic_load_draw_id: 2698 case nir_intrinsic_load_first_vertex: 2699 case nir_intrinsic_load_is_indexed_draw: 2700 unreachable("lowered by brw_nir_lower_vs_inputs"); 2701 2702 default: 2703 nir_emit_intrinsic(bld, instr); 2704 break; 2705 } 2706} 2707 2708fs_reg 2709fs_visitor::get_tcs_single_patch_icp_handle(const fs_builder &bld, 2710 nir_intrinsic_instr *instr) 2711{ 2712 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2713 const nir_src &vertex_src = instr->src[0]; 2714 nir_intrinsic_instr *vertex_intrin = nir_src_as_intrinsic(vertex_src); 2715 fs_reg icp_handle; 2716 2717 if (nir_src_is_const(vertex_src)) { 2718 /* Emit a MOV to resolve <0,1,0> regioning. */ 2719 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2720 unsigned vertex = nir_src_as_uint(vertex_src); 2721 bld.MOV(icp_handle, 2722 retype(brw_vec1_grf(1 + (vertex >> 3), vertex & 7), 2723 BRW_REGISTER_TYPE_UD)); 2724 } else if (tcs_prog_data->instances == 1 && vertex_intrin && 2725 vertex_intrin->intrinsic == nir_intrinsic_load_invocation_id) { 2726 /* For the common case of only 1 instance, an array index of 2727 * gl_InvocationID means reading g1. Skip all the indirect work. 2728 */ 2729 icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); 2730 } else { 2731 /* The vertex index is non-constant. We need to use indirect 2732 * addressing to fetch the proper URB handle. 2733 */ 2734 icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2735 2736 /* Each ICP handle is a single DWord (4 bytes) */ 2737 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2738 bld.SHL(vertex_offset_bytes, 2739 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD), 2740 brw_imm_ud(2u)); 2741 2742 /* Start at g1. We might read up to 4 registers. 
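 * (Editor's note: the 4 assumes the API's usual 32-vertex patch limit;
 * 32 ICP handles at one DWord each is 128 bytes, i.e. 4 GRFs.)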
*/
2743 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
2744 retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
2745 brw_imm_ud(4 * REG_SIZE));
2746 }
2747
2748 return icp_handle;
2749 }
2750
2751 fs_reg
2752 fs_visitor::get_tcs_eight_patch_icp_handle(const fs_builder &bld,
2753 nir_intrinsic_instr *instr)
2754 {
2755 struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
2756 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
2757 const nir_src &vertex_src = instr->src[0];
2758
2759 unsigned first_icp_handle = tcs_prog_data->include_primitive_id ? 3 : 2;
2760
2761 if (nir_src_is_const(vertex_src)) {
2762 return fs_reg(retype(brw_vec8_grf(first_icp_handle +
2763 nir_src_as_uint(vertex_src), 0),
2764 BRW_REGISTER_TYPE_UD));
2765 }
2766
2767 /* The vertex index is non-constant. We need to use indirect
2768 * addressing to fetch the proper URB handle.
2769 *
2770 * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
2771 * indicating that channel <n> should read the handle from
2772 * DWord <n>. We convert that to bytes by multiplying by 4.
2773 *
2774 * Next, we convert the vertex index to bytes by multiplying
2775 * by 32 (shifting by 5), and add the two together. This is
2776 * the final indirect byte offset.
2777 */
2778 fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2779 fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_UW, 1);
2780 fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2781 fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2782 fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
2783
2784 /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
2785 bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
2786 /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
2787 bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
2788 /* Convert vertex_index to bytes (multiply by 32) */
2789 bld.SHL(vertex_offset_bytes,
2790 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
2791 brw_imm_ud(5u));
2792 bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
2793
2794 /* Use first_icp_handle as the base offset. There is one register
2795 * of URB handles per vertex, so inform the register allocator that
2796 * we might read up to tcs_key->input_vertices registers.
2797 */ 2798 bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle, 2799 retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type), 2800 icp_offset_bytes, brw_imm_ud(tcs_key->input_vertices * REG_SIZE)); 2801 2802 return icp_handle; 2803} 2804 2805struct brw_reg 2806fs_visitor::get_tcs_output_urb_handle() 2807{ 2808 struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); 2809 2810 if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) { 2811 return retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD); 2812 } else { 2813 assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH); 2814 return retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD); 2815 } 2816} 2817 2818void 2819fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld, 2820 nir_intrinsic_instr *instr) 2821{ 2822 assert(stage == MESA_SHADER_TESS_CTRL); 2823 struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key; 2824 struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data); 2825 struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base; 2826 2827 bool eight_patch = 2828 vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH; 2829 2830 fs_reg dst; 2831 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 2832 dst = get_nir_dest(instr->dest); 2833 2834 switch (instr->intrinsic) { 2835 case nir_intrinsic_load_primitive_id: 2836 bld.MOV(dst, fs_reg(eight_patch ? brw_vec8_grf(2, 0) 2837 : brw_vec1_grf(0, 1))); 2838 break; 2839 case nir_intrinsic_load_invocation_id: 2840 bld.MOV(retype(dst, invocation_id.type), invocation_id); 2841 break; 2842 case nir_intrinsic_load_patch_vertices_in: 2843 bld.MOV(retype(dst, BRW_REGISTER_TYPE_D), 2844 brw_imm_d(tcs_key->input_vertices)); 2845 break; 2846 2847 case nir_intrinsic_control_barrier: { 2848 if (tcs_prog_data->instances == 1) 2849 break; 2850 2851 fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2852 fs_reg m0_2 = component(m0, 2); 2853 2854 const fs_builder chanbld = bld.exec_all().group(1, 0); 2855 2856 /* Zero the message header */ 2857 bld.exec_all().MOV(m0, brw_imm_ud(0u)); 2858 2859 if (devinfo->verx10 >= 125) { 2860 /* From BSpec: 54006, mov r0.2[31:24] into m0.2[31:24] and m0.2[23:16] */ 2861 fs_reg m0_10ub = component(retype(m0, BRW_REGISTER_TYPE_UB), 10); 2862 fs_reg r0_11ub = 2863 stride(suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UB), 11), 2864 0, 1, 0); 2865 bld.exec_all().group(2, 0).MOV(m0_10ub, r0_11ub); 2866 } else if (devinfo->ver >= 11) { 2867 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2868 brw_imm_ud(INTEL_MASK(30, 24))); 2869 2870 /* Set the Barrier Count and the enable bit */ 2871 chanbld.OR(m0_2, m0_2, 2872 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15))); 2873 } else { 2874 /* Copy "Barrier ID" from r0.2, bits 16:13 */ 2875 chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 2876 brw_imm_ud(INTEL_MASK(16, 13))); 2877 2878 /* Shift it up to bits 27:24. 
*/ 2879 chanbld.SHL(m0_2, m0_2, brw_imm_ud(11)); 2880 2881 /* Set the Barrier Count and the enable bit */ 2882 chanbld.OR(m0_2, m0_2, 2883 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15))); 2884 } 2885 2886 bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0); 2887 break; 2888 } 2889 2890 case nir_intrinsic_load_input: 2891 unreachable("nir_lower_io should never give us these."); 2892 break; 2893 2894 case nir_intrinsic_load_per_vertex_input: { 2895 assert(nir_dest_bit_size(instr->dest) == 32); 2896 fs_reg indirect_offset = get_indirect_offset(instr); 2897 unsigned imm_offset = instr->const_index[0]; 2898 fs_inst *inst; 2899 2900 fs_reg icp_handle = 2901 eight_patch ? get_tcs_eight_patch_icp_handle(bld, instr) 2902 : get_tcs_single_patch_icp_handle(bld, instr); 2903 2904 /* We can only read two double components with each URB read, so 2905 * we send two read messages in that case, each one loading up to 2906 * two double components. 2907 */ 2908 unsigned num_components = instr->num_components; 2909 unsigned first_component = nir_intrinsic_component(instr); 2910 2911 if (indirect_offset.file == BAD_FILE) { 2912 /* Constant indexing - use global offset. */ 2913 if (first_component != 0) { 2914 unsigned read_components = num_components + first_component; 2915 fs_reg tmp = bld.vgrf(dst.type, read_components); 2916 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle); 2917 for (unsigned i = 0; i < num_components; i++) { 2918 bld.MOV(offset(dst, bld, i), 2919 offset(tmp, bld, i + first_component)); 2920 } 2921 } else { 2922 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle); 2923 } 2924 inst->offset = imm_offset; 2925 inst->mlen = 1; 2926 } else { 2927 /* Indirect indexing - use per-slot offsets as well. */ 2928 const fs_reg srcs[] = { icp_handle, indirect_offset }; 2929 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 2930 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 2931 if (first_component != 0) { 2932 unsigned read_components = num_components + first_component; 2933 fs_reg tmp = bld.vgrf(dst.type, read_components); 2934 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 2935 payload); 2936 for (unsigned i = 0; i < num_components; i++) { 2937 bld.MOV(offset(dst, bld, i), 2938 offset(tmp, bld, i + first_component)); 2939 } 2940 } else { 2941 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 2942 payload); 2943 } 2944 inst->offset = imm_offset; 2945 inst->mlen = 2; 2946 } 2947 inst->size_written = (num_components + first_component) * 2948 inst->dst.component_size(inst->exec_size); 2949 2950 /* Copy the temporary to the destination to deal with writemasking. 2951 * 2952 * Also attempt to deal with gl_PointSize being in the .w component. 
2953 */ 2954 if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { 2955 assert(type_sz(dst.type) == 4); 2956 inst->dst = bld.vgrf(dst.type, 4); 2957 inst->size_written = 4 * REG_SIZE; 2958 bld.MOV(dst, offset(inst->dst, bld, 3)); 2959 } 2960 break; 2961 } 2962 2963 case nir_intrinsic_load_output: 2964 case nir_intrinsic_load_per_vertex_output: { 2965 assert(nir_dest_bit_size(instr->dest) == 32); 2966 fs_reg indirect_offset = get_indirect_offset(instr); 2967 unsigned imm_offset = instr->const_index[0]; 2968 unsigned first_component = nir_intrinsic_component(instr); 2969 2970 struct brw_reg output_handles = get_tcs_output_urb_handle(); 2971 2972 fs_inst *inst; 2973 if (indirect_offset.file == BAD_FILE) { 2974 /* This MOV replicates the output handle to all enabled channels 2975 * is SINGLE_PATCH mode. 2976 */ 2977 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 2978 bld.MOV(patch_handle, output_handles); 2979 2980 { 2981 if (first_component != 0) { 2982 unsigned read_components = 2983 instr->num_components + first_component; 2984 fs_reg tmp = bld.vgrf(dst.type, read_components); 2985 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, 2986 patch_handle); 2987 inst->size_written = read_components * REG_SIZE; 2988 for (unsigned i = 0; i < instr->num_components; i++) { 2989 bld.MOV(offset(dst, bld, i), 2990 offset(tmp, bld, i + first_component)); 2991 } 2992 } else { 2993 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, 2994 patch_handle); 2995 inst->size_written = instr->num_components * REG_SIZE; 2996 } 2997 inst->offset = imm_offset; 2998 inst->mlen = 1; 2999 } 3000 } else { 3001 /* Indirect indexing - use per-slot offsets as well. */ 3002 const fs_reg srcs[] = { output_handles, indirect_offset }; 3003 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 3004 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 3005 if (first_component != 0) { 3006 unsigned read_components = 3007 instr->num_components + first_component; 3008 fs_reg tmp = bld.vgrf(dst.type, read_components); 3009 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 3010 payload); 3011 inst->size_written = read_components * REG_SIZE; 3012 for (unsigned i = 0; i < instr->num_components; i++) { 3013 bld.MOV(offset(dst, bld, i), 3014 offset(tmp, bld, i + first_component)); 3015 } 3016 } else { 3017 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst, 3018 payload); 3019 inst->size_written = instr->num_components * REG_SIZE; 3020 } 3021 inst->offset = imm_offset; 3022 inst->mlen = 2; 3023 } 3024 break; 3025 } 3026 3027 case nir_intrinsic_store_output: 3028 case nir_intrinsic_store_per_vertex_output: { 3029 assert(nir_src_bit_size(instr->src[0]) == 32); 3030 fs_reg value = get_nir_src(instr->src[0]); 3031 fs_reg indirect_offset = get_indirect_offset(instr); 3032 unsigned imm_offset = instr->const_index[0]; 3033 unsigned mask = instr->const_index[1]; 3034 unsigned header_regs = 0; 3035 struct brw_reg output_handles = get_tcs_output_urb_handle(); 3036 3037 fs_reg srcs[7]; 3038 srcs[header_regs++] = output_handles; 3039 3040 if (indirect_offset.file != BAD_FILE) { 3041 srcs[header_regs++] = indirect_offset; 3042 } 3043 3044 if (mask == 0) 3045 break; 3046 3047 unsigned num_components = util_last_bit(mask); 3048 enum opcode opcode; 3049 3050 /* We can only pack two 64-bit components in a single message, so send 3051 * 2 messages if we have more components 3052 */ 3053 unsigned first_component = nir_intrinsic_component(instr); 3054 mask = mask << first_component; 3055 3056 if (mask != WRITEMASK_XYZW) { 
3057 srcs[header_regs++] = brw_imm_ud(mask << 16); 3058 opcode = indirect_offset.file != BAD_FILE ? 3059 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT : 3060 SHADER_OPCODE_URB_WRITE_SIMD8_MASKED; 3061 } else { 3062 opcode = indirect_offset.file != BAD_FILE ? 3063 SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT : 3064 SHADER_OPCODE_URB_WRITE_SIMD8; 3065 } 3066 3067 for (unsigned i = 0; i < num_components; i++) { 3068 if (!(mask & (1 << (i + first_component)))) 3069 continue; 3070 3071 srcs[header_regs + i + first_component] = offset(value, bld, i); 3072 } 3073 3074 unsigned mlen = header_regs + num_components + first_component; 3075 fs_reg payload = 3076 bld.vgrf(BRW_REGISTER_TYPE_UD, mlen); 3077 bld.LOAD_PAYLOAD(payload, srcs, mlen, header_regs); 3078 3079 fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload); 3080 inst->offset = imm_offset; 3081 inst->mlen = mlen; 3082 break; 3083 } 3084 3085 default: 3086 nir_emit_intrinsic(bld, instr); 3087 break; 3088 } 3089} 3090 3091void 3092fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld, 3093 nir_intrinsic_instr *instr) 3094{ 3095 assert(stage == MESA_SHADER_TESS_EVAL); 3096 struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data); 3097 3098 fs_reg dest; 3099 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3100 dest = get_nir_dest(instr->dest); 3101 3102 switch (instr->intrinsic) { 3103 case nir_intrinsic_load_primitive_id: 3104 bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1))); 3105 break; 3106 case nir_intrinsic_load_tess_coord: 3107 /* gl_TessCoord is part of the payload in g1-3 */ 3108 for (unsigned i = 0; i < 3; i++) { 3109 bld.MOV(offset(dest, bld, i), fs_reg(brw_vec8_grf(1 + i, 0))); 3110 } 3111 break; 3112 3113 case nir_intrinsic_load_input: 3114 case nir_intrinsic_load_per_vertex_input: { 3115 assert(nir_dest_bit_size(instr->dest) == 32); 3116 fs_reg indirect_offset = get_indirect_offset(instr); 3117 unsigned imm_offset = instr->const_index[0]; 3118 unsigned first_component = nir_intrinsic_component(instr); 3119 3120 fs_inst *inst; 3121 if (indirect_offset.file == BAD_FILE) { 3122 /* Arbitrarily only push up to 32 vec4 slots worth of data, 3123 * which is 16 registers (since each holds 2 vec4 slots). 
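 * (Editor's illustration of the addressing below: imm_offset 7 maps to
 * ATTR register 7 / 2 == 3, starting at component 4 * (7 % 2) == 4 plus
 * any first_component, i.e. the second vec4 in that register.)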
3124 */ 3125 const unsigned max_push_slots = 32; 3126 if (imm_offset < max_push_slots) { 3127 fs_reg src = fs_reg(ATTR, imm_offset / 2, dest.type); 3128 for (int i = 0; i < instr->num_components; i++) { 3129 unsigned comp = 4 * (imm_offset % 2) + i + first_component; 3130 bld.MOV(offset(dest, bld, i), component(src, comp)); 3131 } 3132 3133 tes_prog_data->base.urb_read_length = 3134 MAX2(tes_prog_data->base.urb_read_length, 3135 (imm_offset / 2) + 1); 3136 } else { 3137 /* Replicate the patch handle to all enabled channels */ 3138 const fs_reg srcs[] = { 3139 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD) 3140 }; 3141 fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); 3142 bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0); 3143 3144 if (first_component != 0) { 3145 unsigned read_components = 3146 instr->num_components + first_component; 3147 fs_reg tmp = bld.vgrf(dest.type, read_components); 3148 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, 3149 patch_handle); 3150 inst->size_written = read_components * REG_SIZE; 3151 for (unsigned i = 0; i < instr->num_components; i++) { 3152 bld.MOV(offset(dest, bld, i), 3153 offset(tmp, bld, i + first_component)); 3154 } 3155 } else { 3156 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dest, 3157 patch_handle); 3158 inst->size_written = instr->num_components * REG_SIZE; 3159 } 3160 inst->mlen = 1; 3161 inst->offset = imm_offset; 3162 } 3163 } else { 3164 /* Indirect indexing - use per-slot offsets as well. */ 3165 3166 /* We can only read two double components with each URB read, so 3167 * we send two read messages in that case, each one loading up to 3168 * two double components. 3169 */ 3170 unsigned num_components = instr->num_components; 3171 const fs_reg srcs[] = { 3172 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD), 3173 indirect_offset 3174 }; 3175 fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); 3176 bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0); 3177 3178 if (first_component != 0) { 3179 unsigned read_components = 3180 num_components + first_component; 3181 fs_reg tmp = bld.vgrf(dest.type, read_components); 3182 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp, 3183 payload); 3184 for (unsigned i = 0; i < num_components; i++) { 3185 bld.MOV(offset(dest, bld, i), 3186 offset(tmp, bld, i + first_component)); 3187 } 3188 } else { 3189 inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dest, 3190 payload); 3191 } 3192 inst->mlen = 2; 3193 inst->offset = imm_offset; 3194 inst->size_written = (num_components + first_component) * 3195 inst->dst.component_size(inst->exec_size); 3196 } 3197 break; 3198 } 3199 default: 3200 nir_emit_intrinsic(bld, instr); 3201 break; 3202 } 3203} 3204 3205void 3206fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld, 3207 nir_intrinsic_instr *instr) 3208{ 3209 assert(stage == MESA_SHADER_GEOMETRY); 3210 fs_reg indirect_offset; 3211 3212 fs_reg dest; 3213 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3214 dest = get_nir_dest(instr->dest); 3215 3216 switch (instr->intrinsic) { 3217 case nir_intrinsic_load_primitive_id: 3218 assert(stage == MESA_SHADER_GEOMETRY); 3219 assert(brw_gs_prog_data(prog_data)->include_primitive_id); 3220 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 3221 retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD)); 3222 break; 3223 3224 case nir_intrinsic_load_input: 3225 unreachable("load_input intrinsics are invalid for the GS stage"); 3226 3227 case nir_intrinsic_load_per_vertex_input: 3228 emit_gs_input_load(dest, instr->src[0], 
instr->const_index[0], 3229 instr->src[1], instr->num_components, 3230 nir_intrinsic_component(instr)); 3231 break; 3232 3233 case nir_intrinsic_emit_vertex_with_counter: 3234 emit_gs_vertex(instr->src[0], instr->const_index[0]); 3235 break; 3236 3237 case nir_intrinsic_end_primitive_with_counter: 3238 emit_gs_end_primitive(instr->src[0]); 3239 break; 3240 3241 case nir_intrinsic_set_vertex_and_primitive_count: 3242 bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0])); 3243 break; 3244 3245 case nir_intrinsic_load_invocation_id: { 3246 fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID]; 3247 assert(val.file != BAD_FILE); 3248 dest.type = val.type; 3249 bld.MOV(dest, val); 3250 break; 3251 } 3252 3253 default: 3254 nir_emit_intrinsic(bld, instr); 3255 break; 3256 } 3257} 3258 3259/** 3260 * Fetch the current render target layer index. 3261 */ 3262static fs_reg 3263fetch_render_target_array_index(const fs_builder &bld) 3264{ 3265 if (bld.shader->devinfo->ver >= 12) { 3266 /* The render target array index is provided in the thread payload as 3267 * bits 26:16 of r1.1. 3268 */ 3269 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); 3270 bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 1, 3), 3271 brw_imm_uw(0x7ff)); 3272 return idx; 3273 } else if (bld.shader->devinfo->ver >= 6) { 3274 /* The render target array index is provided in the thread payload as 3275 * bits 26:16 of r0.0. 3276 */ 3277 const fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_UD); 3278 bld.AND(idx, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1), 3279 brw_imm_uw(0x7ff)); 3280 return idx; 3281 } else { 3282 /* Pre-SNB we only ever render into the first layer of the framebuffer 3283 * since layered rendering is not implemented. 3284 */ 3285 return brw_imm_ud(0); 3286 } 3287} 3288 3289/** 3290 * Fake non-coherent framebuffer read implemented using TXF to fetch from the 3291 * framebuffer at the current fragment coordinates and sample index. 3292 */ 3293fs_inst * 3294fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, 3295 unsigned target) 3296{ 3297 const struct intel_device_info *devinfo = bld.shader->devinfo; 3298 3299 assert(bld.shader->stage == MESA_SHADER_FRAGMENT); 3300 const brw_wm_prog_key *wm_key = 3301 reinterpret_cast<const brw_wm_prog_key *>(key); 3302 assert(!wm_key->coherent_fb_fetch); 3303 const struct brw_wm_prog_data *wm_prog_data = 3304 brw_wm_prog_data(stage_prog_data); 3305 3306 /* Calculate the surface index relative to the start of the texture binding 3307 * table block, since that's what the texturing messages expect. 3308 */ 3309 const unsigned surface = target + 3310 wm_prog_data->binding_table.render_target_read_start - 3311 wm_prog_data->base.binding_table.texture_start; 3312 3313 /* Calculate the fragment coordinates. */ 3314 const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3); 3315 bld.MOV(offset(coords, bld, 0), pixel_x); 3316 bld.MOV(offset(coords, bld, 1), pixel_y); 3317 bld.MOV(offset(coords, bld, 2), fetch_render_target_array_index(bld)); 3318 3319 /* Calculate the sample index and MCS payload when multisampling. Luckily 3320 * the MCS fetch message behaves deterministically for UMS surfaces, so it 3321 * shouldn't be necessary to recompile based on whether the framebuffer is 3322 * CMS or UMS. 
3323 */ 3324 if (wm_key->multisample_fbo && 3325 nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE) 3326 nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup(); 3327 3328 const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID]; 3329 const fs_reg mcs = wm_key->multisample_fbo ? 3330 emit_mcs_fetch(coords, 3, brw_imm_ud(surface), fs_reg()) : fs_reg(); 3331 3332 /* Use either a normal or a CMS texel fetch message depending on whether 3333 * the framebuffer is single or multisample. On SKL+ use the wide CMS 3334 * message just in case the framebuffer uses 16x multisampling, it should 3335 * be equivalent to the normal CMS fetch for lower multisampling modes. 3336 */ 3337 const opcode op = !wm_key->multisample_fbo ? SHADER_OPCODE_TXF_LOGICAL : 3338 devinfo->ver >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL : 3339 SHADER_OPCODE_TXF_CMS_LOGICAL; 3340 3341 /* Emit the instruction. */ 3342 fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 3343 srcs[TEX_LOGICAL_SRC_COORDINATE] = coords; 3344 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0); 3345 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = sample; 3346 srcs[TEX_LOGICAL_SRC_MCS] = mcs; 3347 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface); 3348 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); 3349 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(3); 3350 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0); 3351 3352 fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs)); 3353 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3354 3355 return inst; 3356} 3357 3358/** 3359 * Actual coherent framebuffer read implemented using the native render target 3360 * read message. Requires SKL+. 3361 */ 3362static fs_inst * 3363emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target) 3364{ 3365 assert(bld.shader->devinfo->ver >= 9); 3366 fs_inst *inst = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst); 3367 inst->target = target; 3368 inst->size_written = 4 * inst->dst.component_size(inst->exec_size); 3369 3370 return inst; 3371} 3372 3373static fs_reg 3374alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n) 3375{ 3376 if (n && regs[0].file != BAD_FILE) { 3377 return regs[0]; 3378 3379 } else { 3380 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, size); 3381 3382 for (unsigned i = 0; i < n; i++) 3383 regs[i] = tmp; 3384 3385 return tmp; 3386 } 3387} 3388 3389static fs_reg 3390alloc_frag_output(fs_visitor *v, unsigned location) 3391{ 3392 assert(v->stage == MESA_SHADER_FRAGMENT); 3393 const brw_wm_prog_key *const key = 3394 reinterpret_cast<const brw_wm_prog_key *>(v->key); 3395 const unsigned l = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION); 3396 const unsigned i = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX); 3397 3398 if (i > 0 || (key->force_dual_color_blend && l == FRAG_RESULT_DATA1)) 3399 return alloc_temporary(v->bld, 4, &v->dual_src_output, 1); 3400 3401 else if (l == FRAG_RESULT_COLOR) 3402 return alloc_temporary(v->bld, 4, v->outputs, 3403 MAX2(key->nr_color_regions, 1)); 3404 3405 else if (l == FRAG_RESULT_DEPTH) 3406 return alloc_temporary(v->bld, 1, &v->frag_depth, 1); 3407 3408 else if (l == FRAG_RESULT_STENCIL) 3409 return alloc_temporary(v->bld, 1, &v->frag_stencil, 1); 3410 3411 else if (l == FRAG_RESULT_SAMPLE_MASK) 3412 return alloc_temporary(v->bld, 1, &v->sample_mask, 1); 3413 3414 else if (l >= FRAG_RESULT_DATA0 && 3415 l < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS) 3416 return alloc_temporary(v->bld, 4, 3417 &v->outputs[l - FRAG_RESULT_DATA0], 1); 3418 3419 else 3420 
unreachable("Invalid location"); 3421} 3422 3423void 3424fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld, 3425 nir_intrinsic_instr *instr) 3426{ 3427 assert(stage == MESA_SHADER_FRAGMENT); 3428 3429 fs_reg dest; 3430 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3431 dest = get_nir_dest(instr->dest); 3432 3433 switch (instr->intrinsic) { 3434 case nir_intrinsic_load_front_face: 3435 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), 3436 *emit_frontfacing_interpolation()); 3437 break; 3438 3439 case nir_intrinsic_load_sample_pos: { 3440 fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS]; 3441 assert(sample_pos.file != BAD_FILE); 3442 dest.type = sample_pos.type; 3443 bld.MOV(dest, sample_pos); 3444 bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1)); 3445 break; 3446 } 3447 3448 case nir_intrinsic_load_layer_id: 3449 dest.type = BRW_REGISTER_TYPE_UD; 3450 bld.MOV(dest, fetch_render_target_array_index(bld)); 3451 break; 3452 3453 case nir_intrinsic_is_helper_invocation: { 3454 /* Unlike the regular gl_HelperInvocation, that is defined at dispatch, 3455 * the helperInvocationEXT() (aka SpvOpIsHelperInvocationEXT) takes into 3456 * consideration demoted invocations. That information is stored in 3457 * f0.1. 3458 */ 3459 dest.type = BRW_REGISTER_TYPE_UD; 3460 3461 bld.MOV(dest, brw_imm_ud(0)); 3462 3463 fs_inst *mov = bld.MOV(dest, brw_imm_ud(~0)); 3464 mov->predicate = BRW_PREDICATE_NORMAL; 3465 mov->predicate_inverse = true; 3466 mov->flag_subreg = sample_mask_flag_subreg(this); 3467 break; 3468 } 3469 3470 case nir_intrinsic_load_helper_invocation: 3471 case nir_intrinsic_load_sample_mask_in: 3472 case nir_intrinsic_load_sample_id: 3473 case nir_intrinsic_load_frag_shading_rate: { 3474 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3475 fs_reg val = nir_system_values[sv]; 3476 assert(val.file != BAD_FILE); 3477 dest.type = val.type; 3478 bld.MOV(dest, val); 3479 break; 3480 } 3481 3482 case nir_intrinsic_store_output: { 3483 const fs_reg src = get_nir_src(instr->src[0]); 3484 const unsigned store_offset = nir_src_as_uint(instr->src[1]); 3485 const unsigned location = nir_intrinsic_base(instr) + 3486 SET_FIELD(store_offset, BRW_NIR_FRAG_OUTPUT_LOCATION); 3487 const fs_reg new_dest = retype(alloc_frag_output(this, location), 3488 src.type); 3489 3490 for (unsigned j = 0; j < instr->num_components; j++) 3491 bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j), 3492 offset(src, bld, j)); 3493 3494 break; 3495 } 3496 3497 case nir_intrinsic_load_output: { 3498 const unsigned l = GET_FIELD(nir_intrinsic_base(instr), 3499 BRW_NIR_FRAG_OUTPUT_LOCATION); 3500 assert(l >= FRAG_RESULT_DATA0); 3501 const unsigned load_offset = nir_src_as_uint(instr->src[0]); 3502 const unsigned target = l - FRAG_RESULT_DATA0 + load_offset; 3503 const fs_reg tmp = bld.vgrf(dest.type, 4); 3504 3505 if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch) 3506 emit_coherent_fb_read(bld, tmp, target); 3507 else 3508 emit_non_coherent_fb_read(bld, tmp, target); 3509 3510 for (unsigned j = 0; j < instr->num_components; j++) { 3511 bld.MOV(offset(dest, bld, j), 3512 offset(tmp, bld, nir_intrinsic_component(instr) + j)); 3513 } 3514 3515 break; 3516 } 3517 3518 case nir_intrinsic_demote: 3519 case nir_intrinsic_discard: 3520 case nir_intrinsic_terminate: 3521 case nir_intrinsic_demote_if: 3522 case nir_intrinsic_discard_if: 3523 case nir_intrinsic_terminate_if: { 3524 /* We track our discarded pixels in f0.1/f1.0. 
By predicating on it, we 3525 * can update just the flag bits that aren't yet discarded. If there's 3526 * no condition, we emit a CMP of g0 != g0, so all currently executing 3527 * channels will get turned off. 3528 */ 3529 fs_inst *cmp = NULL; 3530 if (instr->intrinsic == nir_intrinsic_demote_if || 3531 instr->intrinsic == nir_intrinsic_discard_if || 3532 instr->intrinsic == nir_intrinsic_terminate_if) { 3533 nir_alu_instr *alu = nir_src_as_alu_instr(instr->src[0]); 3534 3535 if (alu != NULL && 3536 alu->op != nir_op_bcsel && 3537 (devinfo->ver > 5 || 3538 (alu->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) != BRW_NIR_BOOLEAN_NEEDS_RESOLVE || 3539 alu->op == nir_op_fneu32 || alu->op == nir_op_feq32 || 3540 alu->op == nir_op_flt32 || alu->op == nir_op_fge32 || 3541 alu->op == nir_op_ine32 || alu->op == nir_op_ieq32 || 3542 alu->op == nir_op_ilt32 || alu->op == nir_op_ige32 || 3543 alu->op == nir_op_ult32 || alu->op == nir_op_uge32)) { 3544 /* Re-emit the instruction that generated the Boolean value, but 3545 * do not store it. Since this instruction will be conditional, 3546 * other instructions that want to use the real Boolean value may 3547 * get garbage. This was a problem for piglit's fs-discard-exit-2 3548 * test. 3549 * 3550 * Ideally we'd detect that the instruction cannot have a 3551 * conditional modifier before emitting the instructions. Alas, 3552 * that is nigh impossible. Instead, we're going to assume the 3553 * instruction (or last instruction) generated can have a 3554 * conditional modifier. If it cannot, fall back to the old-style 3555 * compare, and hope dead code elimination will clean up the 3556 * extra instructions generated. 3557 */ 3558 nir_emit_alu(bld, alu, false); 3559 3560 cmp = (fs_inst *) instructions.get_tail(); 3561 if (cmp->conditional_mod == BRW_CONDITIONAL_NONE) { 3562 if (cmp->can_do_cmod()) 3563 cmp->conditional_mod = BRW_CONDITIONAL_Z; 3564 else 3565 cmp = NULL; 3566 } else { 3567 /* The old sequence that would have been generated is, 3568 * basically, bool_result == false. This is equivalent to 3569 * !bool_result, so negate the old modifier. 3570 */ 3571 cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod); 3572 } 3573 } 3574 3575 if (cmp == NULL) { 3576 cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]), 3577 brw_imm_d(0), BRW_CONDITIONAL_Z); 3578 } 3579 } else { 3580 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), 3581 BRW_REGISTER_TYPE_UW)); 3582 cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ); 3583 } 3584 3585 cmp->predicate = BRW_PREDICATE_NORMAL; 3586 cmp->flag_subreg = sample_mask_flag_subreg(this); 3587 3588 fs_inst *jump = bld.emit(BRW_OPCODE_HALT); 3589 jump->flag_subreg = sample_mask_flag_subreg(this); 3590 jump->predicate_inverse = true; 3591 3592 if (instr->intrinsic == nir_intrinsic_terminate || 3593 instr->intrinsic == nir_intrinsic_terminate_if) { 3594 jump->predicate = BRW_PREDICATE_NORMAL; 3595 } else { 3596 /* Only jump when the whole quad is demoted. For historical 3597 * reasons this is also used for discard.
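 * Helper pixels have to survive for derivative calculations, so the jump can only fire once an entire aligned 2x2 quad has been turned off: ANY4H tests whether any channel in each group of four still has its flag bit set, and predicate_inverse turns that into "none of the four is live". For example, quad flag bits 0b0010 leave one pixel live and the HALT is skipped; 0b0000 lets it fire.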
3598 */ 3599 jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H; 3600 } 3601 3602 if (devinfo->ver < 7) 3603 limit_dispatch_width( 3604 16, "Fragment discard/demote not implemented in SIMD32 mode.\n"); 3605 break; 3606 } 3607 3608 case nir_intrinsic_load_input: { 3609 /* load_input is only used for flat inputs */ 3610 assert(nir_dest_bit_size(instr->dest) == 32); 3611 unsigned base = nir_intrinsic_base(instr); 3612 unsigned comp = nir_intrinsic_component(instr); 3613 unsigned num_components = instr->num_components; 3614 3615 /* Special case fields in the VUE header */ 3616 if (base == VARYING_SLOT_LAYER) 3617 comp = 1; 3618 else if (base == VARYING_SLOT_VIEWPORT) 3619 comp = 2; 3620 3621 for (unsigned int i = 0; i < num_components; i++) { 3622 bld.MOV(offset(dest, bld, i), 3623 retype(component(interp_reg(base, comp + i), 3), dest.type)); 3624 } 3625 break; 3626 } 3627 3628 case nir_intrinsic_load_fs_input_interp_deltas: { 3629 assert(stage == MESA_SHADER_FRAGMENT); 3630 assert(nir_src_as_uint(instr->src[0]) == 0); 3631 fs_reg interp = interp_reg(nir_intrinsic_base(instr), 3632 nir_intrinsic_component(instr)); 3633 dest.type = BRW_REGISTER_TYPE_F; 3634 bld.MOV(offset(dest, bld, 0), component(interp, 3)); 3635 bld.MOV(offset(dest, bld, 1), component(interp, 1)); 3636 bld.MOV(offset(dest, bld, 2), component(interp, 0)); 3637 break; 3638 } 3639 3640 case nir_intrinsic_load_barycentric_pixel: 3641 case nir_intrinsic_load_barycentric_centroid: 3642 case nir_intrinsic_load_barycentric_sample: { 3643 /* Use the delta_xy values computed from the payload */ 3644 const glsl_interp_mode interp_mode = 3645 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3646 enum brw_barycentric_mode bary = 3647 brw_barycentric_mode(interp_mode, instr->intrinsic); 3648 const fs_reg srcs[] = { offset(this->delta_xy[bary], bld, 0), 3649 offset(this->delta_xy[bary], bld, 1) }; 3650 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); 3651 break; 3652 } 3653 3654 case nir_intrinsic_load_barycentric_at_sample: { 3655 const glsl_interp_mode interpolation = 3656 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3657 3658 if (nir_src_is_const(instr->src[0])) { 3659 unsigned msg_data = nir_src_as_uint(instr->src[0]) << 4; 3660 3661 emit_pixel_interpolater_send(bld, 3662 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3663 dest, 3664 fs_reg(), /* src */ 3665 brw_imm_ud(msg_data), 3666 interpolation); 3667 } else { 3668 const fs_reg sample_src = retype(get_nir_src(instr->src[0]), 3669 BRW_REGISTER_TYPE_UD); 3670 3671 if (nir_src_is_dynamically_uniform(instr->src[0])) { 3672 const fs_reg sample_id = bld.emit_uniformize(sample_src); 3673 const fs_reg msg_data = vgrf(glsl_type::uint_type); 3674 bld.exec_all().group(1, 0) 3675 .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3676 emit_pixel_interpolater_send(bld, 3677 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3678 dest, 3679 fs_reg(), /* src */ 3680 component(msg_data, 0), 3681 interpolation); 3682 } else { 3683 /* Make a loop that sends a message to the pixel interpolator 3684 * for the sample number in each live channel. If there are 3685 * multiple channels with the same sample number then these 3686 * will be handled simultaneously with a single iteration of 3687 * the loop.
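 * For example, if the live channels request sample numbers {0, 0, 1, 1}, the loop below runs twice: emit_uniformize() picks one outstanding sample number, the CMP sets the flag for every channel that requested it, the predicated send services exactly those channels, and the inverted-predicate WHILE repeats while any channel remains.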
3688 */ 3689 bld.emit(BRW_OPCODE_DO); 3690 3691 /* Get the next live sample number into sample_id_reg */ 3692 const fs_reg sample_id = bld.emit_uniformize(sample_src); 3693 3694 /* Set the flag register so that we can perform the send 3695 * message on all channels that have the same sample number 3696 */ 3697 bld.CMP(bld.null_reg_ud(), 3698 sample_src, sample_id, 3699 BRW_CONDITIONAL_EQ); 3700 const fs_reg msg_data = vgrf(glsl_type::uint_type); 3701 bld.exec_all().group(1, 0) 3702 .SHL(msg_data, sample_id, brw_imm_ud(4u)); 3703 fs_inst *inst = 3704 emit_pixel_interpolater_send(bld, 3705 FS_OPCODE_INTERPOLATE_AT_SAMPLE, 3706 dest, 3707 fs_reg(), /* src */ 3708 component(msg_data, 0), 3709 interpolation); 3710 set_predicate(BRW_PREDICATE_NORMAL, inst); 3711 3712 /* Continue the loop if there are any live channels left */ 3713 set_predicate_inv(BRW_PREDICATE_NORMAL, 3714 true, /* inverse */ 3715 bld.emit(BRW_OPCODE_WHILE)); 3716 } 3717 } 3718 break; 3719 } 3720 3721 case nir_intrinsic_load_barycentric_at_offset: { 3722 const glsl_interp_mode interpolation = 3723 (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr); 3724 3725 nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]); 3726 3727 if (const_offset) { 3728 assert(nir_src_bit_size(instr->src[0]) == 32); 3729 unsigned off_x = const_offset[0].u32 & 0xf; 3730 unsigned off_y = const_offset[1].u32 & 0xf; 3731 3732 emit_pixel_interpolater_send(bld, 3733 FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, 3734 dest, 3735 fs_reg(), /* src */ 3736 brw_imm_ud(off_x | (off_y << 4)), 3737 interpolation); 3738 } else { 3739 fs_reg src = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_D); 3740 const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET; 3741 emit_pixel_interpolater_send(bld, 3742 opcode, 3743 dest, 3744 src, 3745 brw_imm_ud(0u), 3746 interpolation); 3747 } 3748 break; 3749 } 3750 3751 case nir_intrinsic_load_frag_coord: 3752 emit_fragcoord_interpolation(dest); 3753 break; 3754 3755 case nir_intrinsic_load_interpolated_input: { 3756 assert(instr->src[0].ssa && 3757 instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic); 3758 nir_intrinsic_instr *bary_intrinsic = 3759 nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr); 3760 nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic; 3761 enum glsl_interp_mode interp_mode = 3762 (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic); 3763 fs_reg dst_xy; 3764 3765 if (bary_intrin == nir_intrinsic_load_barycentric_at_offset || 3766 bary_intrin == nir_intrinsic_load_barycentric_at_sample) { 3767 /* Use the result of the PI message. 
*/ 3768 dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F); 3769 } else { 3770 /* Use the delta_xy values computed from the payload */ 3771 enum brw_barycentric_mode bary = 3772 brw_barycentric_mode(interp_mode, bary_intrin); 3773 dst_xy = this->delta_xy[bary]; 3774 } 3775 3776 for (unsigned int i = 0; i < instr->num_components; i++) { 3777 fs_reg interp = 3778 component(interp_reg(nir_intrinsic_base(instr), 3779 nir_intrinsic_component(instr) + i), 0); 3780 interp.type = BRW_REGISTER_TYPE_F; 3781 dest.type = BRW_REGISTER_TYPE_F; 3782 3783 if (devinfo->ver < 6 && interp_mode == INTERP_MODE_SMOOTH) { 3784 fs_reg tmp = vgrf(glsl_type::float_type); 3785 bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp); 3786 bld.MUL(offset(dest, bld, i), tmp, this->pixel_w); 3787 } else { 3788 bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp); 3789 } 3790 } 3791 break; 3792 } 3793 3794 default: 3795 nir_emit_intrinsic(bld, instr); 3796 break; 3797 } 3798} 3799 3800void 3801fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld, 3802 nir_intrinsic_instr *instr) 3803{ 3804 assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); 3805 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); 3806 3807 fs_reg dest; 3808 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3809 dest = get_nir_dest(instr->dest); 3810 3811 switch (instr->intrinsic) { 3812 case nir_intrinsic_control_barrier: 3813 /* The whole workgroup fits in a single HW thread, so all the 3814 * invocations are already executed lock-step. Instead of an actual 3815 * barrier just emit a scheduling fence, that will generate no code. 3816 */ 3817 if (!nir->info.workgroup_size_variable && 3818 workgroup_size() <= dispatch_width) { 3819 bld.exec_all().group(1, 0).emit(FS_OPCODE_SCHEDULING_FENCE); 3820 break; 3821 } 3822 3823 emit_barrier(); 3824 cs_prog_data->uses_barrier = true; 3825 break; 3826 3827 case nir_intrinsic_load_subgroup_id: 3828 if (devinfo->verx10 >= 125) 3829 bld.AND(retype(dest, BRW_REGISTER_TYPE_UD), 3830 retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), 3831 brw_imm_ud(INTEL_MASK(7, 0))); 3832 else 3833 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), subgroup_id); 3834 break; 3835 3836 case nir_intrinsic_load_local_invocation_id: 3837 case nir_intrinsic_load_workgroup_id: { 3838 gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic); 3839 fs_reg val = nir_system_values[sv]; 3840 assert(val.file != BAD_FILE); 3841 dest.type = val.type; 3842 for (unsigned i = 0; i < 3; i++) 3843 bld.MOV(offset(dest, bld, i), offset(val, bld, i)); 3844 break; 3845 } 3846 3847 case nir_intrinsic_load_num_workgroups: { 3848 assert(nir_dest_bit_size(instr->dest) == 32); 3849 const unsigned surface = 3850 cs_prog_data->binding_table.work_groups_start; 3851 3852 cs_prog_data->uses_num_work_groups = true; 3853 3854 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3855 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(surface); 3856 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3857 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(3); /* num components */ 3858 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = brw_imm_ud(0); 3859 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 3860 fs_inst *inst = 3861 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 3862 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 3863 inst->size_written = 3 * dispatch_width * 4; 3864 break; 3865 } 3866 3867 case nir_intrinsic_shared_atomic_add: 3868 case nir_intrinsic_shared_atomic_imin: 3869 case nir_intrinsic_shared_atomic_umin: 3870 case 
nir_intrinsic_shared_atomic_imax: 3871 case nir_intrinsic_shared_atomic_umax: 3872 case nir_intrinsic_shared_atomic_and: 3873 case nir_intrinsic_shared_atomic_or: 3874 case nir_intrinsic_shared_atomic_xor: 3875 case nir_intrinsic_shared_atomic_exchange: 3876 case nir_intrinsic_shared_atomic_comp_swap: 3877 nir_emit_shared_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); 3878 break; 3879 case nir_intrinsic_shared_atomic_fmin: 3880 case nir_intrinsic_shared_atomic_fmax: 3881 case nir_intrinsic_shared_atomic_fcomp_swap: 3882 nir_emit_shared_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); 3883 break; 3884 3885 case nir_intrinsic_load_shared: { 3886 assert(devinfo->ver >= 7); 3887 assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); 3888 3889 const unsigned bit_size = nir_dest_bit_size(instr->dest); 3890 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3891 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); 3892 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[0]); 3893 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3894 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 3895 3896 /* Make dest unsigned because that's what the temporary will be */ 3897 dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 3898 3899 /* Read the vector */ 3900 assert(nir_dest_bit_size(instr->dest) <= 32); 3901 assert(nir_intrinsic_align(instr) > 0); 3902 if (nir_dest_bit_size(instr->dest) == 32 && 3903 nir_intrinsic_align(instr) >= 4) { 3904 assert(nir_dest_num_components(instr->dest) <= 4); 3905 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 3906 fs_inst *inst = 3907 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 3908 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 3909 inst->size_written = instr->num_components * dispatch_width * 4; 3910 } else { 3911 assert(nir_dest_num_components(instr->dest) == 1); 3912 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 3913 3914 fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 3915 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 3916 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 3917 bld.MOV(dest, subscript(read_result, dest.type, 0)); 3918 } 3919 break; 3920 } 3921 3922 case nir_intrinsic_store_shared: { 3923 assert(devinfo->ver >= 7); 3924 assert(stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_KERNEL); 3925 3926 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 3927 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 3928 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); 3929 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 3930 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 3931 /* No point in masking with sample mask, here we're handling compute 3932 * intrinsics. 
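 * (Sample-mask predication is a fragment-shader concern, where it keeps helper invocations from performing writes; compute dispatches have no helper invocations.)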
3933 */ 3934 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 3935 3936 fs_reg data = get_nir_src(instr->src[0]); 3937 data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 3938 3939 assert(nir_src_bit_size(instr->src[0]) <= 32); 3940 assert(nir_intrinsic_write_mask(instr) == 3941 (1u << instr->num_components) - 1); 3942 assert(nir_intrinsic_align(instr) > 0); 3943 if (nir_src_bit_size(instr->src[0]) == 32 && 3944 nir_intrinsic_align(instr) >= 4) { 3945 assert(nir_src_num_components(instr->src[0]) <= 4); 3946 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 3947 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 3948 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 3949 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 3950 } else { 3951 assert(nir_src_num_components(instr->src[0]) == 1); 3952 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 3953 3954 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 3955 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 3956 3957 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 3958 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 3959 } 3960 break; 3961 } 3962 3963 case nir_intrinsic_load_workgroup_size: { 3964 assert(compiler->lower_variable_group_size); 3965 assert(nir->info.workgroup_size_variable); 3966 for (unsigned i = 0; i < 3; i++) { 3967 bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), 3968 group_size[i]); 3969 } 3970 break; 3971 } 3972 3973 default: 3974 nir_emit_intrinsic(bld, instr); 3975 break; 3976 } 3977} 3978 3979void 3980fs_visitor::nir_emit_bs_intrinsic(const fs_builder &bld, 3981 nir_intrinsic_instr *instr) 3982{ 3983 assert(brw_shader_stage_is_bindless(stage)); 3984 3985 fs_reg dest; 3986 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 3987 dest = get_nir_dest(instr->dest); 3988 3989 switch (instr->intrinsic) { 3990 case nir_intrinsic_load_btd_global_arg_addr_intel: 3991 bld.MOV(dest, retype(brw_vec1_grf(2, 0), dest.type)); 3992 break; 3993 3994 case nir_intrinsic_load_btd_local_arg_addr_intel: 3995 bld.MOV(dest, retype(brw_vec1_grf(2, 2), dest.type)); 3996 break; 3997 3998 case nir_intrinsic_trace_ray_initial_intel: 3999 bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, 4000 bld.null_reg_ud(), 4001 brw_imm_ud(BRW_RT_BVH_LEVEL_WORLD), 4002 brw_imm_ud(GEN_RT_TRACE_RAY_INITAL)); 4003 break; 4004 4005 case nir_intrinsic_trace_ray_commit_intel: 4006 bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, 4007 bld.null_reg_ud(), 4008 brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT), 4009 brw_imm_ud(GEN_RT_TRACE_RAY_COMMIT)); 4010 break; 4011 4012 case nir_intrinsic_trace_ray_continue_intel: 4013 bld.emit(RT_OPCODE_TRACE_RAY_LOGICAL, 4014 bld.null_reg_ud(), 4015 brw_imm_ud(BRW_RT_BVH_LEVEL_OBJECT), 4016 brw_imm_ud(GEN_RT_TRACE_RAY_CONTINUE)); 4017 break; 4018 4019 default: 4020 nir_emit_intrinsic(bld, instr); 4021 break; 4022 } 4023} 4024 4025static fs_reg 4026brw_nir_reduction_op_identity(const fs_builder &bld, 4027 nir_op op, brw_reg_type type) 4028{ 4029 nir_const_value value = nir_alu_binop_identity(op, type_sz(type) * 8); 4030 switch (type_sz(type)) { 4031 case 1: 4032 if (type == BRW_REGISTER_TYPE_UB) { 4033 return brw_imm_uw(value.u8); 4034 } else { 4035 assert(type == BRW_REGISTER_TYPE_B); 4036 return brw_imm_w(value.i8); 4037 } 4038 case 2: 4039 return retype(brw_imm_uw(value.u16), type); 4040 case 4: 4041 return retype(brw_imm_ud(value.u32), type); 4042 case 8: 4043 if (type == BRW_REGISTER_TYPE_DF) 4044 return setup_imm_df(bld, value.f64); 4045 else 4046 return retype(brw_imm_u64(value.u64), type); 
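 /* As a concrete example of the mapping above: nir_op_iadd yields 0 and nir_op_imin yields the type's maximum (INT32_MAX for a 4-byte signed type), so seeding a scan or reduction with the identity leaves the combined result unchanged. */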
4047 default: 4048 unreachable("Invalid type size"); 4049 } 4050} 4051 4052static opcode 4053brw_op_for_nir_reduction_op(nir_op op) 4054{ 4055 switch (op) { 4056 case nir_op_iadd: return BRW_OPCODE_ADD; 4057 case nir_op_fadd: return BRW_OPCODE_ADD; 4058 case nir_op_imul: return BRW_OPCODE_MUL; 4059 case nir_op_fmul: return BRW_OPCODE_MUL; 4060 case nir_op_imin: return BRW_OPCODE_SEL; 4061 case nir_op_umin: return BRW_OPCODE_SEL; 4062 case nir_op_fmin: return BRW_OPCODE_SEL; 4063 case nir_op_imax: return BRW_OPCODE_SEL; 4064 case nir_op_umax: return BRW_OPCODE_SEL; 4065 case nir_op_fmax: return BRW_OPCODE_SEL; 4066 case nir_op_iand: return BRW_OPCODE_AND; 4067 case nir_op_ior: return BRW_OPCODE_OR; 4068 case nir_op_ixor: return BRW_OPCODE_XOR; 4069 default: 4070 unreachable("Invalid reduction operation"); 4071 } 4072} 4073 4074static brw_conditional_mod 4075brw_cond_mod_for_nir_reduction_op(nir_op op) 4076{ 4077 switch (op) { 4078 case nir_op_iadd: return BRW_CONDITIONAL_NONE; 4079 case nir_op_fadd: return BRW_CONDITIONAL_NONE; 4080 case nir_op_imul: return BRW_CONDITIONAL_NONE; 4081 case nir_op_fmul: return BRW_CONDITIONAL_NONE; 4082 case nir_op_imin: return BRW_CONDITIONAL_L; 4083 case nir_op_umin: return BRW_CONDITIONAL_L; 4084 case nir_op_fmin: return BRW_CONDITIONAL_L; 4085 case nir_op_imax: return BRW_CONDITIONAL_GE; 4086 case nir_op_umax: return BRW_CONDITIONAL_GE; 4087 case nir_op_fmax: return BRW_CONDITIONAL_GE; 4088 case nir_op_iand: return BRW_CONDITIONAL_NONE; 4089 case nir_op_ior: return BRW_CONDITIONAL_NONE; 4090 case nir_op_ixor: return BRW_CONDITIONAL_NONE; 4091 default: 4092 unreachable("Invalid reduction operation"); 4093 } 4094} 4095 4096fs_reg 4097fs_visitor::get_nir_image_intrinsic_image(const brw::fs_builder &bld, 4098 nir_intrinsic_instr *instr) 4099{ 4100 fs_reg image = retype(get_nir_src_imm(instr->src[0]), BRW_REGISTER_TYPE_UD); 4101 fs_reg surf_index = image; 4102 4103 if (stage_prog_data->binding_table.image_start > 0) { 4104 if (image.file == BRW_IMMEDIATE_VALUE) { 4105 surf_index = 4106 brw_imm_ud(image.d + stage_prog_data->binding_table.image_start); 4107 } else { 4108 surf_index = vgrf(glsl_type::uint_type); 4109 bld.ADD(surf_index, image, 4110 brw_imm_d(stage_prog_data->binding_table.image_start)); 4111 } 4112 } 4113 4114 return bld.emit_uniformize(surf_index); 4115} 4116 4117fs_reg 4118fs_visitor::get_nir_ssbo_intrinsic_index(const brw::fs_builder &bld, 4119 nir_intrinsic_instr *instr) 4120{ 4121 /* SSBO stores are weird in that their index is in src[1] */ 4122 const bool is_store = 4123 instr->intrinsic == nir_intrinsic_store_ssbo || 4124 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel; 4125 const unsigned src = is_store ? 1 : 0; 4126 4127 if (nir_src_is_const(instr->src[src])) { 4128 unsigned index = stage_prog_data->binding_table.ssbo_start + 4129 nir_src_as_uint(instr->src[src]); 4130 return brw_imm_ud(index); 4131 } else { 4132 fs_reg surf_index = vgrf(glsl_type::uint_type); 4133 bld.ADD(surf_index, get_nir_src(instr->src[src]), 4134 brw_imm_ud(stage_prog_data->binding_table.ssbo_start)); 4135 return bld.emit_uniformize(surf_index); 4136 } 4137} 4138 4139/** 4140 * The offsets we get from NIR act as if each SIMD channel has its own blob 4141 * of contiguous space. However, if we actually place each SIMD channel in 4142 * its own space, we end up with terrible cache performance because each SIMD 4143 * channel accesses a different cache line even when they're all accessing the 4144 * same byte offset.
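 * (In SIMD16, for example, sixteen channels each reading byte 0 of their own blob would touch sixteen different cache lines.)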
To deal with this problem, we swizzle the address using 4145 * a simple algorithm which ensures that any time a SIMD message reads or 4146 * writes the same address, it's all in the same cache line. We have to keep 4147 * the bottom two bits fixed so that we can read/write up to a dword at a time 4148 * and the individual element is contiguous. We do this by splitting the 4149 * address as follows: 4150 * 4151 * 31 4-6 2 0 4152 * +-------------------------------+------------+----------+ 4153 * | Hi address bits | chan index | addr low | 4154 * +-------------------------------+------------+----------+ 4155 * 4156 * In other words, the bottom two address bits stay, and the top 30 get 4157 * shifted up so that we can stick the SIMD channel index in the middle. This 4158 * way, we can access 8, 16, or 32-bit elements and, when accessing a 32-bit 4159 * element at the same logical offset, the scratch read/write instruction acts 4160 * on continuous elements and we get good cache locality. 4161 */ 4162fs_reg 4163fs_visitor::swizzle_nir_scratch_addr(const brw::fs_builder &bld, 4164 const fs_reg &nir_addr, 4165 bool in_dwords) 4166{ 4167 const fs_reg &chan_index = 4168 nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]; 4169 const unsigned chan_index_bits = ffs(dispatch_width) - 1; 4170 4171 fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD); 4172 if (in_dwords) { 4173 /* In this case, we know the address is aligned to a DWORD and we want 4174 * the final address in DWORDs. 4175 */ 4176 bld.SHL(addr, nir_addr, brw_imm_ud(chan_index_bits - 2)); 4177 bld.OR(addr, addr, chan_index); 4178 } else { 4179 /* This case is substantially more annoying because we have to pay 4180 * attention to those pesky two bottom bits. 4181 */ 4182 fs_reg addr_hi = bld.vgrf(BRW_REGISTER_TYPE_UD); 4183 bld.AND(addr_hi, nir_addr, brw_imm_ud(~0x3u)); 4184 bld.SHL(addr_hi, addr_hi, brw_imm_ud(chan_index_bits)); 4185 fs_reg chan_addr = bld.vgrf(BRW_REGISTER_TYPE_UD); 4186 bld.SHL(chan_addr, chan_index, brw_imm_ud(2)); 4187 bld.AND(addr, nir_addr, brw_imm_ud(0x3u)); 4188 bld.OR(addr, addr, addr_hi); 4189 bld.OR(addr, addr, chan_addr); 4190 } 4191 return addr; 4192} 4193 4194static unsigned 4195choose_oword_block_size_dwords(unsigned dwords) 4196{ 4197 unsigned block; 4198 if (dwords >= 32) { 4199 block = 32; 4200 } else if (dwords >= 16) { 4201 block = 16; 4202 } else { 4203 block = 8; 4204 } 4205 assert(block <= dwords); 4206 return block; 4207} 4208 4209static void 4210increment_a64_address(const fs_builder &bld, fs_reg address, uint32_t v) 4211{ 4212 if (bld.shader->devinfo->has_64bit_int) { 4213 bld.ADD(address, address, brw_imm_ud(v)); 4214 } else { 4215 fs_reg low = retype(address, BRW_REGISTER_TYPE_UD); 4216 fs_reg high = offset(low, bld, 1); 4217 4218 /* Add low and if that overflows, add carry to high.
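 * The low ADD sets the flag through BRW_CONDITIONAL_O on unsigned wrap-around, and the high ADD of 1 is predicated on that flag. For example, low 0xFFFFFFFF / high 0x00000001 plus 4 wraps the low dword to 0x00000003 and bumps the high dword to 0x00000002.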
*/ 4219 bld.ADD(low, low, brw_imm_ud(v))->conditional_mod = BRW_CONDITIONAL_O; 4220 bld.ADD(high, high, brw_imm_ud(0x1))->predicate = BRW_PREDICATE_NORMAL; 4221 } 4222} 4223 4224static fs_reg 4225emit_fence(const fs_builder &bld, enum opcode opcode, 4226 uint8_t sfid, bool commit_enable, uint8_t bti) 4227{ 4228 assert(opcode == SHADER_OPCODE_INTERLOCK || 4229 opcode == SHADER_OPCODE_MEMORY_FENCE); 4230 4231 fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD); 4232 fs_inst *fence = bld.emit(opcode, dst, brw_vec8_grf(0, 0), 4233 brw_imm_ud(commit_enable), 4234 brw_imm_ud(bti)); 4235 fence->sfid = sfid; 4236 return dst; 4237} 4238 4239void 4240fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr) 4241{ 4242 fs_reg dest; 4243 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 4244 dest = get_nir_dest(instr->dest); 4245 4246 switch (instr->intrinsic) { 4247 case nir_intrinsic_image_load: 4248 case nir_intrinsic_image_store: 4249 case nir_intrinsic_image_atomic_add: 4250 case nir_intrinsic_image_atomic_imin: 4251 case nir_intrinsic_image_atomic_umin: 4252 case nir_intrinsic_image_atomic_imax: 4253 case nir_intrinsic_image_atomic_umax: 4254 case nir_intrinsic_image_atomic_and: 4255 case nir_intrinsic_image_atomic_or: 4256 case nir_intrinsic_image_atomic_xor: 4257 case nir_intrinsic_image_atomic_exchange: 4258 case nir_intrinsic_image_atomic_comp_swap: 4259 case nir_intrinsic_bindless_image_load: 4260 case nir_intrinsic_bindless_image_store: 4261 case nir_intrinsic_bindless_image_atomic_add: 4262 case nir_intrinsic_bindless_image_atomic_imin: 4263 case nir_intrinsic_bindless_image_atomic_umin: 4264 case nir_intrinsic_bindless_image_atomic_imax: 4265 case nir_intrinsic_bindless_image_atomic_umax: 4266 case nir_intrinsic_bindless_image_atomic_and: 4267 case nir_intrinsic_bindless_image_atomic_or: 4268 case nir_intrinsic_bindless_image_atomic_xor: 4269 case nir_intrinsic_bindless_image_atomic_exchange: 4270 case nir_intrinsic_bindless_image_atomic_comp_swap: { 4271 /* Get some metadata from the image intrinsic. */ 4272 const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic]; 4273 4274 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4275 4276 switch (instr->intrinsic) { 4277 case nir_intrinsic_image_load: 4278 case nir_intrinsic_image_store: 4279 case nir_intrinsic_image_atomic_add: 4280 case nir_intrinsic_image_atomic_imin: 4281 case nir_intrinsic_image_atomic_umin: 4282 case nir_intrinsic_image_atomic_imax: 4283 case nir_intrinsic_image_atomic_umax: 4284 case nir_intrinsic_image_atomic_and: 4285 case nir_intrinsic_image_atomic_or: 4286 case nir_intrinsic_image_atomic_xor: 4287 case nir_intrinsic_image_atomic_exchange: 4288 case nir_intrinsic_image_atomic_comp_swap: 4289 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4290 get_nir_image_intrinsic_image(bld, instr); 4291 break; 4292 4293 default: 4294 /* Bindless */ 4295 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = 4296 bld.emit_uniformize(get_nir_src(instr->src[0])); 4297 break; 4298 } 4299 4300 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4301 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = 4302 brw_imm_ud(nir_image_intrinsic_coord_components(instr)); 4303 4304 /* Emit an image load, store or atomic op. 
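 * Loads become TYPED_SURFACE_READ, stores become TYPED_SURFACE_WRITE, and the remaining intrinsics become atomics; comp_swap is the five-source case below, whose two data operands are packed into a single payload with LOAD_PAYLOAD.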
*/ 4305 if (instr->intrinsic == nir_intrinsic_image_load || 4306 instr->intrinsic == nir_intrinsic_bindless_image_load) { 4307 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4308 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 4309 fs_inst *inst = 4310 bld.emit(SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL, 4311 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4312 inst->size_written = instr->num_components * dispatch_width * 4; 4313 } else if (instr->intrinsic == nir_intrinsic_image_store || 4314 instr->intrinsic == nir_intrinsic_bindless_image_store) { 4315 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4316 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[3]); 4317 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 4318 bld.emit(SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL, 4319 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4320 } else { 4321 unsigned num_srcs = info->num_srcs; 4322 int op = brw_aop_for_nir_intrinsic(instr); 4323 if (op == BRW_AOP_INC || op == BRW_AOP_DEC) { 4324 assert(num_srcs == 4); 4325 num_srcs = 3; 4326 } 4327 4328 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 4329 4330 fs_reg data; 4331 if (num_srcs >= 4) 4332 data = get_nir_src(instr->src[3]); 4333 if (num_srcs >= 5) { 4334 fs_reg tmp = bld.vgrf(data.type, 2); 4335 fs_reg sources[2] = { data, get_nir_src(instr->src[4]) }; 4336 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 4337 data = tmp; 4338 } 4339 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 4340 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 4341 4342 bld.emit(SHADER_OPCODE_TYPED_ATOMIC_LOGICAL, 4343 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4344 } 4345 break; 4346 } 4347 4348 case nir_intrinsic_image_size: 4349 case nir_intrinsic_bindless_image_size: { 4350 /* Cube image sizes should have previously been lowered to a 2D array */ 4351 assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE); 4352 4353 /* Unlike the [un]typed load and store opcodes, the TXS that this turns 4354 * into will handle the binding table index for us in the generator. 4355 * Incidentally, this means that we can handle bindless with exactly the 4356 * same code. 4357 */ 4358 fs_reg image = retype(get_nir_src_imm(instr->src[0]), 4359 BRW_REGISTER_TYPE_UD); 4360 image = bld.emit_uniformize(image); 4361 4362 assert(nir_src_as_uint(instr->src[1]) == 0); 4363 4364 fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 4365 if (instr->intrinsic == nir_intrinsic_image_size) 4366 srcs[TEX_LOGICAL_SRC_SURFACE] = image; 4367 else 4368 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = image; 4369 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0); 4370 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(0); 4371 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); 4372 4373 /* Since the image size is always uniform, we can just emit a SIMD8 4374 * query instruction and splat the result out.
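 * The exec_all'd SIMD8 group writes the (up to) vec4 answer into tmp once; component(offset(tmp, ubld, c), 0) then broadcasts channel 0 of each returned component to every channel of the destination.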
4375 */ 4376 const fs_builder ubld = bld.exec_all().group(8, 0); 4377 4378 fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); 4379 fs_inst *inst = ubld.emit(SHADER_OPCODE_IMAGE_SIZE_LOGICAL, 4380 tmp, srcs, ARRAY_SIZE(srcs)); 4381 inst->size_written = 4 * REG_SIZE; 4382 4383 for (unsigned c = 0; c < instr->dest.ssa.num_components; ++c) { 4384 bld.MOV(offset(retype(dest, tmp.type), bld, c), 4385 component(offset(tmp, ubld, c), 0)); 4386 } 4387 break; 4388 } 4389 4390 case nir_intrinsic_image_load_raw_intel: { 4391 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4392 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4393 get_nir_image_intrinsic_image(bld, instr); 4394 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4395 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4396 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4397 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 4398 4399 fs_inst *inst = 4400 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 4401 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4402 inst->size_written = instr->num_components * dispatch_width * 4; 4403 break; 4404 } 4405 4406 case nir_intrinsic_image_store_raw_intel: { 4407 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4408 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4409 get_nir_image_intrinsic_image(bld, instr); 4410 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4411 srcs[SURFACE_LOGICAL_SRC_DATA] = get_nir_src(instr->src[2]); 4412 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4413 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4414 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 4415 4416 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 4417 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4418 break; 4419 } 4420 4421 case nir_intrinsic_scoped_barrier: 4422 assert(nir_intrinsic_execution_scope(instr) == NIR_SCOPE_NONE); 4423 FALLTHROUGH; 4424 case nir_intrinsic_group_memory_barrier: 4425 case nir_intrinsic_memory_barrier_shared: 4426 case nir_intrinsic_memory_barrier_buffer: 4427 case nir_intrinsic_memory_barrier_image: 4428 case nir_intrinsic_memory_barrier: 4429 case nir_intrinsic_begin_invocation_interlock: 4430 case nir_intrinsic_end_invocation_interlock: { 4431 bool ugm_fence, slm_fence, tgm_fence, urb_fence; 4432 const enum opcode opcode = 4433 instr->intrinsic == nir_intrinsic_begin_invocation_interlock ? 4434 SHADER_OPCODE_INTERLOCK : SHADER_OPCODE_MEMORY_FENCE; 4435 4436 switch (instr->intrinsic) { 4437 case nir_intrinsic_scoped_barrier: { 4438 nir_variable_mode modes = nir_intrinsic_memory_modes(instr); 4439 ugm_fence = modes & (nir_var_mem_ssbo | nir_var_mem_global); 4440 slm_fence = modes & nir_var_mem_shared; 4441 tgm_fence = modes & nir_var_mem_ssbo; 4442 urb_fence = modes & nir_var_shader_out; 4443 break; 4444 } 4445 4446 case nir_intrinsic_begin_invocation_interlock: 4447 case nir_intrinsic_end_invocation_interlock: 4448 /* For beginInvocationInterlockARB(), we will generate a memory fence 4449 * but with a different opcode so that generator can pick SENDC 4450 * instead of SEND. 4451 * 4452 * For endInvocationInterlockARB(), we need to insert a memory fence which 4453 * stalls in the shader until the memory transactions prior to that 4454 * fence are complete. This ensures that the shader does not end before 4455 * any writes from its critical section have landed. 
Otherwise, you can 4456 * end up with a case where the next invocation on that pixel properly 4457 * stalls for previous FS invocation on its pixel to complete but 4458 * doesn't actually wait for the dataport memory transactions from that 4459 * thread to land before submitting its own. 4460 * 4461 * Handling them here will allow the logic for IVB render cache (see 4462 * below) to be reused. 4463 */ 4464 assert(stage == MESA_SHADER_FRAGMENT); 4465 ugm_fence = tgm_fence = true; 4466 slm_fence = urb_fence = false; 4467 break; 4468 4469 default: 4470 ugm_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared && 4471 instr->intrinsic != nir_intrinsic_memory_barrier_image; 4472 slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || 4473 instr->intrinsic == nir_intrinsic_memory_barrier || 4474 instr->intrinsic == nir_intrinsic_memory_barrier_shared; 4475 tgm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier || 4476 instr->intrinsic == nir_intrinsic_memory_barrier || 4477 instr->intrinsic == nir_intrinsic_memory_barrier_image; 4478 urb_fence = instr->intrinsic == nir_intrinsic_memory_barrier; 4479 break; 4480 } 4481 4482 if (nir->info.shared_size > 0) { 4483 assert(gl_shader_stage_uses_workgroup(stage)); 4484 } else { 4485 slm_fence = false; 4486 } 4487 4488 /* If the workgroup fits in a single HW thread, the messages for SLM are 4489 * processed in-order and the shader itself is already synchronized so 4490 * the memory fence is not necessary. 4491 * 4492 * TODO: Check if applies for many HW threads sharing same Data Port. 4493 */ 4494 if (!nir->info.workgroup_size_variable && 4495 slm_fence && workgroup_size() <= dispatch_width) 4496 slm_fence = false; 4497 4498 if (stage != MESA_SHADER_TESS_CTRL) 4499 urb_fence = false; 4500 4501 unsigned fence_regs_count = 0; 4502 fs_reg fence_regs[3] = {}; 4503 4504 const fs_builder ubld = bld.group(8, 0); 4505 4506 if (devinfo->has_lsc) { 4507 assert(devinfo->verx10 >= 125); 4508 if (ugm_fence) { 4509 fence_regs[fence_regs_count++] = 4510 emit_fence(ubld, opcode, GFX12_SFID_UGM, 4511 true /* commit_enable */, 4512 0 /* bti; ignored for LSC */); 4513 } 4514 4515 if (tgm_fence) { 4516 fence_regs[fence_regs_count++] = 4517 emit_fence(ubld, opcode, GFX12_SFID_TGM, 4518 true /* commit_enable */, 4519 0 /* bti; ignored for LSC */); 4520 } 4521 4522 if (slm_fence) { 4523 assert(opcode == SHADER_OPCODE_MEMORY_FENCE); 4524 fence_regs[fence_regs_count++] = 4525 emit_fence(ubld, opcode, GFX12_SFID_SLM, 4526 true /* commit_enable */, 4527 0 /* BTI; ignored for LSC */); 4528 } 4529 4530 if (urb_fence) { 4531 assert(opcode == SHADER_OPCODE_MEMORY_FENCE); 4532 fence_regs[fence_regs_count++] = 4533 emit_fence(ubld, opcode, BRW_SFID_URB, 4534 true /* commit_enable */, 4535 0 /* BTI; ignored for LSC */); 4536 } 4537 } else if (devinfo->ver >= 11) { 4538 if (tgm_fence || ugm_fence || urb_fence) { 4539 fence_regs[fence_regs_count++] = 4540 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 4541 true /* commit_enable HSD ES # 1404612949 */, 4542 0 /* BTI = 0 means data cache */); 4543 } 4544 4545 if (slm_fence) { 4546 assert(opcode == SHADER_OPCODE_MEMORY_FENCE); 4547 fence_regs[fence_regs_count++] = 4548 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 4549 true /* commit_enable HSD ES # 1404612949 */, 4550 GFX7_BTI_SLM); 4551 } 4552 } else { 4553 /* Prior to Icelake, they're all lumped into a single cache except on 4554 * Ivy Bridge and Bay Trail where typed messages actually go through 4555 * the render cache. 
There, we need both fences because we may 4556 * access storage images as either typed or untyped. 4557 */ 4558 const bool render_fence = tgm_fence && devinfo->verx10 == 70; 4559 4560 const bool commit_enable = render_fence || 4561 instr->intrinsic == nir_intrinsic_end_invocation_interlock; 4562 4563 if (tgm_fence || ugm_fence || slm_fence || urb_fence) { 4564 fence_regs[fence_regs_count++] = 4565 emit_fence(ubld, opcode, GFX7_SFID_DATAPORT_DATA_CACHE, 4566 commit_enable, 0 /* BTI */); 4567 } 4568 4569 if (render_fence) { 4570 fence_regs[fence_regs_count++] = 4571 emit_fence(ubld, opcode, GFX6_SFID_DATAPORT_RENDER_CACHE, 4572 commit_enable, /* bti */ 0); 4573 } 4574 } 4575 4576 assert(fence_regs_count <= ARRAY_SIZE(fence_regs)); 4577 4578 /* There are three cases where we want to insert a stall: 4579 * 4580 * 1. If we're a nir_intrinsic_end_invocation_interlock. This is 4581 * required to ensure that the shader EOT doesn't happen until 4582 * after the fence returns. Otherwise, we might end up with the 4583 * next shader invocation for that pixel not respecting our fence 4584 * because it may happen on a different HW thread. 4585 * 4586 * 2. If we have multiple fences. This is required to ensure that 4587 * they all complete and nothing gets weirdly out-of-order. 4588 * 4589 * 3. If we have no fences. In this case, we need at least a 4590 * scheduling barrier to keep the compiler from moving things 4591 * around in an invalid way. 4592 */ 4593 if (instr->intrinsic == nir_intrinsic_end_invocation_interlock || 4594 fence_regs_count != 1) { 4595 ubld.exec_all().group(1, 0).emit( 4596 FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), 4597 fence_regs, fence_regs_count); 4598 } 4599 4600 break; 4601 } 4602 4603 case nir_intrinsic_memory_barrier_tcs_patch: 4604 break; 4605 4606 case nir_intrinsic_shader_clock: { 4607 /* We cannot do anything if there is an event, so ignore it for now */ 4608 const fs_reg shader_clock = get_timestamp(bld); 4609 const fs_reg srcs[] = { component(shader_clock, 0), 4610 component(shader_clock, 1) }; 4611 bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0); 4612 break; 4613 } 4614 4615 case nir_intrinsic_image_samples: 4616 /* The driver does not support multi-sampled images. */ 4617 bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1)); 4618 break; 4619 4620 case nir_intrinsic_load_reloc_const_intel: { 4621 uint32_t id = nir_intrinsic_param_idx(instr); 4622 bld.emit(SHADER_OPCODE_MOV_RELOC_IMM, 4623 dest, brw_imm_ud(id)); 4624 break; 4625 } 4626 4627 case nir_intrinsic_load_uniform: { 4628 /* Offsets are in bytes but they should always be aligned to 4629 * the type size 4630 */ 4631 assert(instr->const_index[0] % 4 == 0 || 4632 instr->const_index[0] % type_sz(dest.type) == 0); 4633 4634 fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type); 4635 4636 if (nir_src_is_const(instr->src[0])) { 4637 unsigned load_offset = nir_src_as_uint(instr->src[0]); 4638 assert(load_offset % type_sz(dest.type) == 0); 4639 /* For 16-bit types we add const_index[0] modulo 4 to the 4640 * offset so we can access elements that are not 32-bit aligned 4641 */ 4642 src.offset = load_offset + instr->const_index[0] % 4; 4643 4644 for (unsigned j = 0; j < instr->num_components; j++) { 4645 bld.MOV(offset(dest, bld, j), offset(src, bld, j)); 4646 } 4647 } else { 4648 fs_reg indirect = retype(get_nir_src(instr->src[0]), 4649 BRW_REGISTER_TYPE_UD); 4650 4651 /* We need to pass a size to the MOV_INDIRECT but we don't want it to 4652 * go past the end of the uniform.
In order to keep the n'th 4653 * component from running past, we subtract off the size of all but 4654 * one component of the vector. 4655 */ 4656 assert(instr->const_index[1] >= 4657 instr->num_components * (int) type_sz(dest.type)); 4658 unsigned read_size = instr->const_index[1] - 4659 (instr->num_components - 1) * type_sz(dest.type); 4660 4661 bool supports_64bit_indirects = 4662 !devinfo->is_cherryview && !intel_device_info_is_9lp(devinfo); 4663 4664 if (type_sz(dest.type) != 8 || supports_64bit_indirects) { 4665 for (unsigned j = 0; j < instr->num_components; j++) { 4666 bld.emit(SHADER_OPCODE_MOV_INDIRECT, 4667 offset(dest, bld, j), offset(src, bld, j), 4668 indirect, brw_imm_ud(read_size)); 4669 } 4670 } else { 4671 const unsigned num_mov_indirects = 4672 type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD); 4673 /* We read a little bit less per MOV INDIRECT, as they are now 4674 * 32-bit ones instead of 64-bit, so adjust read_size accordingly. 4675 */ 4676 const unsigned read_size_32bit = read_size - 4677 (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD); 4678 for (unsigned j = 0; j < instr->num_components; j++) { 4679 for (unsigned i = 0; i < num_mov_indirects; i++) { 4680 bld.emit(SHADER_OPCODE_MOV_INDIRECT, 4681 subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i), 4682 subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i), 4683 indirect, brw_imm_ud(read_size_32bit)); 4684 } 4685 } 4686 } 4687 } 4688 break; 4689 } 4690 4691 case nir_intrinsic_load_ubo: { 4692 fs_reg surf_index; 4693 if (nir_src_is_const(instr->src[0])) { 4694 const unsigned index = stage_prog_data->binding_table.ubo_start + 4695 nir_src_as_uint(instr->src[0]); 4696 surf_index = brw_imm_ud(index); 4697 } else { 4698 /* The block index is not a constant. Evaluate the index expression 4699 * per-channel and add the base UBO index; we have to select a value 4700 * from any live channel. 4701 */ 4702 surf_index = vgrf(glsl_type::uint_type); 4703 bld.ADD(surf_index, get_nir_src(instr->src[0]), 4704 brw_imm_ud(stage_prog_data->binding_table.ubo_start)); 4705 surf_index = bld.emit_uniformize(surf_index); 4706 } 4707 4708 if (!nir_src_is_const(instr->src[1])) { 4709 fs_reg base_offset = retype(get_nir_src(instr->src[1]), 4710 BRW_REGISTER_TYPE_UD); 4711 4712 for (int i = 0; i < instr->num_components; i++) 4713 VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index, 4714 base_offset, i * type_sz(dest.type), 4715 nir_dest_bit_size(instr->dest) / 8); 4716 4717 prog_data->has_ubo_pull = true; 4718 } else { 4719 /* Even if we are loading doubles, a pull constant load will load 4720 * a 32-bit vec4, so we should only reserve vgrf space for that. If we 4721 * need to load a full dvec4 we will have to emit 2 loads. This is 4722 * similar to demote_pull_constants(), except that in that case we 4723 * see individual accesses to each component of the vector and then 4724 * we let CSE deal with duplicate loads. Here we see a vector access 4725 * and we have to split it if necessary.
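 * For example, a dvec4 (type_size == 8) at byte offset 48 is split in two: the first block-aligned load yields (64 - 48) / 8 == 2 usable components, and a second load of the cacheline at offset 64 supplies the remaining two.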
4726 */ 4727 const unsigned type_size = type_sz(dest.type); 4728 const unsigned load_offset = nir_src_as_uint(instr->src[1]); 4729 4730 /* See if we've selected this as a push constant candidate */ 4731 if (nir_src_is_const(instr->src[0])) { 4732 const unsigned ubo_block = nir_src_as_uint(instr->src[0]); 4733 const unsigned offset_256b = load_offset / 32; 4734 4735 fs_reg push_reg; 4736 for (int i = 0; i < 4; i++) { 4737 const struct brw_ubo_range *range = &prog_data->ubo_ranges[i]; 4738 if (range->block == ubo_block && 4739 offset_256b >= range->start && 4740 offset_256b < range->start + range->length) { 4741 4742 push_reg = fs_reg(UNIFORM, UBO_START + i, dest.type); 4743 push_reg.offset = load_offset - 32 * range->start; 4744 break; 4745 } 4746 } 4747 4748 if (push_reg.file != BAD_FILE) { 4749 for (unsigned i = 0; i < instr->num_components; i++) { 4750 bld.MOV(offset(dest, bld, i), 4751 byte_offset(push_reg, i * type_size)); 4752 } 4753 break; 4754 } 4755 } 4756 4757 prog_data->has_ubo_pull = true; 4758 4759 const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ 4760 const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0); 4761 const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4762 4763 for (unsigned c = 0; c < instr->num_components;) { 4764 const unsigned base = load_offset + c * type_size; 4765 /* Number of usable components in the next block-aligned load. */ 4766 const unsigned count = MIN2(instr->num_components - c, 4767 (block_sz - base % block_sz) / type_size); 4768 4769 ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 4770 packed_consts, surf_index, 4771 brw_imm_ud(base & ~(block_sz - 1))); 4772 4773 const fs_reg consts = 4774 retype(byte_offset(packed_consts, base & (block_sz - 1)), 4775 dest.type); 4776 4777 for (unsigned d = 0; d < count; d++) 4778 bld.MOV(offset(dest, bld, c + d), component(consts, d)); 4779 4780 c += count; 4781 } 4782 } 4783 break; 4784 } 4785 4786 case nir_intrinsic_load_global: 4787 case nir_intrinsic_load_global_constant: { 4788 assert(devinfo->ver >= 8); 4789 4790 assert(nir_dest_bit_size(instr->dest) <= 32); 4791 assert(nir_intrinsic_align(instr) > 0); 4792 if (nir_dest_bit_size(instr->dest) == 32 && 4793 nir_intrinsic_align(instr) >= 4) { 4794 assert(nir_dest_num_components(instr->dest) <= 4); 4795 fs_inst *inst = bld.emit(SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL, 4796 dest, 4797 get_nir_src(instr->src[0]), /* Address */ 4798 fs_reg(), /* No source data */ 4799 brw_imm_ud(instr->num_components)); 4800 inst->size_written = instr->num_components * 4801 inst->dst.component_size(inst->exec_size); 4802 } else { 4803 const unsigned bit_size = nir_dest_bit_size(instr->dest); 4804 assert(nir_dest_num_components(instr->dest) == 1); 4805 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4806 bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL, 4807 tmp, 4808 get_nir_src(instr->src[0]), /* Address */ 4809 fs_reg(), /* No source data */ 4810 brw_imm_ud(bit_size)); 4811 bld.MOV(dest, subscript(tmp, dest.type, 0)); 4812 } 4813 break; 4814 } 4815 4816 case nir_intrinsic_store_global: 4817 assert(devinfo->ver >= 8); 4818 4819 assert(nir_src_bit_size(instr->src[0]) <= 32); 4820 assert(nir_intrinsic_write_mask(instr) == 4821 (1u << instr->num_components) - 1); 4822 assert(nir_intrinsic_align(instr) > 0); 4823 if (nir_src_bit_size(instr->src[0]) == 32 && 4824 nir_intrinsic_align(instr) >= 4) { 4825 assert(nir_src_num_components(instr->src[0]) <= 4); 4826 bld.emit(SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL, 4827 fs_reg(), 4828 
get_nir_src(instr->src[1]), /* Address */ 4829 get_nir_src(instr->src[0]), /* Data */ 4830 brw_imm_ud(instr->num_components)); 4831 } else { 4832 assert(nir_src_num_components(instr->src[0]) == 1); 4833 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 4834 brw_reg_type data_type = 4835 brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4836 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 4837 bld.MOV(tmp, retype(get_nir_src(instr->src[0]), data_type)); 4838 bld.emit(SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL, 4839 fs_reg(), 4840 get_nir_src(instr->src[1]), /* Address */ 4841 tmp, /* Data */ 4842 brw_imm_ud(nir_src_bit_size(instr->src[0]))); 4843 } 4844 break; 4845 4846 case nir_intrinsic_global_atomic_add: 4847 case nir_intrinsic_global_atomic_imin: 4848 case nir_intrinsic_global_atomic_umin: 4849 case nir_intrinsic_global_atomic_imax: 4850 case nir_intrinsic_global_atomic_umax: 4851 case nir_intrinsic_global_atomic_and: 4852 case nir_intrinsic_global_atomic_or: 4853 case nir_intrinsic_global_atomic_xor: 4854 case nir_intrinsic_global_atomic_exchange: 4855 case nir_intrinsic_global_atomic_comp_swap: 4856 nir_emit_global_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); 4857 break; 4858 case nir_intrinsic_global_atomic_fadd: 4859 case nir_intrinsic_global_atomic_fmin: 4860 case nir_intrinsic_global_atomic_fmax: 4861 case nir_intrinsic_global_atomic_fcomp_swap: 4862 nir_emit_global_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); 4863 break; 4864 4865 case nir_intrinsic_load_global_const_block_intel: { 4866 assert(nir_dest_bit_size(instr->dest) == 32); 4867 assert(instr->num_components == 8 || instr->num_components == 16); 4868 4869 const fs_builder ubld = bld.exec_all().group(instr->num_components, 0); 4870 fs_reg load_val; 4871 4872 bool is_pred_const = nir_src_is_const(instr->src[1]); 4873 if (is_pred_const && nir_src_as_uint(instr->src[1]) == 0) { 4874 /* In this case, we don't want the UBO load at all. We really 4875 * shouldn't get here but it's possible. 4876 */ 4877 load_val = brw_imm_ud(0); 4878 } else { 4879 /* The uniform process may stomp the flag so do this first */ 4880 fs_reg addr = bld.emit_uniformize(get_nir_src(instr->src[0])); 4881 4882 load_val = ubld.vgrf(BRW_REGISTER_TYPE_UD); 4883 4884 /* If the predicate is constant and we got here, then it's non-zero 4885 * and we don't need the predicate at all. 4886 */ 4887 if (!is_pred_const) { 4888 /* Load the predicate */ 4889 fs_reg pred = bld.emit_uniformize(get_nir_src(instr->src[1])); 4890 fs_inst *mov = ubld.MOV(bld.null_reg_d(), pred); 4891 mov->conditional_mod = BRW_CONDITIONAL_NZ; 4892 4893 /* Stomp the destination with 0 if we're OOB */ 4894 mov = ubld.MOV(load_val, brw_imm_ud(0)); 4895 mov->predicate = BRW_PREDICATE_NORMAL; 4896 mov->predicate_inverse = true; 4897 } 4898 4899 fs_inst *load = ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL, 4900 load_val, addr, 4901 fs_reg(), /* No source data */ 4902 brw_imm_ud(instr->num_components)); 4903 4904 if (!is_pred_const) 4905 load->predicate = BRW_PREDICATE_NORMAL; 4906 } 4907 4908 /* From the HW perspective, we just did a single SIMD16 instruction 4909 * which loaded a dword in each SIMD channel. From NIR's perspective, 4910 * this instruction returns a vec16. Any users of this data in the 4911 * back-end will expect a vec16 per SIMD channel so we have to emit a 4912 * pile of MOVs to resolve this discrepancy. Fortunately, copy-prop 4913 * will generally clean them up for us. 
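 * Concretely, each MOV below broadcasts component i of the block-load result into every channel of the destination's i'th component, so a vec8 load costs eight broadcast MOVs regardless of dispatch width.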
4914 */ 4915 for (unsigned i = 0; i < instr->num_components; i++) { 4916 bld.MOV(retype(offset(dest, bld, i), BRW_REGISTER_TYPE_UD), 4917 component(load_val, i)); 4918 } 4919 break; 4920 } 4921 4922 case nir_intrinsic_load_ssbo: { 4923 assert(devinfo->ver >= 7); 4924 4925 const unsigned bit_size = nir_dest_bit_size(instr->dest); 4926 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4927 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4928 get_nir_ssbo_intrinsic_index(bld, instr); 4929 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 4930 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4931 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 4932 4933 /* Make dest unsigned because that's what the temporary will be */ 4934 dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4935 4936 /* Read the vector */ 4937 assert(nir_dest_bit_size(instr->dest) <= 32); 4938 assert(nir_intrinsic_align(instr) > 0); 4939 if (nir_dest_bit_size(instr->dest) == 32 && 4940 nir_intrinsic_align(instr) >= 4) { 4941 assert(nir_dest_num_components(instr->dest) <= 4); 4942 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4943 fs_inst *inst = 4944 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL, 4945 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 4946 inst->size_written = instr->num_components * dispatch_width * 4; 4947 } else { 4948 assert(nir_dest_num_components(instr->dest) == 1); 4949 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 4950 4951 fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 4952 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 4953 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 4954 bld.MOV(dest, subscript(read_result, dest.type, 0)); 4955 } 4956 break; 4957 } 4958 4959 case nir_intrinsic_store_ssbo: { 4960 assert(devinfo->ver >= 7); 4961 4962 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 4963 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 4964 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 4965 get_nir_ssbo_intrinsic_index(bld, instr); 4966 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[2]); 4967 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 4968 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 4969 4970 fs_reg data = get_nir_src(instr->src[0]); 4971 data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 4972 4973 assert(nir_src_bit_size(instr->src[0]) <= 32); 4974 assert(nir_intrinsic_write_mask(instr) == 4975 (1u << instr->num_components) - 1); 4976 assert(nir_intrinsic_align(instr) > 0); 4977 if (nir_src_bit_size(instr->src[0]) == 32 && 4978 nir_intrinsic_align(instr) >= 4) { 4979 assert(nir_src_num_components(instr->src[0]) <= 4); 4980 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 4981 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(instr->num_components); 4982 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 4983 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4984 } else { 4985 assert(nir_src_num_components(instr->src[0]) == 1); 4986 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 4987 4988 srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD); 4989 bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data); 4990 4991 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL, 4992 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 4993 } 4994 break; 4995 } 4996 4997 case nir_intrinsic_store_output: { 4998 assert(nir_src_bit_size(instr->src[0]) == 32); 4999 fs_reg src = get_nir_src(instr->src[0]); 5000 5001 unsigned store_offset = nir_src_as_uint(instr->src[1]); 5002 unsigned num_components = 
instr->num_components; 5003 unsigned first_component = nir_intrinsic_component(instr); 5004 5005 fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld, 5006 4 * store_offset), src.type); 5007 for (unsigned j = 0; j < num_components; j++) { 5008 bld.MOV(offset(new_dest, bld, j + first_component), 5009 offset(src, bld, j)); 5010 } 5011 break; 5012 } 5013 5014 case nir_intrinsic_ssbo_atomic_add: 5015 case nir_intrinsic_ssbo_atomic_imin: 5016 case nir_intrinsic_ssbo_atomic_umin: 5017 case nir_intrinsic_ssbo_atomic_imax: 5018 case nir_intrinsic_ssbo_atomic_umax: 5019 case nir_intrinsic_ssbo_atomic_and: 5020 case nir_intrinsic_ssbo_atomic_or: 5021 case nir_intrinsic_ssbo_atomic_xor: 5022 case nir_intrinsic_ssbo_atomic_exchange: 5023 case nir_intrinsic_ssbo_atomic_comp_swap: 5024 nir_emit_ssbo_atomic(bld, brw_aop_for_nir_intrinsic(instr), instr); 5025 break; 5026 case nir_intrinsic_ssbo_atomic_fadd: 5027 case nir_intrinsic_ssbo_atomic_fmin: 5028 case nir_intrinsic_ssbo_atomic_fmax: 5029 case nir_intrinsic_ssbo_atomic_fcomp_swap: 5030 nir_emit_ssbo_atomic_float(bld, brw_aop_for_nir_intrinsic(instr), instr); 5031 break; 5032 5033 case nir_intrinsic_get_ssbo_size: { 5034 assert(nir_src_num_components(instr->src[0]) == 1); 5035 unsigned ssbo_index = nir_src_is_const(instr->src[0]) ? 5036 nir_src_as_uint(instr->src[0]) : 0; 5037 5038 /* A resinfo's sampler message is used to get the buffer size. The 5039 * SIMD8's writeback message consists of four registers and SIMD16's 5040 * writeback message consists of 8 destination registers (two per 5041 * component). Because we are only interested in the first channel of 5042 * the first returned component, where resinfo returns the buffer size 5043 * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of 5044 * the dispatch width. 5045 */ 5046 const fs_builder ubld = bld.exec_all().group(8, 0); 5047 fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD); 5048 fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4); 5049 5050 /* Set LOD = 0 */ 5051 ubld.MOV(src_payload, brw_imm_d(0)); 5052 5053 const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index; 5054 fs_inst *inst = ubld.emit(SHADER_OPCODE_GET_BUFFER_SIZE, ret_payload, 5055 src_payload, brw_imm_ud(index)); 5056 inst->header_size = 0; 5057 inst->mlen = 1; 5058 inst->size_written = 4 * REG_SIZE; 5059 5060 /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting: 5061 * 5062 * "Out-of-bounds checking is always performed at a DWord granularity. If 5063 * any part of the DWord is out-of-bounds then the whole DWord is 5064 * considered out-of-bounds." 5065 * 5066 * This implies that types with size smaller than 4 bytes need to be 5067 * padded if they don't complete the last dword of the buffer. But as we 5068 * need to maintain the original size we need to reverse the padding 5069 * calculation to return the correct size to know the number of elements 5070 * of an unsized array.
      fs_reg size_aligned4 = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_reg size_padding = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_reg buffer_size = ubld.vgrf(BRW_REGISTER_TYPE_UD);

      ubld.AND(size_padding, ret_payload, brw_imm_ud(3));
      ubld.AND(size_aligned4, ret_payload, brw_imm_ud(~3));
      ubld.ADD(buffer_size, size_aligned4, negate(size_padding));

      bld.MOV(retype(dest, ret_payload.type), component(buffer_size, 0));
      break;
   }

   case nir_intrinsic_load_scratch: {
      assert(devinfo->ver >= 7);

      assert(nir_dest_num_components(instr->dest) == 1);
      const unsigned bit_size = nir_dest_bit_size(instr->dest);
      fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];

      if (devinfo->verx10 >= 125) {
         const fs_builder ubld = bld.exec_all().group(1, 0);
         fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0);
         ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(~0x3ffu));
         srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle;
      } else if (devinfo->ver >= 8) {
         srcs[SURFACE_LOGICAL_SRC_SURFACE] =
            brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT);
      } else {
         srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS);
      }

      srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
      srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size);
      srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0);
      const fs_reg nir_addr = get_nir_src(instr->src[0]);

      /* Make dest unsigned because that's what the temporary will be */
      dest.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD);

      /* Read the vector */
      assert(nir_dest_num_components(instr->dest) == 1);
      assert(nir_dest_bit_size(instr->dest) <= 32);
      assert(nir_intrinsic_align(instr) > 0);
      if (devinfo->verx10 >= 125) {
         assert(nir_dest_bit_size(instr->dest) == 32 &&
                nir_intrinsic_align(instr) >= 4);

         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
            swizzle_nir_scratch_addr(bld, nir_addr, false);
         srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1);

         bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
                  dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
      } else if (nir_dest_bit_size(instr->dest) == 32 &&
                 nir_intrinsic_align(instr) >= 4) {
         /* The offset for a DWORD scattered message is in dwords.
*/ 5137 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5138 swizzle_nir_scratch_addr(bld, nir_addr, true); 5139 5140 bld.emit(SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL, 5141 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5142 } else { 5143 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5144 swizzle_nir_scratch_addr(bld, nir_addr, false); 5145 5146 fs_reg read_result = bld.vgrf(BRW_REGISTER_TYPE_UD); 5147 bld.emit(SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL, 5148 read_result, srcs, SURFACE_LOGICAL_NUM_SRCS); 5149 bld.MOV(dest, read_result); 5150 } 5151 break; 5152 } 5153 5154 case nir_intrinsic_store_scratch: { 5155 assert(devinfo->ver >= 7); 5156 5157 assert(nir_src_num_components(instr->src[0]) == 1); 5158 const unsigned bit_size = nir_src_bit_size(instr->src[0]); 5159 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5160 5161 if (devinfo->verx10 >= 125) { 5162 const fs_builder ubld = bld.exec_all().group(1, 0); 5163 fs_reg handle = component(ubld.vgrf(BRW_REGISTER_TYPE_UD), 0); 5164 ubld.AND(handle, retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD), 5165 brw_imm_ud(~0x3ffu)); 5166 srcs[SURFACE_LOGICAL_SRC_SURFACE_HANDLE] = handle; 5167 } else if (devinfo->ver >= 8) { 5168 srcs[SURFACE_LOGICAL_SRC_SURFACE] = 5169 brw_imm_ud(GFX8_BTI_STATELESS_NON_COHERENT); 5170 } else { 5171 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(BRW_BTI_STATELESS); 5172 } 5173 5174 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5175 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(bit_size); 5176 /** 5177 * While this instruction has side-effects, it should not be predicated 5178 * on sample mask, because otherwise fs helper invocations would 5179 * load undefined values from scratch memory. And scratch memory 5180 * load-stores are produced from operations without side-effects, thus 5181 * they should not have different behaviour in the helper invocations. 5182 */ 5183 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(0); 5184 const fs_reg nir_addr = get_nir_src(instr->src[1]); 5185 5186 fs_reg data = get_nir_src(instr->src[0]); 5187 data.type = brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_UD); 5188 5189 assert(nir_src_num_components(instr->src[0]) == 1); 5190 assert(nir_src_bit_size(instr->src[0]) <= 32); 5191 assert(nir_intrinsic_write_mask(instr) == 1); 5192 assert(nir_intrinsic_align(instr) > 0); 5193 if (devinfo->verx10 >= 125) { 5194 assert(nir_src_bit_size(instr->src[0]) == 32 && 5195 nir_intrinsic_align(instr) >= 4); 5196 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5197 5198 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5199 swizzle_nir_scratch_addr(bld, nir_addr, false); 5200 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(1); 5201 5202 bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL, 5203 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5204 } else if (nir_src_bit_size(instr->src[0]) == 32 && 5205 nir_intrinsic_align(instr) >= 4) { 5206 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5207 5208 /* The offset for a DWORD scattered message is in dwords. 
 */
         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
            swizzle_nir_scratch_addr(bld, nir_addr, true);

         bld.emit(SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL,
                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
      } else {
         srcs[SURFACE_LOGICAL_SRC_DATA] = bld.vgrf(BRW_REGISTER_TYPE_UD);
         bld.MOV(srcs[SURFACE_LOGICAL_SRC_DATA], data);

         srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
            swizzle_nir_scratch_addr(bld, nir_addr, false);

         bld.emit(SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
                  fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS);
      }
      break;
   }

   case nir_intrinsic_load_subgroup_size:
      /* This should only happen for fragment shaders because every other
       * case is lowered in NIR so we can optimize on it.
       */
      assert(stage == MESA_SHADER_FRAGMENT);
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(dispatch_width));
      break;

   case nir_intrinsic_load_subgroup_invocation:
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
              nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION]);
      break;

   case nir_intrinsic_load_subgroup_eq_mask:
   case nir_intrinsic_load_subgroup_ge_mask:
   case nir_intrinsic_load_subgroup_gt_mask:
   case nir_intrinsic_load_subgroup_le_mask:
   case nir_intrinsic_load_subgroup_lt_mask:
      unreachable("not reached");

   case nir_intrinsic_vote_any: {
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag with
       * the identity value for the logical operation.
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(0));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0));
      }
      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0),
              BRW_CONDITIONAL_NZ);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half. Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ANY8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ANY16H :
                                           BRW_PREDICATE_ALIGN1_ANY32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }
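   /* A minimal scalar model of the vote_any sequence above (illustrative
    * only; "val" and "live" are hypothetical stand-ins for the per-channel
    * source and the execution mask, not real compiler state):
    *
    *    uint32_t flag = 0;                  // identity for OR / "any"
    *    for (unsigned c = 0; c < width; c++)
    *       if (live & (1u << c))
    *          flag |= (val[c] != 0) << c;   // the CMP.nz flag write
    *    bool any = flag != 0;               // the ANY8H/16H/32H predicate
    *
    * vote_all and vote_feq/ieq below have the same shape, seeded with the
    * AND identity (all ones) so that disabled channels can never flip the
    * result.
    */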
   case nir_intrinsic_vote_all: {
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag with
       * the identity value for the logical operation.
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(0xffffffff));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
      }
      bld.CMP(bld.null_reg_d(), get_nir_src(instr->src[0]), brw_imm_d(0),
              BRW_CONDITIONAL_NZ);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half. Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
                                           BRW_PREDICATE_ALIGN1_ALL32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }
   case nir_intrinsic_vote_feq:
   case nir_intrinsic_vote_ieq: {
      fs_reg value = get_nir_src(instr->src[0]);
      if (instr->intrinsic == nir_intrinsic_vote_feq) {
         const unsigned bit_size = nir_src_bit_size(instr->src[0]);
         value.type = bit_size == 8 ? BRW_REGISTER_TYPE_B :
            brw_reg_type_from_bit_size(bit_size, BRW_REGISTER_TYPE_F);
      }

      fs_reg uniformized = bld.emit_uniformize(value);
      const fs_builder ubld = bld.exec_all().group(1, 0);

      /* The any/all predicates do not consider channel enables. To prevent
       * dead channels from affecting the result, we initialize the flag with
       * the identity value for the logical operation.
       */
      if (dispatch_width == 32) {
         /* For SIMD32, we use a UD type so we fill both f0.0 and f0.1. */
         ubld.MOV(retype(brw_flag_reg(0, 0), BRW_REGISTER_TYPE_UD),
                  brw_imm_ud(0xffffffff));
      } else {
         ubld.MOV(brw_flag_reg(0, 0), brw_imm_uw(0xffff));
      }
      bld.CMP(bld.null_reg_d(), value, uniformized, BRW_CONDITIONAL_Z);

      /* For some reason, the any/all predicates don't work properly with
       * SIMD32. In particular, it appears that a SEL with a QtrCtrl of 2H
       * doesn't read the correct subset of the flag register and you end up
       * getting garbage in the second half. Work around this by using a pair
       * of 1-wide MOVs and scattering the result.
       */
      fs_reg res1 = ubld.vgrf(BRW_REGISTER_TYPE_D);
      ubld.MOV(res1, brw_imm_d(0));
      set_predicate(dispatch_width == 8  ? BRW_PREDICATE_ALIGN1_ALL8H :
                    dispatch_width == 16 ? BRW_PREDICATE_ALIGN1_ALL16H :
                                           BRW_PREDICATE_ALIGN1_ALL32H,
                    ubld.MOV(res1, brw_imm_d(-1)));

      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), component(res1, 0));
      break;
   }

   case nir_intrinsic_ballot: {
      const fs_reg value = retype(get_nir_src(instr->src[0]),
                                  BRW_REGISTER_TYPE_UD);
      struct brw_reg flag = brw_flag_reg(0, 0);
      /* FIXME: For SIMD32 programs, this causes us to stomp on f0.1 as well
       * as f0.0. This is a problem for fragment programs as we currently use
       * f0.1 for discards. Fortunately, we don't support SIMD32 fragment
       * programs yet so this isn't a problem. When we do, something will
       * have to change.
5362 */ 5363 if (dispatch_width == 32) 5364 flag.type = BRW_REGISTER_TYPE_UD; 5365 5366 bld.exec_all().group(1, 0).MOV(flag, brw_imm_ud(0u)); 5367 bld.CMP(bld.null_reg_ud(), value, brw_imm_ud(0u), BRW_CONDITIONAL_NZ); 5368 5369 if (instr->dest.ssa.bit_size > 32) { 5370 dest.type = BRW_REGISTER_TYPE_UQ; 5371 } else { 5372 dest.type = BRW_REGISTER_TYPE_UD; 5373 } 5374 bld.MOV(dest, flag); 5375 break; 5376 } 5377 5378 case nir_intrinsic_read_invocation: { 5379 const fs_reg value = get_nir_src(instr->src[0]); 5380 const fs_reg invocation = get_nir_src(instr->src[1]); 5381 fs_reg tmp = bld.vgrf(value.type); 5382 5383 bld.exec_all().emit(SHADER_OPCODE_BROADCAST, tmp, value, 5384 bld.emit_uniformize(invocation)); 5385 5386 bld.MOV(retype(dest, value.type), fs_reg(component(tmp, 0))); 5387 break; 5388 } 5389 5390 case nir_intrinsic_read_first_invocation: { 5391 const fs_reg value = get_nir_src(instr->src[0]); 5392 bld.MOV(retype(dest, value.type), bld.emit_uniformize(value)); 5393 break; 5394 } 5395 5396 case nir_intrinsic_shuffle: { 5397 const fs_reg value = get_nir_src(instr->src[0]); 5398 const fs_reg index = get_nir_src(instr->src[1]); 5399 5400 bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, index); 5401 break; 5402 } 5403 5404 case nir_intrinsic_first_invocation: { 5405 fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); 5406 bld.exec_all().emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, tmp); 5407 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 5408 fs_reg(component(tmp, 0))); 5409 break; 5410 } 5411 5412 case nir_intrinsic_quad_broadcast: { 5413 const fs_reg value = get_nir_src(instr->src[0]); 5414 const unsigned index = nir_src_as_uint(instr->src[1]); 5415 5416 bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, retype(dest, value.type), 5417 value, brw_imm_ud(index), brw_imm_ud(4)); 5418 break; 5419 } 5420 5421 case nir_intrinsic_quad_swap_horizontal: { 5422 const fs_reg value = get_nir_src(instr->src[0]); 5423 const fs_reg tmp = bld.vgrf(value.type); 5424 if (devinfo->ver <= 7) { 5425 /* The hardware doesn't seem to support these crazy regions with 5426 * compressed instructions on gfx7 and earlier so we fall back to 5427 * using quad swizzles. Fortunately, we don't support 64-bit 5428 * anything in Vulkan on gfx7. 
 */
         assert(nir_src_bit_size(instr->src[0]) == 32);
         const fs_builder ubld = bld.exec_all();
         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
                   brw_imm_ud(BRW_SWIZZLE4(1,0,3,2)));
      } else {
         const fs_builder ubld = bld.exec_all().group(dispatch_width / 2, 0);

         const fs_reg src_left = horiz_stride(value, 2);
         const fs_reg src_right = horiz_stride(horiz_offset(value, 1), 2);
         const fs_reg tmp_left = horiz_stride(tmp, 2);
         const fs_reg tmp_right = horiz_stride(horiz_offset(tmp, 1), 2);

         ubld.MOV(tmp_left, src_right);
         ubld.MOV(tmp_right, src_left);
      }
      bld.MOV(retype(dest, value.type), tmp);
      break;
   }

   case nir_intrinsic_quad_swap_vertical: {
      const fs_reg value = get_nir_src(instr->src[0]);
      if (nir_src_bit_size(instr->src[0]) == 32) {
         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
         const fs_reg tmp = bld.vgrf(value.type);
         const fs_builder ubld = bld.exec_all();
         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
                   brw_imm_ud(BRW_SWIZZLE4(2,3,0,1)));
         bld.MOV(retype(dest, value.type), tmp);
      } else {
         /* For larger data types, we have to either emit dispatch_width many
          * MOVs or else fall back to doing indirects.
          */
         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
                 brw_imm_w(0x2));
         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
      }
      break;
   }

   case nir_intrinsic_quad_swap_diagonal: {
      const fs_reg value = get_nir_src(instr->src[0]);
      if (nir_src_bit_size(instr->src[0]) == 32) {
         /* For 32-bit, we can use a SIMD4x2 instruction to do this easily */
         const fs_reg tmp = bld.vgrf(value.type);
         const fs_builder ubld = bld.exec_all();
         ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp, value,
                   brw_imm_ud(BRW_SWIZZLE4(3,2,1,0)));
         bld.MOV(retype(dest, value.type), tmp);
      } else {
         /* For larger data types, we have to either emit dispatch_width many
          * MOVs or else fall back to doing indirects.
          */
         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
         bld.XOR(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
                 brw_imm_w(0x3));
         bld.emit(SHADER_OPCODE_SHUFFLE, retype(dest, value.type), value, idx);
      }
      break;
   }

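   /* Summary of the three quad swaps above on one quad {a, b, c, d} (lane
    * order illustrative):
    *
    *    horizontal: {b, a, d, c}   swizzle (1,0,3,2), index XOR 0x1
    *    vertical:   {c, d, a, b}   swizzle (2,3,0,1), index XOR 0x2
    *    diagonal:   {d, c, b, a}   swizzle (3,2,1,0), index XOR 0x3
    *
    * which is why the non-32-bit fallbacks can implement the vertical and
    * diagonal swaps by XORing the subgroup invocation index and going
    * through SHADER_OPCODE_SHUFFLE.
    */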
   case nir_intrinsic_reduce: {
      fs_reg src = get_nir_src(instr->src[0]);
      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);
      unsigned cluster_size = nir_intrinsic_cluster_size(instr);
      if (cluster_size == 0 || cluster_size > dispatch_width)
         cluster_size = dispatch_width;

      /* Figure out the source type */
      src.type = brw_type_for_nir_type(devinfo,
         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
                        nir_src_bit_size(instr->src[0])));

      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
      opcode brw_op = brw_op_for_nir_reduction_op(redop);
      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);

      /* Set up a register for all of our scratching around and initialize it
       * to the reduction operation's identity value.
       */
      fs_reg scan = bld.vgrf(src.type);
      bld.exec_all().emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);

      bld.emit_scan(brw_op, scan, cluster_size, cond_mod);

      dest.type = src.type;
      if (cluster_size * type_sz(src.type) >= REG_SIZE * 2) {
         /* In this case, the CLUSTER_BROADCAST instruction isn't needed
          * because the distance between clusters is at least 2 GRFs, so we
          * don't need its weird striding and can just do regular MOVs.
          */
         assert((cluster_size * type_sz(src.type)) % (REG_SIZE * 2) == 0);
         const unsigned groups =
            (dispatch_width * type_sz(src.type)) / (REG_SIZE * 2);
         const unsigned group_size = dispatch_width / groups;
         for (unsigned i = 0; i < groups; i++) {
            const unsigned cluster = (i * group_size) / cluster_size;
            const unsigned comp = cluster * cluster_size + (cluster_size - 1);
            bld.group(group_size, i).MOV(horiz_offset(dest, i * group_size),
                                         component(scan, comp));
         }
      } else {
         bld.emit(SHADER_OPCODE_CLUSTER_BROADCAST, dest, scan,
                  brw_imm_ud(cluster_size - 1), brw_imm_ud(cluster_size));
      }
      break;
   }

   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan: {
      fs_reg src = get_nir_src(instr->src[0]);
      nir_op redop = (nir_op)nir_intrinsic_reduction_op(instr);

      /* Figure out the source type */
      src.type = brw_type_for_nir_type(devinfo,
         (nir_alu_type)(nir_op_infos[redop].input_types[0] |
                        nir_src_bit_size(instr->src[0])));

      fs_reg identity = brw_nir_reduction_op_identity(bld, redop, src.type);
      opcode brw_op = brw_op_for_nir_reduction_op(redop);
      brw_conditional_mod cond_mod = brw_cond_mod_for_nir_reduction_op(redop);

      /* Set up a register for all of our scratching around and initialize it
       * to the reduction operation's identity value.
       */
      fs_reg scan = bld.vgrf(src.type);
      const fs_builder allbld = bld.exec_all();
      allbld.emit(SHADER_OPCODE_SEL_EXEC, scan, src, identity);

      if (instr->intrinsic == nir_intrinsic_exclusive_scan) {
         /* Exclusive scan is a bit harder because we have to do an annoying
          * shift of the contents before we can begin. To make things worse,
          * we can't do this with a normal stride; we have to use indirects.
          */
         fs_reg shifted = bld.vgrf(src.type);
         fs_reg idx = bld.vgrf(BRW_REGISTER_TYPE_W);
         allbld.ADD(idx, nir_system_values[SYSTEM_VALUE_SUBGROUP_INVOCATION],
                    brw_imm_w(-1));
         allbld.emit(SHADER_OPCODE_SHUFFLE, shifted, scan, idx);
         allbld.group(1, 0).MOV(component(shifted, 0), identity);
         scan = shifted;
      }

      bld.emit_scan(brw_op, scan, dispatch_width, cond_mod);

      bld.MOV(retype(dest, src.type), scan);
      break;
   }

   case nir_intrinsic_load_global_block_intel: {
      assert(nir_dest_bit_size(instr->dest) == 32);

      fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[0]));

      const fs_builder ubld1 = bld.exec_all().group(1, 0);
      const fs_builder ubld8 = bld.exec_all().group(8, 0);
      const fs_builder ubld16 = bld.exec_all().group(16, 0);

      const unsigned total = instr->num_components * dispatch_width;
      unsigned loaded = 0;

      while (loaded < total) {
         const unsigned block =
            choose_oword_block_size_dwords(total - loaded);
         const unsigned block_bytes = block * 4;

         const fs_builder &ubld = block == 8 ?
ubld8 : ubld16; 5600 ubld.emit(SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, 5601 retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD), 5602 address, 5603 fs_reg(), /* No source data */ 5604 brw_imm_ud(block))->size_written = block_bytes; 5605 5606 increment_a64_address(ubld1, address, block_bytes); 5607 loaded += block; 5608 } 5609 5610 assert(loaded == total); 5611 break; 5612 } 5613 5614 case nir_intrinsic_store_global_block_intel: { 5615 assert(nir_src_bit_size(instr->src[0]) == 32); 5616 5617 fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[1])); 5618 fs_reg src = get_nir_src(instr->src[0]); 5619 5620 const fs_builder ubld1 = bld.exec_all().group(1, 0); 5621 const fs_builder ubld8 = bld.exec_all().group(8, 0); 5622 const fs_builder ubld16 = bld.exec_all().group(16, 0); 5623 5624 const unsigned total = instr->num_components * dispatch_width; 5625 unsigned written = 0; 5626 5627 while (written < total) { 5628 const unsigned block = 5629 choose_oword_block_size_dwords(total - written); 5630 5631 const fs_builder &ubld = block == 8 ? ubld8 : ubld16; 5632 ubld.emit(SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL, 5633 fs_reg(), 5634 address, 5635 retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD), 5636 brw_imm_ud(block)); 5637 5638 const unsigned block_bytes = block * 4; 5639 increment_a64_address(ubld1, address, block_bytes); 5640 written += block; 5641 } 5642 5643 assert(written == total); 5644 break; 5645 } 5646 5647 case nir_intrinsic_load_shared_block_intel: 5648 case nir_intrinsic_load_ssbo_block_intel: { 5649 assert(nir_dest_bit_size(instr->dest) == 32); 5650 5651 const bool is_ssbo = 5652 instr->intrinsic == nir_intrinsic_load_ssbo_block_intel; 5653 fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 1 : 0])); 5654 5655 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5656 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ? 5657 get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM)); 5658 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address; 5659 5660 const fs_builder ubld1 = bld.exec_all().group(1, 0); 5661 const fs_builder ubld8 = bld.exec_all().group(8, 0); 5662 const fs_builder ubld16 = bld.exec_all().group(16, 0); 5663 5664 const unsigned total = instr->num_components * dispatch_width; 5665 unsigned loaded = 0; 5666 5667 while (loaded < total) { 5668 const unsigned block = 5669 choose_oword_block_size_dwords(total - loaded); 5670 const unsigned block_bytes = block * 4; 5671 5672 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); 5673 5674 const fs_builder &ubld = block == 8 ? ubld8 : ubld16; 5675 ubld.emit(SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL, 5676 retype(byte_offset(dest, loaded * 4), BRW_REGISTER_TYPE_UD), 5677 srcs, SURFACE_LOGICAL_NUM_SRCS)->size_written = block_bytes; 5678 5679 ubld1.ADD(address, address, brw_imm_ud(block_bytes)); 5680 loaded += block; 5681 } 5682 5683 assert(loaded == total); 5684 break; 5685 } 5686 5687 case nir_intrinsic_store_shared_block_intel: 5688 case nir_intrinsic_store_ssbo_block_intel: { 5689 assert(nir_src_bit_size(instr->src[0]) == 32); 5690 5691 const bool is_ssbo = 5692 instr->intrinsic == nir_intrinsic_store_ssbo_block_intel; 5693 5694 fs_reg address = bld.emit_uniformize(get_nir_src(instr->src[is_ssbo ? 2 : 1])); 5695 fs_reg src = get_nir_src(instr->src[0]); 5696 5697 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5698 srcs[SURFACE_LOGICAL_SRC_SURFACE] = is_ssbo ? 
5699 get_nir_ssbo_intrinsic_index(bld, instr) : fs_reg(brw_imm_ud(GFX7_BTI_SLM)); 5700 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = address; 5701 5702 const fs_builder ubld1 = bld.exec_all().group(1, 0); 5703 const fs_builder ubld8 = bld.exec_all().group(8, 0); 5704 const fs_builder ubld16 = bld.exec_all().group(16, 0); 5705 5706 const unsigned total = instr->num_components * dispatch_width; 5707 unsigned written = 0; 5708 5709 while (written < total) { 5710 const unsigned block = 5711 choose_oword_block_size_dwords(total - written); 5712 5713 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(block); 5714 srcs[SURFACE_LOGICAL_SRC_DATA] = 5715 retype(byte_offset(src, written * 4), BRW_REGISTER_TYPE_UD); 5716 5717 const fs_builder &ubld = block == 8 ? ubld8 : ubld16; 5718 ubld.emit(SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL, 5719 fs_reg(), srcs, SURFACE_LOGICAL_NUM_SRCS); 5720 5721 const unsigned block_bytes = block * 4; 5722 ubld1.ADD(address, address, brw_imm_ud(block_bytes)); 5723 written += block; 5724 } 5725 5726 assert(written == total); 5727 break; 5728 } 5729 5730 case nir_intrinsic_load_btd_dss_id_intel: 5731 bld.emit(SHADER_OPCODE_GET_DSS_ID, 5732 retype(dest, BRW_REGISTER_TYPE_UD)); 5733 break; 5734 5735 case nir_intrinsic_load_btd_stack_id_intel: 5736 if (stage == MESA_SHADER_COMPUTE) { 5737 assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids); 5738 } else { 5739 assert(brw_shader_stage_is_bindless(stage)); 5740 } 5741 /* Stack IDs are always in R1 regardless of whether we're coming from a 5742 * bindless shader or a regular compute shader. 5743 */ 5744 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), 5745 retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UW)); 5746 break; 5747 5748 case nir_intrinsic_btd_spawn_intel: 5749 if (stage == MESA_SHADER_COMPUTE) { 5750 assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids); 5751 } else { 5752 assert(brw_shader_stage_is_bindless(stage)); 5753 } 5754 bld.emit(SHADER_OPCODE_BTD_SPAWN_LOGICAL, bld.null_reg_ud(), 5755 bld.emit_uniformize(get_nir_src(instr->src[0])), 5756 get_nir_src(instr->src[1])); 5757 break; 5758 5759 case nir_intrinsic_btd_retire_intel: 5760 if (stage == MESA_SHADER_COMPUTE) { 5761 assert(brw_cs_prog_data(prog_data)->uses_btd_stack_ids); 5762 } else { 5763 assert(brw_shader_stage_is_bindless(stage)); 5764 } 5765 bld.emit(SHADER_OPCODE_BTD_RETIRE_LOGICAL); 5766 break; 5767 5768 default: 5769 unreachable("unknown intrinsic"); 5770 } 5771} 5772 5773void 5774fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, 5775 int op, nir_intrinsic_instr *instr) 5776{ 5777 /* The BTI untyped atomic messages only support 32-bit atomics. If you 5778 * just look at the big table of messages in the Vol 7 of the SKL PRM, they 5779 * appear to exist. However, if you look at Vol 2a, there are no message 5780 * descriptors provided for Qword atomic ops except for A64 messages. 
5781 */ 5782 assert(nir_dest_bit_size(instr->dest) == 32 || 5783 (nir_dest_bit_size(instr->dest) == 64 && devinfo->has_lsc)); 5784 5785 fs_reg dest; 5786 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 5787 dest = get_nir_dest(instr->dest); 5788 5789 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5790 srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr); 5791 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 5792 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5793 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 5794 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 5795 5796 fs_reg data; 5797 if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 5798 data = get_nir_src(instr->src[2]); 5799 5800 if (op == BRW_AOP_CMPWR) { 5801 fs_reg tmp = bld.vgrf(data.type, 2); 5802 fs_reg sources[2] = { data, get_nir_src(instr->src[3]) }; 5803 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 5804 data = tmp; 5805 } 5806 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5807 5808 /* Emit the actual atomic operation */ 5809 5810 bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL, 5811 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5812} 5813 5814void 5815fs_visitor::nir_emit_ssbo_atomic_float(const fs_builder &bld, 5816 int op, nir_intrinsic_instr *instr) 5817{ 5818 fs_reg dest; 5819 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 5820 dest = get_nir_dest(instr->dest); 5821 5822 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5823 srcs[SURFACE_LOGICAL_SRC_SURFACE] = get_nir_ssbo_intrinsic_index(bld, instr); 5824 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = get_nir_src(instr->src[1]); 5825 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5826 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 5827 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 5828 5829 fs_reg data = get_nir_src(instr->src[2]); 5830 if (op == BRW_AOP_FCMPWR) { 5831 fs_reg tmp = bld.vgrf(data.type, 2); 5832 fs_reg sources[2] = { data, get_nir_src(instr->src[3]) }; 5833 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 5834 data = tmp; 5835 } 5836 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5837 5838 /* Emit the actual atomic operation */ 5839 5840 bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL, 5841 dest, srcs, SURFACE_LOGICAL_NUM_SRCS); 5842} 5843 5844void 5845fs_visitor::nir_emit_shared_atomic(const fs_builder &bld, 5846 int op, nir_intrinsic_instr *instr) 5847{ 5848 fs_reg dest; 5849 if (nir_intrinsic_infos[instr->intrinsic].has_dest) 5850 dest = get_nir_dest(instr->dest); 5851 5852 fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS]; 5853 srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM); 5854 srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1); 5855 srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op); 5856 srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1); 5857 5858 fs_reg data; 5859 if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) 5860 data = get_nir_src(instr->src[1]); 5861 if (op == BRW_AOP_CMPWR) { 5862 fs_reg tmp = bld.vgrf(data.type, 2); 5863 fs_reg sources[2] = { data, get_nir_src(instr->src[2]) }; 5864 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 5865 data = tmp; 5866 } 5867 srcs[SURFACE_LOGICAL_SRC_DATA] = data; 5868 5869 /* Get the offset */ 5870 if (nir_src_is_const(instr->src[0])) { 5871 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = 5872 brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0])); 5873 } else { 5874 srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type); 5875 bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS], 5876 retype(get_nir_src(instr->src[0]), 
BRW_REGISTER_TYPE_UD),
              brw_imm_ud(instr->const_index[0]));
   }

   /* Emit the actual atomic operation */

   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
}

void
fs_visitor::nir_emit_shared_atomic_float(const fs_builder &bld,
                                         int op, nir_intrinsic_instr *instr)
{
   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   fs_reg srcs[SURFACE_LOGICAL_NUM_SRCS];
   srcs[SURFACE_LOGICAL_SRC_SURFACE] = brw_imm_ud(GFX7_BTI_SLM);
   srcs[SURFACE_LOGICAL_SRC_IMM_DIMS] = brw_imm_ud(1);
   srcs[SURFACE_LOGICAL_SRC_IMM_ARG] = brw_imm_ud(op);
   srcs[SURFACE_LOGICAL_SRC_ALLOW_SAMPLE_MASK] = brw_imm_ud(1);

   fs_reg data = get_nir_src(instr->src[1]);
   if (op == BRW_AOP_FCMPWR) {
      fs_reg tmp = bld.vgrf(data.type, 2);
      fs_reg sources[2] = { data, get_nir_src(instr->src[2]) };
      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
      data = tmp;
   }
   srcs[SURFACE_LOGICAL_SRC_DATA] = data;

   /* Get the offset */
   if (nir_src_is_const(instr->src[0])) {
      srcs[SURFACE_LOGICAL_SRC_ADDRESS] =
         brw_imm_ud(instr->const_index[0] + nir_src_as_uint(instr->src[0]));
   } else {
      srcs[SURFACE_LOGICAL_SRC_ADDRESS] = vgrf(glsl_type::uint_type);
      bld.ADD(srcs[SURFACE_LOGICAL_SRC_ADDRESS],
              retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(instr->const_index[0]));
   }

   /* Emit the actual atomic operation */

   bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL,
            dest, srcs, SURFACE_LOGICAL_NUM_SRCS);
}

static fs_reg
expand_to_32bit(const fs_builder &bld, const fs_reg &src)
{
   if (type_sz(src.type) == 2) {
      fs_reg src32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.MOV(src32, retype(src, BRW_REGISTER_TYPE_UW));
      return src32;
   } else {
      return src;
   }
}

void
fs_visitor::nir_emit_global_atomic(const fs_builder &bld,
                                   int op, nir_intrinsic_instr *instr)
{
   fs_reg dest;
   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
      dest = get_nir_dest(instr->dest);

   fs_reg addr = get_nir_src(instr->src[0]);

   fs_reg data;
   if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC)
      data = expand_to_32bit(bld, get_nir_src(instr->src[1]));

   if (op == BRW_AOP_CMPWR) {
      fs_reg tmp = bld.vgrf(data.type, 2);
      fs_reg sources[2] = {
         data,
         expand_to_32bit(bld, get_nir_src(instr->src[2]))
      };
      bld.LOAD_PAYLOAD(tmp, sources, 2, 0);
      data = tmp;
   }

   switch (nir_dest_bit_size(instr->dest)) {
   case 16: {
      fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT16_LOGICAL,
               dest32, addr, data, brw_imm_ud(op));
      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32);
      break;
   }
   case 32:
      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL,
               dest, addr, data, brw_imm_ud(op));
      break;
   case 64:
      bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL,
               dest, addr, data, brw_imm_ud(op));
      break;
   default:
      unreachable("Unsupported bit size");
   }
}

void
fs_visitor::nir_emit_global_atomic_float(const fs_builder &bld,
                                         int op, nir_intrinsic_instr *instr)
{
   assert(nir_intrinsic_infos[instr->intrinsic].has_dest);
   fs_reg dest = get_nir_dest(instr->dest);

   fs_reg addr =
get_nir_src(instr->src[0]); 5991 5992 assert(op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC); 5993 fs_reg data = expand_to_32bit(bld, get_nir_src(instr->src[1])); 5994 5995 if (op == BRW_AOP_FCMPWR) { 5996 fs_reg tmp = bld.vgrf(data.type, 2); 5997 fs_reg sources[2] = { 5998 data, 5999 expand_to_32bit(bld, get_nir_src(instr->src[2])) 6000 }; 6001 bld.LOAD_PAYLOAD(tmp, sources, 2, 0); 6002 data = tmp; 6003 } 6004 6005 switch (nir_dest_bit_size(instr->dest)) { 6006 case 16: { 6007 fs_reg dest32 = bld.vgrf(BRW_REGISTER_TYPE_UD); 6008 bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT16_LOGICAL, 6009 dest32, addr, data, brw_imm_ud(op)); 6010 bld.MOV(retype(dest, BRW_REGISTER_TYPE_UW), dest32); 6011 break; 6012 } 6013 case 32: 6014 bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT32_LOGICAL, 6015 dest, addr, data, brw_imm_ud(op)); 6016 break; 6017 case 64: 6018 bld.emit(SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT64_LOGICAL, 6019 dest, addr, data, brw_imm_ud(op)); 6020 break; 6021 default: 6022 unreachable("Unsupported bit size"); 6023 } 6024} 6025 6026void 6027fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr) 6028{ 6029 unsigned texture = instr->texture_index; 6030 unsigned sampler = instr->sampler_index; 6031 6032 fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 6033 6034 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture); 6035 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler); 6036 6037 int lod_components = 0; 6038 6039 /* The hardware requires a LOD for buffer textures */ 6040 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) 6041 srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0); 6042 6043 uint32_t header_bits = 0; 6044 for (unsigned i = 0; i < instr->num_srcs; i++) { 6045 fs_reg src = get_nir_src(instr->src[i].src); 6046 switch (instr->src[i].src_type) { 6047 case nir_tex_src_bias: 6048 srcs[TEX_LOGICAL_SRC_LOD] = 6049 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 6050 break; 6051 case nir_tex_src_comparator: 6052 srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F); 6053 break; 6054 case nir_tex_src_coord: 6055 switch (instr->op) { 6056 case nir_texop_txf: 6057 case nir_texop_txf_ms: 6058 case nir_texop_txf_ms_mcs_intel: 6059 case nir_texop_samples_identical: 6060 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D); 6061 break; 6062 default: 6063 srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F); 6064 break; 6065 } 6066 6067 /* Wa_14013363432: 6068 * 6069 * Compiler should send U,V,R parameters even if V,R are 0. 
6070 */ 6071 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && devinfo->verx10 == 125) 6072 assert(instr->coord_components >= 3u); 6073 break; 6074 case nir_tex_src_ddx: 6075 srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F); 6076 lod_components = nir_tex_instr_src_size(instr, i); 6077 break; 6078 case nir_tex_src_ddy: 6079 srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F); 6080 break; 6081 case nir_tex_src_lod: 6082 switch (instr->op) { 6083 case nir_texop_txs: 6084 srcs[TEX_LOGICAL_SRC_LOD] = 6085 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD); 6086 break; 6087 case nir_texop_txf: 6088 srcs[TEX_LOGICAL_SRC_LOD] = 6089 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D); 6090 break; 6091 default: 6092 srcs[TEX_LOGICAL_SRC_LOD] = 6093 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 6094 break; 6095 } 6096 break; 6097 case nir_tex_src_min_lod: 6098 srcs[TEX_LOGICAL_SRC_MIN_LOD] = 6099 retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F); 6100 break; 6101 case nir_tex_src_ms_index: 6102 srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD); 6103 break; 6104 6105 case nir_tex_src_offset: { 6106 uint32_t offset_bits = 0; 6107 if (brw_texture_offset(instr, i, &offset_bits)) { 6108 header_bits |= offset_bits; 6109 } else { 6110 srcs[TEX_LOGICAL_SRC_TG4_OFFSET] = 6111 retype(src, BRW_REGISTER_TYPE_D); 6112 } 6113 break; 6114 } 6115 6116 case nir_tex_src_projector: 6117 unreachable("should be lowered"); 6118 6119 case nir_tex_src_texture_offset: { 6120 /* Emit code to evaluate the actual indexing expression */ 6121 fs_reg tmp = vgrf(glsl_type::uint_type); 6122 bld.ADD(tmp, src, brw_imm_ud(texture)); 6123 srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp); 6124 break; 6125 } 6126 6127 case nir_tex_src_sampler_offset: { 6128 /* Emit code to evaluate the actual indexing expression */ 6129 fs_reg tmp = vgrf(glsl_type::uint_type); 6130 bld.ADD(tmp, src, brw_imm_ud(sampler)); 6131 srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp); 6132 break; 6133 } 6134 6135 case nir_tex_src_texture_handle: 6136 assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1); 6137 srcs[TEX_LOGICAL_SRC_SURFACE] = fs_reg(); 6138 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = bld.emit_uniformize(src); 6139 break; 6140 6141 case nir_tex_src_sampler_handle: 6142 assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1); 6143 srcs[TEX_LOGICAL_SRC_SAMPLER] = fs_reg(); 6144 srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] = bld.emit_uniformize(src); 6145 break; 6146 6147 case nir_tex_src_ms_mcs_intel: 6148 assert(instr->op == nir_texop_txf_ms); 6149 srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D); 6150 break; 6151 6152 case nir_tex_src_plane: { 6153 const uint32_t plane = nir_src_as_uint(instr->src[i].src); 6154 const uint32_t texture_index = 6155 instr->texture_index + 6156 stage_prog_data->binding_table.plane_start[plane] - 6157 stage_prog_data->binding_table.texture_start; 6158 6159 srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index); 6160 break; 6161 } 6162 6163 default: 6164 unreachable("unknown texture source"); 6165 } 6166 } 6167 6168 if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE && 6169 (instr->op == nir_texop_txf_ms || 6170 instr->op == nir_texop_samples_identical)) { 6171 if (devinfo->ver >= 7 && 6172 key_tex->compressed_multisample_layout_mask & (1 << texture)) { 6173 srcs[TEX_LOGICAL_SRC_MCS] = 6174 emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE], 6175 instr->coord_components, 6176 
srcs[TEX_LOGICAL_SRC_SURFACE], 6177 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE]); 6178 } else { 6179 srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u); 6180 } 6181 } 6182 6183 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components); 6184 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components); 6185 6186 enum opcode opcode; 6187 switch (instr->op) { 6188 case nir_texop_tex: 6189 opcode = SHADER_OPCODE_TEX_LOGICAL; 6190 break; 6191 case nir_texop_txb: 6192 opcode = FS_OPCODE_TXB_LOGICAL; 6193 break; 6194 case nir_texop_txl: 6195 opcode = SHADER_OPCODE_TXL_LOGICAL; 6196 break; 6197 case nir_texop_txd: 6198 opcode = SHADER_OPCODE_TXD_LOGICAL; 6199 break; 6200 case nir_texop_txf: 6201 opcode = SHADER_OPCODE_TXF_LOGICAL; 6202 break; 6203 case nir_texop_txf_ms: 6204 if ((key_tex->msaa_16 & (1 << sampler))) 6205 opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL; 6206 else 6207 opcode = SHADER_OPCODE_TXF_CMS_LOGICAL; 6208 break; 6209 case nir_texop_txf_ms_mcs_intel: 6210 opcode = SHADER_OPCODE_TXF_MCS_LOGICAL; 6211 break; 6212 case nir_texop_query_levels: 6213 case nir_texop_txs: 6214 opcode = SHADER_OPCODE_TXS_LOGICAL; 6215 break; 6216 case nir_texop_lod: 6217 opcode = SHADER_OPCODE_LOD_LOGICAL; 6218 break; 6219 case nir_texop_tg4: 6220 if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) 6221 opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL; 6222 else 6223 opcode = SHADER_OPCODE_TG4_LOGICAL; 6224 break; 6225 case nir_texop_texture_samples: 6226 opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL; 6227 break; 6228 case nir_texop_samples_identical: { 6229 fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D); 6230 6231 /* If mcs is an immediate value, it means there is no MCS. In that case 6232 * just return false. 6233 */ 6234 if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) { 6235 bld.MOV(dst, brw_imm_ud(0u)); 6236 } else if ((key_tex->msaa_16 & (1 << sampler))) { 6237 fs_reg tmp = vgrf(glsl_type::uint_type); 6238 bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS], 6239 offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1)); 6240 bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ); 6241 } else { 6242 bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u), 6243 BRW_CONDITIONAL_EQ); 6244 } 6245 return; 6246 } 6247 default: 6248 unreachable("unknown texture opcode"); 6249 } 6250 6251 if (instr->op == nir_texop_tg4) { 6252 if (instr->component == 1 && 6253 key_tex->gather_channel_quirk_mask & (1 << texture)) { 6254 /* gather4 sampler is broken for green channel on RG32F -- 6255 * we must ask for blue instead. 6256 */ 6257 header_bits |= 2 << 16; 6258 } else { 6259 header_bits |= instr->component << 16; 6260 } 6261 } 6262 6263 fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4); 6264 fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs)); 6265 inst->offset = header_bits; 6266 6267 const unsigned dest_size = nir_tex_instr_dest_size(instr); 6268 if (devinfo->ver >= 9 && 6269 instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) { 6270 unsigned write_mask = instr->dest.is_ssa ? 
         nir_ssa_def_components_read(&instr->dest.ssa) :
         (1 << dest_size) - 1;
      assert(write_mask != 0); /* dead code should have been eliminated */
      inst->size_written = util_last_bit(write_mask) *
                           inst->dst.component_size(inst->exec_size);
   } else {
      inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
   }

   if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
      inst->shadow_compare = true;

   if (instr->op == nir_texop_tg4 && devinfo->ver == 6)
      emit_gfx6_gather_wa(key_tex->gfx6_gather_wa[texture], dst);

   fs_reg nir_dest[5];
   for (unsigned i = 0; i < dest_size; i++)
      nir_dest[i] = offset(dst, bld, i);

   if (instr->op == nir_texop_query_levels) {
      /* # levels is in .w */
      if (devinfo->ver <= 9) {
         /**
          * Wa_1940217:
          *
          * When a surface of type SURFTYPE_NULL is accessed by resinfo, the
          * MIPCount returned is undefined instead of 0.
          */
         fs_inst *mov = bld.MOV(bld.null_reg_d(), dst);
         mov->conditional_mod = BRW_CONDITIONAL_NZ;
         nir_dest[0] = bld.vgrf(BRW_REGISTER_TYPE_D);
         fs_inst *sel = bld.SEL(nir_dest[0], offset(dst, bld, 3),
                                brw_imm_d(0));
         sel->predicate = BRW_PREDICATE_NORMAL;
      } else {
         nir_dest[0] = offset(dst, bld, 3);
      }
   } else if (instr->op == nir_texop_txs &&
              dest_size >= 3 && devinfo->ver < 7) {
      /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */
      fs_reg depth = offset(dst, bld, 2);
      nir_dest[2] = vgrf(glsl_type::int_type);
      bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
   }

   bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
}

void
fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
{
   switch (instr->type) {
   case nir_jump_break:
      bld.emit(BRW_OPCODE_BREAK);
      break;
   case nir_jump_continue:
      bld.emit(BRW_OPCODE_CONTINUE);
      break;
   case nir_jump_halt:
      bld.emit(BRW_OPCODE_HALT);
      break;
   case nir_jump_return:
   default:
      unreachable("unknown jump");
   }
}

/*
 * This helper takes a source register and un/shuffles it into the
 * destination register.
 *
 * If the source type size is smaller than the destination type size, the
 * operation needed is a component shuffle. The opposite case would be an
 * unshuffle. If the source/destination type sizes are equal, a shuffle is
 * done that is equivalent to a simple MOV.
 *
 * For example, if the source is a 16-bit type and the destination is 32-bit,
 * a 3-component .xyz 16-bit vector on SIMD8 would be:
 *
 *    |x1|x2|x3|x4|x5|x6|x7|x8|y1|y2|y3|y4|y5|y6|y7|y8|
 *    |z1|z2|z3|z4|z5|z6|z7|z8|  |  |  |  |  |  |  |  |
 *
 * This helper will return the following 2 32-bit components with the 16-bit
 * values shuffled:
 *
 *    |x1 y1|x2 y2|x3 y3|x4 y4|x5 y5|x6 y6|x7 y7|x8 y8|
 *    |z1   |z2   |z3   |z4   |z5   |z6   |z7   |z8   |
 *
 * For an unshuffle, the example would be the opposite: a 64-bit type source
 * and a 32-bit destination. A 2-component .xy 64-bit vector on SIMD8 would
 * be:
 *
 *    | x1l x1h | x2l x2h | x3l x3h | x4l x4h |
 *    | x5l x5h | x6l x6h | x7l x7h | x8l x8h |
 *    | y1l y1h | y2l y2h | y3l y3h | y4l y4h |
 *    | y5l y5h | y6l y6h | y7l y7h | y8l y8h |
 *
 * The returned result would be the following 4 32-bit components unshuffled:
 *
 *    | x1l | x2l | x3l | x4l | x5l | x6l | x7l | x8l |
 *    | x1h | x2h | x3h | x4h | x5h | x6h | x7h | x8h |
 *    | y1l | y2l | y3l | y4l | y5l | y6l | y7l | y8l |
 *    | y1h | y2h | y3h | y4h | y5h | y6h | y7h | y8h |
 *
 * - Source and destination registers must not overlap.
 * - Component units are measured in terms of the smaller type between
 *   source and destination because we are un/shuffling the smaller
 *   components from/into the bigger ones.
 * - The first_component parameter allows skipping source components.
 */
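/* For instance (a sketch of the first diagram above; the registers here are
 * illustrative, not taken from the surrounding code):
 *
 *    fs_reg src16 = bld.vgrf(BRW_REGISTER_TYPE_W, 3);  // 16-bit .xyz
 *    fs_reg dst32 = bld.vgrf(BRW_REGISTER_TYPE_D, 2);  // two 32-bit comps
 *    shuffle_src_to_dst(bld, dst32, src16, 0, 3);      // counts 16-bit units
 *
 * x and y land in the two halves of the first 32-bit component and z in the
 * low half of the second, as drawn above.
 */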
void
shuffle_src_to_dst(const fs_builder &bld,
                   const fs_reg &dst,
                   const fs_reg &src,
                   uint32_t first_component,
                   uint32_t components)
{
   if (type_sz(src.type) == type_sz(dst.type)) {
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() * components,
         offset(src, bld, first_component),
         type_sz(src.type) * bld.dispatch_width() * components));
      for (unsigned i = 0; i < components; i++) {
         bld.MOV(retype(offset(dst, bld, i), src.type),
                 offset(src, bld, i + first_component));
      }
   } else if (type_sz(src.type) < type_sz(dst.type)) {
      /* Source is shuffled into destination */
      unsigned size_ratio = type_sz(dst.type) / type_sz(src.type);
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() *
         DIV_ROUND_UP(components, size_ratio),
         offset(src, bld, first_component),
         type_sz(src.type) * bld.dispatch_width() * components));

      brw_reg_type shuffle_type =
         brw_reg_type_from_bit_size(8 * type_sz(src.type),
                                    BRW_REGISTER_TYPE_D);
      for (unsigned i = 0; i < components; i++) {
         fs_reg shuffle_component_i =
            subscript(offset(dst, bld, i / size_ratio),
                      shuffle_type, i % size_ratio);
         bld.MOV(shuffle_component_i,
                 retype(offset(src, bld, i + first_component), shuffle_type));
      }
   } else {
      /* Source is unshuffled into destination */
      unsigned size_ratio = type_sz(src.type) / type_sz(dst.type);
      assert(!regions_overlap(dst,
         type_sz(dst.type) * bld.dispatch_width() * components,
         offset(src, bld, first_component / size_ratio),
         type_sz(src.type) * bld.dispatch_width() *
         DIV_ROUND_UP(components + (first_component % size_ratio),
                      size_ratio)));

      brw_reg_type shuffle_type =
         brw_reg_type_from_bit_size(8 * type_sz(dst.type),
                                    BRW_REGISTER_TYPE_D);
      for (unsigned i = 0; i < components; i++) {
         fs_reg shuffle_component_i =
            subscript(offset(src, bld, (first_component + i) / size_ratio),
                      shuffle_type, (first_component + i) % size_ratio);
         bld.MOV(retype(offset(dst, bld, i), shuffle_type),
                 shuffle_component_i);
      }
   }
}

void
shuffle_from_32bit_read(const fs_builder &bld,
                        const fs_reg &dst,
                        const fs_reg &src,
                        uint32_t first_component,
                        uint32_t components)
{
   assert(type_sz(src.type) == 4);

   /* This function takes components in units of the destination type while
    * shuffle_src_to_dst takes components in units of the smallest type.
    */
   if (type_sz(dst.type) > 4) {
      assert(type_sz(dst.type) == 8);
      first_component *= 2;
      components *= 2;
   }

   shuffle_src_to_dst(bld, dst, src, first_component, components);
}
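/* Worked example for the wrapper above (illustrative): reading two 64-bit
 * components starting at 64-bit component 1 turns (first_component = 1,
 * components = 2) into (2, 4) before delegating to shuffle_src_to_dst,
 * which counts in units of the smaller (32-bit) type.
 */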
fs_reg
setup_imm_df(const fs_builder &bld, double v)
{
   const struct intel_device_info *devinfo = bld.shader->devinfo;
   assert(devinfo->ver >= 7);

   if (devinfo->ver >= 8)
      return brw_imm_df(v);

   /* gfx7.5 does not support DF immediates straightforwardly, but the DIM
    * instruction allows setting a 64-bit immediate value.
    */
   if (devinfo->is_haswell) {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
      ubld.DIM(dst, brw_imm_df(v));
      return component(dst, 0);
   }

   /* gfx7 does not support DF immediates, so we generate a 64-bit constant
    * by writing the low 32 bits of the constant to suboffset 0 of a VGRF and
    * the high 32 bits to suboffset 4 and then applying a stride of 0.
    *
    * Alternatively, we could also produce a normal VGRF (without stride 0)
    * by writing to all the channels in the VGRF; however, that would hit the
    * gfx7 bug where we have to split writes that span more than 1 register
    * into instructions with a width of 4 (otherwise the write to the second
    * register written runs into an execmask hardware bug), which isn't very
    * nice.
    */
   union {
      double d;
      struct {
         uint32_t i1;
         uint32_t i2;
      };
   } di;

   di.d = v;

   const fs_builder ubld = bld.exec_all().group(1, 0);
   const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
   ubld.MOV(tmp, brw_imm_ud(di.i1));
   ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud(di.i2));

   return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
}

fs_reg
setup_imm_b(const fs_builder &bld, int8_t v)
{
   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_B);
   bld.MOV(tmp, brw_imm_w(v));
   return tmp;
}

fs_reg
setup_imm_ub(const fs_builder &bld, uint8_t v)
{
   const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UB);
   bld.MOV(tmp, brw_imm_uw(v));
   return tmp;
}
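/* The union in setup_imm_df splits the IEEE-754 bit pattern of the double
 * into dword halves on a little-endian host; e.g. for v = 1.0
 * (0x3ff0000000000000):
 *
 *    di.i1 == 0x00000000   // low dword, written at suboffset 0
 *    di.i2 == 0x3ff00000   // high dword, written at suboffset 4
 *
 * Retyping the VGRF pair back to DF with a stride of 0 then yields the
 * scalar constant 1.0 in every channel.
 */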