1/* 2 * Copyright © 2014 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "brw_nir.h" 25#include "brw_shader.h" 26#include "dev/gen_debug.h" 27#include "compiler/glsl_types.h" 28#include "compiler/nir/nir_builder.h" 29#include "util/u_math.h" 30 31static bool 32is_input(nir_intrinsic_instr *intrin) 33{ 34 return intrin->intrinsic == nir_intrinsic_load_input || 35 intrin->intrinsic == nir_intrinsic_load_per_vertex_input || 36 intrin->intrinsic == nir_intrinsic_load_interpolated_input; 37} 38 39static bool 40is_output(nir_intrinsic_instr *intrin) 41{ 42 return intrin->intrinsic == nir_intrinsic_load_output || 43 intrin->intrinsic == nir_intrinsic_load_per_vertex_output || 44 intrin->intrinsic == nir_intrinsic_store_output || 45 intrin->intrinsic == nir_intrinsic_store_per_vertex_output; 46} 47 48/** 49 * In many cases, we just add the base and offset together, so there's no 50 * reason to keep them separate. Sometimes, combining them is essential: 51 * if a shader only accesses part of a compound variable (such as a matrix 52 * or array), the variable's base may not actually exist in the VUE map. 53 * 54 * This pass adds constant offsets to instr->const_index[0], and resets 55 * the offset source to 0. Non-constant offsets remain unchanged - since 56 * we don't know what part of a compound variable is accessed, we allocate 57 * storage for the entire thing. 58 */ 59 60static bool 61add_const_offset_to_base_block(nir_block *block, nir_builder *b, 62 nir_variable_mode mode) 63{ 64 nir_foreach_instr_safe(instr, block) { 65 if (instr->type != nir_instr_type_intrinsic) 66 continue; 67 68 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 69 70 if ((mode == nir_var_shader_in && is_input(intrin)) || 71 (mode == nir_var_shader_out && is_output(intrin))) { 72 nir_src *offset = nir_get_io_offset_src(intrin); 73 74 if (nir_src_is_const(*offset)) { 75 intrin->const_index[0] += nir_src_as_uint(*offset); 76 b->cursor = nir_before_instr(&intrin->instr); 77 nir_instr_rewrite_src(&intrin->instr, offset, 78 nir_src_for_ssa(nir_imm_int(b, 0))); 79 } 80 } 81 } 82 return true; 83} 84 85static void 86add_const_offset_to_base(nir_shader *nir, nir_variable_mode mode) 87{ 88 nir_foreach_function(f, nir) { 89 if (f->impl) { 90 nir_builder b; 91 nir_builder_init(&b, f->impl); 92 nir_foreach_block(block, f->impl) { 93 add_const_offset_to_base_block(block, &b, mode); 94 } 95 } 96 } 97} 98 99static bool 100remap_tess_levels(nir_builder *b, nir_intrinsic_instr *intr, 101 GLenum primitive_mode) 102{ 103 const int location = nir_intrinsic_base(intr); 104 const unsigned component = nir_intrinsic_component(intr); 105 bool out_of_bounds; 106 107 if (location == VARYING_SLOT_TESS_LEVEL_INNER) { 108 switch (primitive_mode) { 109 case GL_QUADS: 110 /* gl_TessLevelInner[0..1] lives at DWords 3-2 (reversed). */ 111 nir_intrinsic_set_base(intr, 0); 112 nir_intrinsic_set_component(intr, 3 - component); 113 out_of_bounds = false; 114 break; 115 case GL_TRIANGLES: 116 /* gl_TessLevelInner[0] lives at DWord 4. */ 117 nir_intrinsic_set_base(intr, 1); 118 out_of_bounds = component > 0; 119 break; 120 case GL_ISOLINES: 121 out_of_bounds = true; 122 break; 123 default: 124 unreachable("Bogus tessellation domain"); 125 } 126 } else if (location == VARYING_SLOT_TESS_LEVEL_OUTER) { 127 if (primitive_mode == GL_ISOLINES) { 128 /* gl_TessLevelOuter[0..1] lives at DWords 6-7 (in order). */ 129 nir_intrinsic_set_base(intr, 1); 130 nir_intrinsic_set_component(intr, 2 + nir_intrinsic_component(intr)); 131 out_of_bounds = component > 1; 132 } else { 133 /* Triangles use DWords 7-5 (reversed); Quads use 7-4 (reversed) */ 134 nir_intrinsic_set_base(intr, 1); 135 nir_intrinsic_set_component(intr, 3 - nir_intrinsic_component(intr)); 136 out_of_bounds = component == 3 && primitive_mode == GL_TRIANGLES; 137 } 138 } else { 139 return false; 140 } 141 142 if (out_of_bounds) { 143 if (nir_intrinsic_infos[intr->intrinsic].has_dest) { 144 b->cursor = nir_before_instr(&intr->instr); 145 nir_ssa_def *undef = nir_ssa_undef(b, 1, 32); 146 nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(undef)); 147 } 148 nir_instr_remove(&intr->instr); 149 } 150 151 return true; 152} 153 154static bool 155remap_patch_urb_offsets(nir_block *block, nir_builder *b, 156 const struct brw_vue_map *vue_map, 157 GLenum tes_primitive_mode) 158{ 159 const bool is_passthrough_tcs = b->shader->info.name && 160 strcmp(b->shader->info.name, "passthrough") == 0; 161 162 nir_foreach_instr_safe(instr, block) { 163 if (instr->type != nir_instr_type_intrinsic) 164 continue; 165 166 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 167 168 gl_shader_stage stage = b->shader->info.stage; 169 170 if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) || 171 (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) { 172 173 if (!is_passthrough_tcs && 174 remap_tess_levels(b, intrin, tes_primitive_mode)) 175 continue; 176 177 int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]]; 178 assert(vue_slot != -1); 179 intrin->const_index[0] = vue_slot; 180 181 nir_src *vertex = nir_get_io_vertex_index_src(intrin); 182 if (vertex) { 183 if (nir_src_is_const(*vertex)) { 184 intrin->const_index[0] += nir_src_as_uint(*vertex) * 185 vue_map->num_per_vertex_slots; 186 } else { 187 b->cursor = nir_before_instr(&intrin->instr); 188 189 /* Multiply by the number of per-vertex slots. */ 190 nir_ssa_def *vertex_offset = 191 nir_imul(b, 192 nir_ssa_for_src(b, *vertex, 1), 193 nir_imm_int(b, 194 vue_map->num_per_vertex_slots)); 195 196 /* Add it to the existing offset */ 197 nir_src *offset = nir_get_io_offset_src(intrin); 198 nir_ssa_def *total_offset = 199 nir_iadd(b, vertex_offset, 200 nir_ssa_for_src(b, *offset, 1)); 201 202 nir_instr_rewrite_src(&intrin->instr, offset, 203 nir_src_for_ssa(total_offset)); 204 } 205 } 206 } 207 } 208 return true; 209} 210 211void 212brw_nir_lower_vs_inputs(nir_shader *nir, 213 const uint8_t *vs_attrib_wa_flags) 214{ 215 /* Start with the location of the variable's base. */ 216 foreach_list_typed(nir_variable, var, node, &nir->inputs) { 217 var->data.driver_location = var->data.location; 218 } 219 220 /* Now use nir_lower_io to walk dereference chains. Attribute arrays are 221 * loaded as one vec4 or dvec4 per element (or matrix column), depending on 222 * whether it is a double-precision type or not. 223 */ 224 nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0); 225 226 /* This pass needs actual constants */ 227 nir_opt_constant_folding(nir); 228 229 add_const_offset_to_base(nir, nir_var_shader_in); 230 231 brw_nir_apply_attribute_workarounds(nir, vs_attrib_wa_flags); 232 233 /* The last step is to remap VERT_ATTRIB_* to actual registers */ 234 235 /* Whether or not we have any system generated values. gl_DrawID is not 236 * included here as it lives in its own vec4. 237 */ 238 const bool has_sgvs = 239 nir->info.system_values_read & 240 (BITFIELD64_BIT(SYSTEM_VALUE_FIRST_VERTEX) | 241 BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) | 242 BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) | 243 BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID)); 244 245 const unsigned num_inputs = util_bitcount64(nir->info.inputs_read); 246 247 nir_foreach_function(function, nir) { 248 if (!function->impl) 249 continue; 250 251 nir_builder b; 252 nir_builder_init(&b, function->impl); 253 254 nir_foreach_block(block, function->impl) { 255 nir_foreach_instr_safe(instr, block) { 256 if (instr->type != nir_instr_type_intrinsic) 257 continue; 258 259 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 260 261 switch (intrin->intrinsic) { 262 case nir_intrinsic_load_first_vertex: 263 case nir_intrinsic_load_base_instance: 264 case nir_intrinsic_load_vertex_id_zero_base: 265 case nir_intrinsic_load_instance_id: 266 case nir_intrinsic_load_is_indexed_draw: 267 case nir_intrinsic_load_draw_id: { 268 b.cursor = nir_after_instr(&intrin->instr); 269 270 /* gl_VertexID and friends are stored by the VF as the last 271 * vertex element. We convert them to load_input intrinsics at 272 * the right location. 273 */ 274 nir_intrinsic_instr *load = 275 nir_intrinsic_instr_create(nir, nir_intrinsic_load_input); 276 load->src[0] = nir_src_for_ssa(nir_imm_int(&b, 0)); 277 278 nir_intrinsic_set_base(load, num_inputs); 279 switch (intrin->intrinsic) { 280 case nir_intrinsic_load_first_vertex: 281 nir_intrinsic_set_component(load, 0); 282 break; 283 case nir_intrinsic_load_base_instance: 284 nir_intrinsic_set_component(load, 1); 285 break; 286 case nir_intrinsic_load_vertex_id_zero_base: 287 nir_intrinsic_set_component(load, 2); 288 break; 289 case nir_intrinsic_load_instance_id: 290 nir_intrinsic_set_component(load, 3); 291 break; 292 case nir_intrinsic_load_draw_id: 293 case nir_intrinsic_load_is_indexed_draw: 294 /* gl_DrawID and IsIndexedDraw are stored right after 295 * gl_VertexID and friends if any of them exist. 296 */ 297 nir_intrinsic_set_base(load, num_inputs + has_sgvs); 298 if (intrin->intrinsic == nir_intrinsic_load_draw_id) 299 nir_intrinsic_set_component(load, 0); 300 else 301 nir_intrinsic_set_component(load, 1); 302 break; 303 default: 304 unreachable("Invalid system value intrinsic"); 305 } 306 307 load->num_components = 1; 308 nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); 309 nir_builder_instr_insert(&b, &load->instr); 310 311 nir_ssa_def_rewrite_uses(&intrin->dest.ssa, 312 nir_src_for_ssa(&load->dest.ssa)); 313 nir_instr_remove(&intrin->instr); 314 break; 315 } 316 317 case nir_intrinsic_load_input: { 318 /* Attributes come in a contiguous block, ordered by their 319 * gl_vert_attrib value. That means we can compute the slot 320 * number for an attribute by masking out the enabled attributes 321 * before it and counting the bits. 322 */ 323 int attr = nir_intrinsic_base(intrin); 324 int slot = util_bitcount64(nir->info.inputs_read & 325 BITFIELD64_MASK(attr)); 326 nir_intrinsic_set_base(intrin, slot); 327 break; 328 } 329 330 default: 331 break; /* Nothing to do */ 332 } 333 } 334 } 335 } 336} 337 338void 339brw_nir_lower_vue_inputs(nir_shader *nir, 340 const struct brw_vue_map *vue_map) 341{ 342 foreach_list_typed(nir_variable, var, node, &nir->inputs) { 343 var->data.driver_location = var->data.location; 344 } 345 346 /* Inputs are stored in vec4 slots, so use type_size_vec4(). */ 347 nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0); 348 349 /* This pass needs actual constants */ 350 nir_opt_constant_folding(nir); 351 352 add_const_offset_to_base(nir, nir_var_shader_in); 353 354 nir_foreach_function(function, nir) { 355 if (!function->impl) 356 continue; 357 358 nir_foreach_block(block, function->impl) { 359 nir_foreach_instr(instr, block) { 360 if (instr->type != nir_instr_type_intrinsic) 361 continue; 362 363 nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); 364 365 if (intrin->intrinsic == nir_intrinsic_load_input || 366 intrin->intrinsic == nir_intrinsic_load_per_vertex_input) { 367 /* Offset 0 is the VUE header, which contains 368 * VARYING_SLOT_LAYER [.y], VARYING_SLOT_VIEWPORT [.z], and 369 * VARYING_SLOT_PSIZ [.w]. 370 */ 371 int varying = nir_intrinsic_base(intrin); 372 int vue_slot; 373 switch (varying) { 374 case VARYING_SLOT_PSIZ: 375 nir_intrinsic_set_base(intrin, 0); 376 nir_intrinsic_set_component(intrin, 3); 377 break; 378 379 default: 380 vue_slot = vue_map->varying_to_slot[varying]; 381 assert(vue_slot != -1); 382 nir_intrinsic_set_base(intrin, vue_slot); 383 break; 384 } 385 } 386 } 387 } 388 } 389} 390 391void 392brw_nir_lower_tes_inputs(nir_shader *nir, const struct brw_vue_map *vue_map) 393{ 394 foreach_list_typed(nir_variable, var, node, &nir->inputs) { 395 var->data.driver_location = var->data.location; 396 } 397 398 nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0); 399 400 /* This pass needs actual constants */ 401 nir_opt_constant_folding(nir); 402 403 add_const_offset_to_base(nir, nir_var_shader_in); 404 405 nir_foreach_function(function, nir) { 406 if (function->impl) { 407 nir_builder b; 408 nir_builder_init(&b, function->impl); 409 nir_foreach_block(block, function->impl) { 410 remap_patch_urb_offsets(block, &b, vue_map, 411 nir->info.tess.primitive_mode); 412 } 413 } 414 } 415} 416 417void 418brw_nir_lower_fs_inputs(nir_shader *nir, 419 const struct gen_device_info *devinfo, 420 const struct brw_wm_prog_key *key) 421{ 422 foreach_list_typed(nir_variable, var, node, &nir->inputs) { 423 var->data.driver_location = var->data.location; 424 425 /* Apply default interpolation mode. 426 * 427 * Everything defaults to smooth except for the legacy GL color 428 * built-in variables, which might be flat depending on API state. 429 */ 430 if (var->data.interpolation == INTERP_MODE_NONE) { 431 const bool flat = key->flat_shade && 432 (var->data.location == VARYING_SLOT_COL0 || 433 var->data.location == VARYING_SLOT_COL1); 434 435 var->data.interpolation = flat ? INTERP_MODE_FLAT 436 : INTERP_MODE_SMOOTH; 437 } 438 439 /* On Ironlake and below, there is only one interpolation mode. 440 * Centroid interpolation doesn't mean anything on this hardware -- 441 * there is no multisampling. 442 */ 443 if (devinfo->gen < 6) { 444 var->data.centroid = false; 445 var->data.sample = false; 446 } 447 } 448 449 nir_lower_io_options lower_io_options = 0; 450 if (key->persample_interp) 451 lower_io_options |= nir_lower_io_force_sample_interpolation; 452 453 nir_lower_io(nir, nir_var_shader_in, type_size_vec4, lower_io_options); 454 455 /* This pass needs actual constants */ 456 nir_opt_constant_folding(nir); 457 458 add_const_offset_to_base(nir, nir_var_shader_in); 459} 460 461void 462brw_nir_lower_vue_outputs(nir_shader *nir) 463{ 464 nir_foreach_variable(var, &nir->outputs) { 465 var->data.driver_location = var->data.location; 466 } 467 468 nir_lower_io(nir, nir_var_shader_out, type_size_vec4, 0); 469} 470 471void 472brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue_map, 473 GLenum tes_primitive_mode) 474{ 475 nir_foreach_variable(var, &nir->outputs) { 476 var->data.driver_location = var->data.location; 477 } 478 479 nir_lower_io(nir, nir_var_shader_out, type_size_vec4, 0); 480 481 /* This pass needs actual constants */ 482 nir_opt_constant_folding(nir); 483 484 add_const_offset_to_base(nir, nir_var_shader_out); 485 486 nir_foreach_function(function, nir) { 487 if (function->impl) { 488 nir_builder b; 489 nir_builder_init(&b, function->impl); 490 nir_foreach_block(block, function->impl) { 491 remap_patch_urb_offsets(block, &b, vue_map, tes_primitive_mode); 492 } 493 } 494 } 495} 496 497void 498brw_nir_lower_fs_outputs(nir_shader *nir) 499{ 500 nir_foreach_variable(var, &nir->outputs) { 501 var->data.driver_location = 502 SET_FIELD(var->data.index, BRW_NIR_FRAG_OUTPUT_INDEX) | 503 SET_FIELD(var->data.location, BRW_NIR_FRAG_OUTPUT_LOCATION); 504 } 505 506 nir_lower_io(nir, nir_var_shader_out, type_size_dvec4, 0); 507} 508 509#define OPT(pass, ...) ({ \ 510 bool this_progress = false; \ 511 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ 512 if (this_progress) \ 513 progress = true; \ 514 this_progress; \ 515}) 516 517static nir_variable_mode 518brw_nir_no_indirect_mask(const struct brw_compiler *compiler, 519 gl_shader_stage stage) 520{ 521 nir_variable_mode indirect_mask = 0; 522 523 if (compiler->glsl_compiler_options[stage].EmitNoIndirectInput) 524 indirect_mask |= nir_var_shader_in; 525 if (compiler->glsl_compiler_options[stage].EmitNoIndirectOutput) 526 indirect_mask |= nir_var_shader_out; 527 if (compiler->glsl_compiler_options[stage].EmitNoIndirectTemp) 528 indirect_mask |= nir_var_function_temp; 529 530 return indirect_mask; 531} 532 533nir_shader * 534brw_nir_optimize(nir_shader *nir, const struct brw_compiler *compiler, 535 bool is_scalar, bool allow_copies) 536{ 537 nir_variable_mode indirect_mask = 538 brw_nir_no_indirect_mask(compiler, nir->info.stage); 539 540 bool progress; 541 do { 542 progress = false; 543 OPT(nir_split_array_vars, nir_var_function_temp); 544 OPT(nir_shrink_vec_array_vars, nir_var_function_temp); 545 OPT(nir_opt_deref); 546 OPT(nir_lower_vars_to_ssa); 547 if (allow_copies) { 548 /* Only run this pass in the first call to brw_nir_optimize. Later 549 * calls assume that we've lowered away any copy_deref instructions 550 * and we don't want to introduce any more. 551 */ 552 OPT(nir_opt_find_array_copies); 553 } 554 OPT(nir_opt_copy_prop_vars); 555 OPT(nir_opt_dead_write_vars); 556 OPT(nir_opt_combine_stores, nir_var_all); 557 558 if (is_scalar) { 559 OPT(nir_lower_alu_to_scalar); 560 } 561 562 OPT(nir_copy_prop); 563 564 if (is_scalar) { 565 OPT(nir_lower_phis_to_scalar); 566 } 567 568 OPT(nir_copy_prop); 569 OPT(nir_opt_dce); 570 OPT(nir_opt_cse); 571 OPT(nir_opt_combine_stores, nir_var_all); 572 573 /* Passing 0 to the peephole select pass causes it to convert 574 * if-statements that contain only move instructions in the branches 575 * regardless of the count. 576 * 577 * Passing 1 to the peephole select pass causes it to convert 578 * if-statements that contain at most a single ALU instruction (total) 579 * in both branches. Before Gen6, some math instructions were 580 * prohibitively expensive and the results of compare operations need an 581 * extra resolve step. For these reasons, this pass is more harmful 582 * than good on those platforms. 583 * 584 * For indirect loads of uniforms (push constants), we assume that array 585 * indices will nearly always be in bounds and the cost of the load is 586 * low. Therefore there shouldn't be a performance benefit to avoid it. 587 * However, in vec4 tessellation shaders, these loads operate by 588 * actually pulling from memory. 589 */ 590 const bool is_vec4_tessellation = !is_scalar && 591 (nir->info.stage == MESA_SHADER_TESS_CTRL || 592 nir->info.stage == MESA_SHADER_TESS_EVAL); 593 OPT(nir_opt_peephole_select, 0, !is_vec4_tessellation, false); 594 OPT(nir_opt_peephole_select, 1, !is_vec4_tessellation, 595 compiler->devinfo->gen >= 6); 596 597 OPT(nir_opt_intrinsics); 598 OPT(nir_opt_idiv_const, 32); 599 OPT(nir_opt_algebraic); 600 OPT(nir_opt_constant_folding); 601 OPT(nir_opt_dead_cf); 602 if (OPT(nir_opt_trivial_continues)) { 603 /* If nir_opt_trivial_continues makes progress, then we need to clean 604 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll 605 * to make progress. 606 */ 607 OPT(nir_copy_prop); 608 OPT(nir_opt_dce); 609 } 610 OPT(nir_opt_if, false); 611 if (nir->options->max_unroll_iterations != 0) { 612 OPT(nir_opt_loop_unroll, indirect_mask); 613 } 614 OPT(nir_opt_remove_phis); 615 OPT(nir_opt_undef); 616 OPT(nir_lower_pack); 617 } while (progress); 618 619 /* Workaround Gfxbench unused local sampler variable which will trigger an 620 * assert in the opt_large_constants pass. 621 */ 622 OPT(nir_remove_dead_variables, nir_var_function_temp); 623 624 return nir; 625} 626 627static unsigned 628lower_bit_size_callback(const nir_alu_instr *alu, UNUSED void *data) 629{ 630 assert(alu->dest.dest.is_ssa); 631 if (alu->dest.dest.ssa.bit_size >= 32) 632 return 0; 633 634 const struct brw_compiler *compiler = (const struct brw_compiler *) data; 635 636 switch (alu->op) { 637 case nir_op_idiv: 638 case nir_op_imod: 639 case nir_op_irem: 640 case nir_op_udiv: 641 case nir_op_umod: 642 case nir_op_fceil: 643 case nir_op_ffloor: 644 case nir_op_ffract: 645 case nir_op_fround_even: 646 case nir_op_ftrunc: 647 return 32; 648 case nir_op_frcp: 649 case nir_op_frsq: 650 case nir_op_fsqrt: 651 case nir_op_fpow: 652 case nir_op_fexp2: 653 case nir_op_flog2: 654 case nir_op_fsin: 655 case nir_op_fcos: 656 return compiler->devinfo->gen < 9 ? 32 : 0; 657 default: 658 return 0; 659 } 660} 661 662/* Does some simple lowering and runs the standard suite of optimizations 663 * 664 * This is intended to be called more-or-less directly after you get the 665 * shader out of GLSL or some other source. While it is geared towards i965, 666 * it is not at all generator-specific except for the is_scalar flag. Even 667 * there, it is safe to call with is_scalar = false for a shader that is 668 * intended for the FS backend as long as nir_optimize is called again with 669 * is_scalar = true to scalarize everything prior to code gen. 670 */ 671nir_shader * 672brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir, 673 const nir_shader *softfp64) 674{ 675 const struct gen_device_info *devinfo = compiler->devinfo; 676 UNUSED bool progress; /* Written by OPT */ 677 678 const bool is_scalar = compiler->scalar_stage[nir->info.stage]; 679 680 if (is_scalar) { 681 OPT(nir_lower_alu_to_scalar); 682 } 683 684 if (nir->info.stage == MESA_SHADER_GEOMETRY) 685 OPT(nir_lower_gs_intrinsics); 686 687 /* See also brw_nir_trig_workarounds.py */ 688 if (compiler->precise_trig && 689 !(devinfo->gen >= 10 || devinfo->is_kabylake)) 690 OPT(brw_nir_apply_trig_workarounds); 691 692 static const nir_lower_tex_options tex_options = { 693 .lower_txp = ~0, 694 .lower_txf_offset = true, 695 .lower_rect_offset = true, 696 .lower_tex_without_implicit_lod = true, 697 .lower_txd_cube_map = true, 698 .lower_txb_shadow_clamp = true, 699 .lower_txd_shadow_clamp = true, 700 .lower_txd_offset_clamp = true, 701 .lower_tg4_offsets = true, 702 }; 703 704 OPT(nir_lower_tex, &tex_options); 705 OPT(nir_normalize_cubemap_coords); 706 707 OPT(nir_lower_global_vars_to_local); 708 709 OPT(nir_split_var_copies); 710 OPT(nir_split_struct_vars, nir_var_function_temp); 711 712 nir = brw_nir_optimize(nir, compiler, is_scalar, true); 713 714 bool lowered_64bit_ops = false; 715 do { 716 progress = false; 717 718 OPT(nir_lower_int64, nir->options->lower_int64_options); 719 OPT(nir_lower_doubles, softfp64, nir->options->lower_doubles_options); 720 721 /* Necessary to lower add -> sub and div -> mul/rcp */ 722 OPT(nir_opt_algebraic); 723 724 lowered_64bit_ops |= progress; 725 } while (progress); 726 727 /* This needs to be run after the first optimization pass but before we 728 * lower indirect derefs away 729 */ 730 if (compiler->supports_shader_constants) { 731 OPT(nir_opt_large_constants, NULL, 32); 732 } 733 734 OPT(nir_lower_bit_size, lower_bit_size_callback, (void *)compiler); 735 736 if (is_scalar) { 737 OPT(nir_lower_load_const_to_scalar); 738 } 739 740 /* Lower a bunch of stuff */ 741 OPT(nir_lower_var_copies); 742 743 OPT(nir_lower_system_values); 744 745 const nir_lower_subgroups_options subgroups_options = { 746 .subgroup_size = BRW_SUBGROUP_SIZE, 747 .ballot_bit_size = 32, 748 .lower_to_scalar = true, 749 .lower_subgroup_masks = true, 750 .lower_vote_trivial = !is_scalar, 751 .lower_shuffle = true, 752 }; 753 OPT(nir_lower_subgroups, &subgroups_options); 754 755 OPT(nir_lower_clip_cull_distance_arrays); 756 757 nir_variable_mode indirect_mask = 758 brw_nir_no_indirect_mask(compiler, nir->info.stage); 759 OPT(nir_lower_indirect_derefs, indirect_mask); 760 761 /* Lower array derefs of vectors for SSBO and UBO loads. For both UBOs and 762 * SSBOs, our back-end is capable of loading an entire vec4 at a time and 763 * we would like to take advantage of that whenever possible regardless of 764 * whether or not the app gives us full loads. This should allow the 765 * optimizer to combine UBO and SSBO load operations and save us some send 766 * messages. 767 */ 768 OPT(nir_lower_array_deref_of_vec, 769 nir_var_mem_ubo | nir_var_mem_ssbo, 770 nir_lower_direct_array_deref_of_vec_load); 771 772 /* Get rid of split copies */ 773 nir = brw_nir_optimize(nir, compiler, is_scalar, false); 774 775 return nir; 776} 777 778void 779brw_nir_link_shaders(const struct brw_compiler *compiler, 780 nir_shader **producer, nir_shader **consumer) 781{ 782 nir_lower_io_arrays_to_elements(*producer, *consumer); 783 nir_validate_shader(*producer, "after nir_lower_io_arrays_to_elements"); 784 nir_validate_shader(*consumer, "after nir_lower_io_arrays_to_elements"); 785 786 const bool p_is_scalar = 787 compiler->scalar_stage[(*producer)->info.stage]; 788 const bool c_is_scalar = 789 compiler->scalar_stage[(*consumer)->info.stage]; 790 791 if (p_is_scalar && c_is_scalar) { 792 NIR_PASS_V(*producer, nir_lower_io_to_scalar_early, nir_var_shader_out); 793 NIR_PASS_V(*consumer, nir_lower_io_to_scalar_early, nir_var_shader_in); 794 *producer = brw_nir_optimize(*producer, compiler, p_is_scalar, false); 795 *consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar, false); 796 } 797 798 if (nir_link_opt_varyings(*producer, *consumer)) 799 *consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar, false); 800 801 NIR_PASS_V(*producer, nir_remove_dead_variables, nir_var_shader_out); 802 NIR_PASS_V(*consumer, nir_remove_dead_variables, nir_var_shader_in); 803 804 if (nir_remove_unused_varyings(*producer, *consumer)) { 805 NIR_PASS_V(*producer, nir_lower_global_vars_to_local); 806 NIR_PASS_V(*consumer, nir_lower_global_vars_to_local); 807 808 /* The backend might not be able to handle indirects on 809 * temporaries so we need to lower indirects on any of the 810 * varyings we have demoted here. 811 */ 812 NIR_PASS_V(*producer, nir_lower_indirect_derefs, 813 brw_nir_no_indirect_mask(compiler, (*producer)->info.stage)); 814 NIR_PASS_V(*consumer, nir_lower_indirect_derefs, 815 brw_nir_no_indirect_mask(compiler, (*consumer)->info.stage)); 816 817 *producer = brw_nir_optimize(*producer, compiler, p_is_scalar, false); 818 *consumer = brw_nir_optimize(*consumer, compiler, c_is_scalar, false); 819 } 820 821 NIR_PASS_V(*producer, nir_lower_io_to_vector, nir_var_shader_out); 822 NIR_PASS_V(*producer, nir_opt_combine_stores, nir_var_shader_out); 823 NIR_PASS_V(*consumer, nir_lower_io_to_vector, nir_var_shader_in); 824 825 if ((*producer)->info.stage != MESA_SHADER_TESS_CTRL) { 826 /* Calling lower_io_to_vector creates output variable writes with 827 * write-masks. On non-TCS outputs, the back-end can't handle it and we 828 * need to call nir_lower_io_to_temporaries to get rid of them. This, 829 * in turn, creates temporary variables and extra copy_deref intrinsics 830 * that we need to clean up. 831 */ 832 NIR_PASS_V(*producer, nir_lower_io_to_temporaries, 833 nir_shader_get_entrypoint(*producer), true, false); 834 NIR_PASS_V(*producer, nir_lower_global_vars_to_local); 835 NIR_PASS_V(*producer, nir_split_var_copies); 836 NIR_PASS_V(*producer, nir_lower_var_copies); 837 } 838} 839 840/* Prepare the given shader for codegen 841 * 842 * This function is intended to be called right before going into the actual 843 * backend and is highly backend-specific. Also, once this function has been 844 * called on a shader, it will no longer be in SSA form so most optimizations 845 * will not work. 846 */ 847nir_shader * 848brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler, 849 bool is_scalar) 850{ 851 const struct gen_device_info *devinfo = compiler->devinfo; 852 bool debug_enabled = 853 (INTEL_DEBUG & intel_debug_flag_for_shader_stage(nir->info.stage)); 854 855 UNUSED bool progress; /* Written by OPT */ 856 857 OPT(brw_nir_lower_mem_access_bit_sizes); 858 OPT(nir_lower_int64, nir->options->lower_int64_options); 859 860 do { 861 progress = false; 862 OPT(nir_opt_algebraic_before_ffma); 863 } while (progress); 864 865 nir = brw_nir_optimize(nir, compiler, is_scalar, false); 866 867 if (devinfo->gen >= 6) { 868 /* Try and fuse multiply-adds */ 869 OPT(brw_nir_opt_peephole_ffma); 870 } 871 872 if (OPT(nir_opt_comparison_pre)) { 873 OPT(nir_copy_prop); 874 OPT(nir_opt_dce); 875 OPT(nir_opt_cse); 876 877 /* Do the select peepehole again. nir_opt_comparison_pre (combined with 878 * the other optimization passes) will have removed at least one 879 * instruction from one of the branches of the if-statement, so now it 880 * might be under the threshold of conversion to bcsel. 881 * 882 * See brw_nir_optimize for the explanation of is_vec4_tessellation. 883 */ 884 const bool is_vec4_tessellation = !is_scalar && 885 (nir->info.stage == MESA_SHADER_TESS_CTRL || 886 nir->info.stage == MESA_SHADER_TESS_EVAL); 887 OPT(nir_opt_peephole_select, 0, is_vec4_tessellation, false); 888 OPT(nir_opt_peephole_select, 1, is_vec4_tessellation, 889 compiler->devinfo->gen >= 6); 890 } 891 892 OPT(nir_opt_algebraic_late); 893 894 OPT(brw_nir_lower_conversions); 895 896 OPT(nir_lower_to_source_mods, nir_lower_all_source_mods); 897 OPT(nir_copy_prop); 898 OPT(nir_opt_dce); 899 OPT(nir_opt_move_comparisons); 900 901 OPT(nir_lower_bool_to_int32); 902 903 OPT(nir_lower_locals_to_regs); 904 905 if (unlikely(debug_enabled)) { 906 /* Re-index SSA defs so we print more sensible numbers. */ 907 nir_foreach_function(function, nir) { 908 if (function->impl) 909 nir_index_ssa_defs(function->impl); 910 } 911 912 fprintf(stderr, "NIR (SSA form) for %s shader:\n", 913 _mesa_shader_stage_to_string(nir->info.stage)); 914 nir_print_shader(nir, stderr); 915 } 916 917 OPT(nir_convert_from_ssa, true); 918 919 if (!is_scalar) { 920 OPT(nir_move_vec_src_uses_to_dest); 921 OPT(nir_lower_vec_to_movs); 922 } 923 924 OPT(nir_opt_dce); 925 926 /* This is the last pass we run before we start emitting stuff. It 927 * determines when we need to insert boolean resolves on Gen <= 5. We 928 * run it last because it stashes data in instr->pass_flags and we don't 929 * want that to be squashed by other NIR passes. 930 */ 931 if (devinfo->gen <= 5) 932 brw_nir_analyze_boolean_resolves(nir); 933 934 nir_sweep(nir); 935 936 if (unlikely(debug_enabled)) { 937 fprintf(stderr, "NIR (final form) for %s shader:\n", 938 _mesa_shader_stage_to_string(nir->info.stage)); 939 nir_print_shader(nir, stderr); 940 } 941 942 return nir; 943} 944 945nir_shader * 946brw_nir_apply_sampler_key(nir_shader *nir, 947 const struct brw_compiler *compiler, 948 const struct brw_sampler_prog_key_data *key_tex, 949 bool is_scalar) 950{ 951 const struct gen_device_info *devinfo = compiler->devinfo; 952 nir_lower_tex_options tex_options = { 953 .lower_txd_clamp_bindless_sampler = true, 954 .lower_txd_clamp_if_sampler_index_not_lt_16 = true, 955 }; 956 957 /* Iron Lake and prior require lowering of all rectangle textures */ 958 if (devinfo->gen < 6) 959 tex_options.lower_rect = true; 960 961 /* Prior to Broadwell, our hardware can't actually do GL_CLAMP */ 962 if (devinfo->gen < 8) { 963 tex_options.saturate_s = key_tex->gl_clamp_mask[0]; 964 tex_options.saturate_t = key_tex->gl_clamp_mask[1]; 965 tex_options.saturate_r = key_tex->gl_clamp_mask[2]; 966 } 967 968 /* Prior to Haswell, we have to fake texture swizzle */ 969 for (unsigned s = 0; s < MAX_SAMPLERS; s++) { 970 if (key_tex->swizzles[s] == SWIZZLE_NOOP) 971 continue; 972 973 tex_options.swizzle_result |= (1 << s); 974 for (unsigned c = 0; c < 4; c++) 975 tex_options.swizzles[s][c] = GET_SWZ(key_tex->swizzles[s], c); 976 } 977 978 /* Prior to Haswell, we have to lower gradients on shadow samplers */ 979 tex_options.lower_txd_shadow = devinfo->gen < 8 && !devinfo->is_haswell; 980 981 tex_options.lower_y_uv_external = key_tex->y_uv_image_mask; 982 tex_options.lower_y_u_v_external = key_tex->y_u_v_image_mask; 983 tex_options.lower_yx_xuxv_external = key_tex->yx_xuxv_image_mask; 984 tex_options.lower_xy_uxvx_external = key_tex->xy_uxvx_image_mask; 985 tex_options.lower_ayuv_external = key_tex->ayuv_image_mask; 986 tex_options.lower_xyuv_external = key_tex->xyuv_image_mask; 987 988 /* Setup array of scaling factors for each texture. */ 989 memcpy(&tex_options.scale_factors, &key_tex->scale_factors, 990 sizeof(tex_options.scale_factors)); 991 992 if (nir_lower_tex(nir, &tex_options)) { 993 nir_validate_shader(nir, "after nir_lower_tex"); 994 nir = brw_nir_optimize(nir, compiler, is_scalar, false); 995 } 996 997 return nir; 998} 999 1000enum brw_reg_type 1001brw_type_for_nir_type(const struct gen_device_info *devinfo, nir_alu_type type) 1002{ 1003 switch (type) { 1004 case nir_type_uint: 1005 case nir_type_uint32: 1006 return BRW_REGISTER_TYPE_UD; 1007 case nir_type_bool: 1008 case nir_type_int: 1009 case nir_type_bool32: 1010 case nir_type_int32: 1011 return BRW_REGISTER_TYPE_D; 1012 case nir_type_float: 1013 case nir_type_float32: 1014 return BRW_REGISTER_TYPE_F; 1015 case nir_type_float16: 1016 return BRW_REGISTER_TYPE_HF; 1017 case nir_type_float64: 1018 return BRW_REGISTER_TYPE_DF; 1019 case nir_type_int64: 1020 return devinfo->gen < 8 ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_Q; 1021 case nir_type_uint64: 1022 return devinfo->gen < 8 ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_UQ; 1023 case nir_type_int16: 1024 return BRW_REGISTER_TYPE_W; 1025 case nir_type_uint16: 1026 return BRW_REGISTER_TYPE_UW; 1027 case nir_type_int8: 1028 return BRW_REGISTER_TYPE_B; 1029 case nir_type_uint8: 1030 return BRW_REGISTER_TYPE_UB; 1031 default: 1032 unreachable("unknown type"); 1033 } 1034 1035 return BRW_REGISTER_TYPE_F; 1036} 1037 1038/* Returns the glsl_base_type corresponding to a nir_alu_type. 1039 * This is used by both brw_vec4_nir and brw_fs_nir. 1040 */ 1041enum glsl_base_type 1042brw_glsl_base_type_for_nir_type(nir_alu_type type) 1043{ 1044 switch (type) { 1045 case nir_type_float: 1046 case nir_type_float32: 1047 return GLSL_TYPE_FLOAT; 1048 1049 case nir_type_float16: 1050 return GLSL_TYPE_FLOAT16; 1051 1052 case nir_type_float64: 1053 return GLSL_TYPE_DOUBLE; 1054 1055 case nir_type_int: 1056 case nir_type_int32: 1057 return GLSL_TYPE_INT; 1058 1059 case nir_type_uint: 1060 case nir_type_uint32: 1061 return GLSL_TYPE_UINT; 1062 1063 case nir_type_int16: 1064 return GLSL_TYPE_INT16; 1065 1066 case nir_type_uint16: 1067 return GLSL_TYPE_UINT16; 1068 1069 default: 1070 unreachable("bad type"); 1071 } 1072} 1073 1074nir_shader * 1075brw_nir_create_passthrough_tcs(void *mem_ctx, const struct brw_compiler *compiler, 1076 const nir_shader_compiler_options *options, 1077 const struct brw_tcs_prog_key *key) 1078{ 1079 nir_builder b; 1080 nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_TESS_CTRL, 1081 options); 1082 nir_shader *nir = b.shader; 1083 nir_variable *var; 1084 nir_intrinsic_instr *load; 1085 nir_intrinsic_instr *store; 1086 nir_ssa_def *zero = nir_imm_int(&b, 0); 1087 nir_ssa_def *invoc_id = nir_load_invocation_id(&b); 1088 1089 nir->info.inputs_read = key->outputs_written & 1090 ~(VARYING_BIT_TESS_LEVEL_INNER | VARYING_BIT_TESS_LEVEL_OUTER); 1091 nir->info.outputs_written = key->outputs_written; 1092 nir->info.tess.tcs_vertices_out = key->input_vertices; 1093 nir->info.name = ralloc_strdup(nir, "passthrough"); 1094 nir->num_uniforms = 8 * sizeof(uint32_t); 1095 1096 var = nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "hdr_0"); 1097 var->data.location = 0; 1098 var = nir_variable_create(nir, nir_var_uniform, glsl_vec4_type(), "hdr_1"); 1099 var->data.location = 1; 1100 1101 /* Write the patch URB header. */ 1102 for (int i = 0; i <= 1; i++) { 1103 load = nir_intrinsic_instr_create(nir, nir_intrinsic_load_uniform); 1104 load->num_components = 4; 1105 load->src[0] = nir_src_for_ssa(zero); 1106 nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL); 1107 nir_intrinsic_set_base(load, i * 4 * sizeof(uint32_t)); 1108 nir_builder_instr_insert(&b, &load->instr); 1109 1110 store = nir_intrinsic_instr_create(nir, nir_intrinsic_store_output); 1111 store->num_components = 4; 1112 store->src[0] = nir_src_for_ssa(&load->dest.ssa); 1113 store->src[1] = nir_src_for_ssa(zero); 1114 nir_intrinsic_set_base(store, VARYING_SLOT_TESS_LEVEL_INNER - i); 1115 nir_intrinsic_set_write_mask(store, WRITEMASK_XYZW); 1116 nir_builder_instr_insert(&b, &store->instr); 1117 } 1118 1119 /* Copy inputs to outputs. */ 1120 uint64_t varyings = nir->info.inputs_read; 1121 1122 while (varyings != 0) { 1123 const int varying = ffsll(varyings) - 1; 1124 1125 load = nir_intrinsic_instr_create(nir, 1126 nir_intrinsic_load_per_vertex_input); 1127 load->num_components = 4; 1128 load->src[0] = nir_src_for_ssa(invoc_id); 1129 load->src[1] = nir_src_for_ssa(zero); 1130 nir_ssa_dest_init(&load->instr, &load->dest, 4, 32, NULL); 1131 nir_intrinsic_set_base(load, varying); 1132 nir_builder_instr_insert(&b, &load->instr); 1133 1134 store = nir_intrinsic_instr_create(nir, 1135 nir_intrinsic_store_per_vertex_output); 1136 store->num_components = 4; 1137 store->src[0] = nir_src_for_ssa(&load->dest.ssa); 1138 store->src[1] = nir_src_for_ssa(invoc_id); 1139 store->src[2] = nir_src_for_ssa(zero); 1140 nir_intrinsic_set_base(store, varying); 1141 nir_intrinsic_set_write_mask(store, WRITEMASK_XYZW); 1142 nir_builder_instr_insert(&b, &store->instr); 1143 1144 varyings &= ~BITFIELD64_BIT(varying); 1145 } 1146 1147 nir_validate_shader(nir, "in brw_nir_create_passthrough_tcs"); 1148 1149 nir = brw_preprocess_nir(compiler, nir, NULL); 1150 1151 return nir; 1152} 1153