1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24/** @file brw_fs_visitor.cpp 25 * 26 * This file supports generating the FS LIR from the GLSL IR. The LIR 27 * makes it easier to do backend-specific optimizations than doing so 28 * in the GLSL IR or in the native code. 29 */ 30#include "brw_fs.h" 31#include "compiler/glsl_types.h" 32 33using namespace brw; 34 35/* Sample from the MCS surface attached to this multisample texture. */ 36fs_reg 37fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components, 38 const fs_reg &texture, 39 const fs_reg &texture_handle) 40{ 41 const fs_reg dest = vgrf(glsl_type::uvec4_type); 42 43 fs_reg srcs[TEX_LOGICAL_NUM_SRCS]; 44 srcs[TEX_LOGICAL_SRC_COORDINATE] = coordinate; 45 srcs[TEX_LOGICAL_SRC_SURFACE] = texture; 46 srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0); 47 srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] = texture_handle; 48 srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components); 49 srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0); 50 51 fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs, 52 ARRAY_SIZE(srcs)); 53 54 /* We only care about one or two regs of response, but the sampler always 55 * writes 4/8. 56 */ 57 inst->size_written = 4 * dest.component_size(inst->exec_size); 58 59 return dest; 60} 61 62/** 63 * Apply workarounds for Gen6 gather with UINT/SINT 64 */ 65void 66fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst) 67{ 68 if (!wa) 69 return; 70 71 int width = (wa & WA_8BIT) ? 8 : 16; 72 73 for (int i = 0; i < 4; i++) { 74 fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F); 75 /* Convert from UNORM to UINT */ 76 bld.MUL(dst_f, dst_f, brw_imm_f((1 << width) - 1)); 77 bld.MOV(dst, dst_f); 78 79 if (wa & WA_SIGN) { 80 /* Reinterpret the UINT value as a signed INT value by 81 * shifting the sign bit into place, then shifting back 82 * preserving sign. 83 */ 84 bld.SHL(dst, dst, brw_imm_d(32 - width)); 85 bld.ASR(dst, dst, brw_imm_d(32 - width)); 86 } 87 88 dst = offset(dst, bld, 1); 89 } 90} 91 92/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */ 93void 94fs_visitor::emit_dummy_fs() 95{ 96 int reg_width = dispatch_width / 8; 97 98 /* Everyone's favorite color. */ 99 const float color[4] = { 1.0, 0.0, 1.0, 0.0 }; 100 for (int i = 0; i < 4; i++) { 101 bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F), 102 brw_imm_f(color[i])); 103 } 104 105 fs_inst *write; 106 write = bld.emit(FS_OPCODE_FB_WRITE); 107 write->eot = true; 108 write->last_rt = true; 109 if (devinfo->gen >= 6) { 110 write->base_mrf = 2; 111 write->mlen = 4 * reg_width; 112 } else { 113 write->header_size = 2; 114 write->base_mrf = 0; 115 write->mlen = 2 + 4 * reg_width; 116 } 117 118 /* Tell the SF we don't have any inputs. Gen4-5 require at least one 119 * varying to avoid GPU hangs, so set that. 120 */ 121 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data); 122 wm_prog_data->num_varying_inputs = devinfo->gen < 6 ? 1 : 0; 123 memset(wm_prog_data->urb_setup, -1, 124 sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX); 125 126 /* We don't have any uniforms. */ 127 stage_prog_data->nr_params = 0; 128 stage_prog_data->nr_pull_params = 0; 129 stage_prog_data->curb_read_length = 0; 130 stage_prog_data->dispatch_grf_start_reg = 2; 131 wm_prog_data->dispatch_grf_start_reg_16 = 2; 132 wm_prog_data->dispatch_grf_start_reg_32 = 2; 133 grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */ 134 135 calculate_cfg(); 136} 137 138/* The register location here is relative to the start of the URB 139 * data. It will get adjusted to be a real location before 140 * generate_code() time. 141 */ 142fs_reg 143fs_visitor::interp_reg(int location, int channel) 144{ 145 assert(stage == MESA_SHADER_FRAGMENT); 146 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 147 int regnr = prog_data->urb_setup[location] * 4 + channel; 148 assert(prog_data->urb_setup[location] != -1); 149 150 return fs_reg(ATTR, regnr, BRW_REGISTER_TYPE_F); 151} 152 153/** Emits the interpolation for the varying inputs. */ 154void 155fs_visitor::emit_interpolation_setup_gen4() 156{ 157 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW); 158 159 fs_builder abld = bld.annotate("compute pixel centers"); 160 this->pixel_x = vgrf(glsl_type::uint_type); 161 this->pixel_y = vgrf(glsl_type::uint_type); 162 this->pixel_x.type = BRW_REGISTER_TYPE_UW; 163 this->pixel_y.type = BRW_REGISTER_TYPE_UW; 164 abld.ADD(this->pixel_x, 165 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)), 166 fs_reg(brw_imm_v(0x10101010))); 167 abld.ADD(this->pixel_y, 168 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)), 169 fs_reg(brw_imm_v(0x11001100))); 170 171 abld = bld.annotate("compute pixel deltas from v0"); 172 173 this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] = 174 vgrf(glsl_type::vec2_type); 175 const fs_reg &delta_xy = this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL]; 176 const fs_reg xstart(negate(brw_vec1_grf(1, 0))); 177 const fs_reg ystart(negate(brw_vec1_grf(1, 1))); 178 179 if (devinfo->has_pln && dispatch_width == 16) { 180 for (unsigned i = 0; i < 2; i++) { 181 abld.half(i).ADD(half(offset(delta_xy, abld, i), 0), 182 half(this->pixel_x, i), xstart); 183 abld.half(i).ADD(half(offset(delta_xy, abld, i), 1), 184 half(this->pixel_y, i), ystart); 185 } 186 } else { 187 abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart); 188 abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart); 189 } 190 191 abld = bld.annotate("compute pos.w and 1/pos.w"); 192 /* Compute wpos.w. It's always in our setup, since it's needed to 193 * interpolate the other attributes. 194 */ 195 this->wpos_w = vgrf(glsl_type::float_type); 196 abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, 197 component(interp_reg(VARYING_SLOT_POS, 3), 0)); 198 /* Compute the pixel 1/W value from wpos.w. */ 199 this->pixel_w = vgrf(glsl_type::float_type); 200 abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w); 201} 202 203/** Emits the interpolation for the varying inputs. */ 204void 205fs_visitor::emit_interpolation_setup_gen6() 206{ 207 fs_builder abld = bld.annotate("compute pixel centers"); 208 209 this->pixel_x = vgrf(glsl_type::float_type); 210 this->pixel_y = vgrf(glsl_type::float_type); 211 212 for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) { 213 const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i); 214 struct brw_reg gi_uw = retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UW); 215 216 if (devinfo->gen >= 8 || dispatch_width == 8) { 217 /* The "Register Region Restrictions" page says for BDW (and newer, 218 * presumably): 219 * 220 * "When destination spans two registers, the source may be one or 221 * two registers. The destination elements must be evenly split 222 * between the two registers." 223 * 224 * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 225 * to compute our pixel centers. 226 */ 227 const fs_builder dbld = 228 abld.exec_all().group(hbld.dispatch_width() * 2, 0); 229 fs_reg int_pixel_xy = dbld.vgrf(BRW_REGISTER_TYPE_UW); 230 231 dbld.ADD(int_pixel_xy, 232 fs_reg(stride(suboffset(gi_uw, 4), 1, 4, 0)), 233 fs_reg(brw_imm_v(0x11001010))); 234 235 hbld.emit(FS_OPCODE_PIXEL_X, offset(pixel_x, hbld, i), int_pixel_xy); 236 hbld.emit(FS_OPCODE_PIXEL_Y, offset(pixel_y, hbld, i), int_pixel_xy); 237 } else { 238 /* The "Register Region Restrictions" page says for SNB, IVB, HSW: 239 * 240 * "When destination spans two registers, the source MUST span 241 * two registers." 242 * 243 * Since the GRF source of the ADD will only read a single register, 244 * we must do two separate ADDs in SIMD16. 245 */ 246 const fs_reg int_pixel_x = hbld.vgrf(BRW_REGISTER_TYPE_UW); 247 const fs_reg int_pixel_y = hbld.vgrf(BRW_REGISTER_TYPE_UW); 248 249 hbld.ADD(int_pixel_x, 250 fs_reg(stride(suboffset(gi_uw, 4), 2, 4, 0)), 251 fs_reg(brw_imm_v(0x10101010))); 252 hbld.ADD(int_pixel_y, 253 fs_reg(stride(suboffset(gi_uw, 5), 2, 4, 0)), 254 fs_reg(brw_imm_v(0x11001100))); 255 256 /* As of gen6, we can no longer mix float and int sources. We have 257 * to turn the integer pixel centers into floats for their actual 258 * use. 259 */ 260 hbld.MOV(offset(pixel_x, hbld, i), int_pixel_x); 261 hbld.MOV(offset(pixel_y, hbld, i), int_pixel_y); 262 } 263 } 264 265 abld = bld.annotate("compute pos.w"); 266 this->pixel_w = fetch_payload_reg(abld, payload.source_w_reg); 267 this->wpos_w = vgrf(glsl_type::float_type); 268 abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w); 269 270 struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data); 271 272 for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { 273 this->delta_xy[i] = fetch_payload_reg( 274 bld, payload.barycentric_coord_reg[i], BRW_REGISTER_TYPE_F, 2); 275 } 276 277 uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes & 278 (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID | 279 1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID); 280 281 if (devinfo->needs_unlit_centroid_workaround && centroid_modes) { 282 /* Get the pixel/sample mask into f0 so that we know which 283 * pixels are lit. Then, for each channel that is unlit, 284 * replace the centroid data with non-centroid data. 285 */ 286 for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) { 287 bld.exec_all().group(1, 0) 288 .MOV(retype(brw_flag_reg(0, i), BRW_REGISTER_TYPE_UW), 289 retype(brw_vec1_grf(1 + i, 7), BRW_REGISTER_TYPE_UW)); 290 } 291 292 for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) { 293 if (!(centroid_modes & (1 << i))) 294 continue; 295 296 const fs_reg &pixel_delta_xy = delta_xy[i - 1]; 297 298 for (unsigned q = 0; q < dispatch_width / 8; q++) { 299 for (unsigned c = 0; c < 2; c++) { 300 const unsigned idx = c + (q & 2) + (q & 1) * dispatch_width / 8; 301 set_predicate_inv( 302 BRW_PREDICATE_NORMAL, true, 303 bld.half(q).MOV(horiz_offset(delta_xy[i], idx * 8), 304 horiz_offset(pixel_delta_xy, idx * 8))); 305 } 306 } 307 } 308 } 309} 310 311static enum brw_conditional_mod 312cond_for_alpha_func(GLenum func) 313{ 314 switch(func) { 315 case GL_GREATER: 316 return BRW_CONDITIONAL_G; 317 case GL_GEQUAL: 318 return BRW_CONDITIONAL_GE; 319 case GL_LESS: 320 return BRW_CONDITIONAL_L; 321 case GL_LEQUAL: 322 return BRW_CONDITIONAL_LE; 323 case GL_EQUAL: 324 return BRW_CONDITIONAL_EQ; 325 case GL_NOTEQUAL: 326 return BRW_CONDITIONAL_NEQ; 327 default: 328 unreachable("Not reached"); 329 } 330} 331 332/** 333 * Alpha test support for when we compile it into the shader instead 334 * of using the normal fixed-function alpha test. 335 */ 336void 337fs_visitor::emit_alpha_test() 338{ 339 assert(stage == MESA_SHADER_FRAGMENT); 340 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; 341 const fs_builder abld = bld.annotate("Alpha test"); 342 343 fs_inst *cmp; 344 if (key->alpha_test_func == GL_ALWAYS) 345 return; 346 347 if (key->alpha_test_func == GL_NEVER) { 348 /* f0.1 = 0 */ 349 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0), 350 BRW_REGISTER_TYPE_UW)); 351 cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg, 352 BRW_CONDITIONAL_NEQ); 353 } else { 354 /* RT0 alpha */ 355 fs_reg color = offset(outputs[0], bld, 3); 356 357 /* f0.1 &= func(color, ref) */ 358 cmp = abld.CMP(bld.null_reg_f(), color, brw_imm_f(key->alpha_test_ref), 359 cond_for_alpha_func(key->alpha_test_func)); 360 } 361 cmp->predicate = BRW_PREDICATE_NORMAL; 362 cmp->flag_subreg = 1; 363} 364 365fs_inst * 366fs_visitor::emit_single_fb_write(const fs_builder &bld, 367 fs_reg color0, fs_reg color1, 368 fs_reg src0_alpha, unsigned components) 369{ 370 assert(stage == MESA_SHADER_FRAGMENT); 371 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 372 373 /* Hand over gl_FragDepth or the payload depth. */ 374 const fs_reg dst_depth = fetch_payload_reg(bld, payload.dest_depth_reg); 375 fs_reg src_depth, src_stencil; 376 377 if (source_depth_to_render_target) { 378 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) 379 src_depth = frag_depth; 380 else 381 src_depth = fetch_payload_reg(bld, payload.source_depth_reg); 382 } 383 384 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) 385 src_stencil = frag_stencil; 386 387 const fs_reg sources[] = { 388 color0, color1, src0_alpha, src_depth, dst_depth, src_stencil, 389 (prog_data->uses_omask ? sample_mask : fs_reg()), 390 brw_imm_ud(components) 391 }; 392 assert(ARRAY_SIZE(sources) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS); 393 fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(), 394 sources, ARRAY_SIZE(sources)); 395 396 if (prog_data->uses_kill) { 397 write->predicate = BRW_PREDICATE_NORMAL; 398 write->flag_subreg = 1; 399 } 400 401 return write; 402} 403 404void 405fs_visitor::emit_alpha_to_coverage_workaround(const fs_reg &src0_alpha) 406{ 407 /* We need to compute alpha to coverage dithering manually in shader 408 * and replace sample mask store with the bitwise-AND of sample mask and 409 * alpha to coverage dithering. 410 * 411 * The following formula is used to compute final sample mask: 412 * m = int(16.0 * clamp(src0_alpha, 0.0, 1.0)) 413 * dither_mask = 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) | 414 * 0x0808 * (m & 2) | 0x0100 * (m & 1) 415 * sample_mask = sample_mask & dither_mask 416 * 417 * It gives a number of ones proportional to the alpha for 2, 4, 8 or 16 418 * least significant bits of the result: 419 * 0.0000 0000000000000000 420 * 0.0625 0000000100000000 421 * 0.1250 0001000000010000 422 * 0.1875 0001000100010000 423 * 0.2500 1000100010001000 424 * 0.3125 1000100110001000 425 * 0.3750 1001100010011000 426 * 0.4375 1001100110011000 427 * 0.5000 1010101010101010 428 * 0.5625 1010101110101010 429 * 0.6250 1011101010111010 430 * 0.6875 1011101110111010 431 * 0.7500 1110111011101110 432 * 0.8125 1110111111101110 433 * 0.8750 1111111011111110 434 * 0.9375 1111111111111110 435 * 1.0000 1111111111111111 436 */ 437 const fs_builder abld = bld.annotate("compute alpha_to_coverage & " 438 "sample_mask"); 439 440 /* clamp(src0_alpha, 0.f, 1.f) */ 441 const fs_reg float_tmp = abld.vgrf(BRW_REGISTER_TYPE_F); 442 set_saturate(true, abld.MOV(float_tmp, src0_alpha)); 443 444 /* 16.0 * clamp(src0_alpha, 0.0, 1.0) */ 445 abld.MUL(float_tmp, float_tmp, brw_imm_f(16.0)); 446 447 /* m = int(16.0 * clamp(src0_alpha, 0.0, 1.0)) */ 448 const fs_reg m = abld.vgrf(BRW_REGISTER_TYPE_UW); 449 abld.MOV(m, float_tmp); 450 451 /* 0x1111 * ((0xfea80 >> (m & ~3)) & 0xf) */ 452 const fs_reg int_tmp_1 = abld.vgrf(BRW_REGISTER_TYPE_UW); 453 const fs_reg shift_const = abld.vgrf(BRW_REGISTER_TYPE_UD); 454 abld.MOV(shift_const, brw_imm_d(0xfea80)); 455 abld.AND(int_tmp_1, m, brw_imm_uw(~3)); 456 abld.SHR(int_tmp_1, shift_const, int_tmp_1); 457 abld.AND(int_tmp_1, int_tmp_1, brw_imm_uw(0xf)); 458 abld.MUL(int_tmp_1, int_tmp_1, brw_imm_uw(0x1111)); 459 460 /* 0x0808 * (m & 2) */ 461 const fs_reg int_tmp_2 = abld.vgrf(BRW_REGISTER_TYPE_UW); 462 abld.AND(int_tmp_2, m, brw_imm_uw(2)); 463 abld.MUL(int_tmp_2, int_tmp_2, brw_imm_uw(0x0808)); 464 465 abld.OR(int_tmp_1, int_tmp_1, int_tmp_2); 466 467 /* 0x0100 * (m & 1) */ 468 const fs_reg int_tmp_3 = abld.vgrf(BRW_REGISTER_TYPE_UW); 469 abld.AND(int_tmp_3, m, brw_imm_uw(1)); 470 abld.MUL(int_tmp_3, int_tmp_3, brw_imm_uw(0x0100)); 471 472 abld.OR(int_tmp_1, int_tmp_1, int_tmp_3); 473 474 /* sample_mask = sample_mask & dither_mask */ 475 const fs_reg mask = abld.vgrf(BRW_REGISTER_TYPE_UD); 476 abld.AND(mask, sample_mask, int_tmp_1); 477 sample_mask = mask; 478} 479 480void 481fs_visitor::emit_fb_writes() 482{ 483 assert(stage == MESA_SHADER_FRAGMENT); 484 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data); 485 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; 486 487 fs_inst *inst = NULL; 488 489 if (source_depth_to_render_target && devinfo->gen == 6) { 490 /* For outputting oDepth on gen6, SIMD8 writes have to be used. This 491 * would require SIMD8 moves of each half to message regs, e.g. by using 492 * the SIMD lowering pass. Unfortunately this is more difficult than it 493 * sounds because the SIMD8 single-source message lacks channel selects 494 * for the second and third subspans. 495 */ 496 limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n"); 497 } 498 499 if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) { 500 /* From the 'Render Target Write message' section of the docs: 501 * "Output Stencil is not supported with SIMD16 Render Target Write 502 * Messages." 503 */ 504 limit_dispatch_width(8, "gl_FragStencilRefARB unsupported " 505 "in SIMD16+ mode.\n"); 506 } 507 508 /* ANV doesn't know about sample mask output during the wm key creation 509 * so we compute if we need replicate alpha and emit alpha to coverage 510 * workaround here. 511 */ 512 prog_data->replicate_alpha = key->alpha_test_replicate_alpha || 513 (key->nr_color_regions > 1 && key->alpha_to_coverage && 514 (sample_mask.file == BAD_FILE || devinfo->gen == 6)); 515 516 /* From the SKL PRM, Volume 7, "Alpha Coverage": 517 * "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in 518 * hardware, regardless of the state setting for this feature." 519 */ 520 if (devinfo->gen > 6 && key->alpha_to_coverage && 521 sample_mask.file != BAD_FILE && this->outputs[0].file != BAD_FILE) 522 emit_alpha_to_coverage_workaround(offset(this->outputs[0], bld, 3)); 523 524 for (int target = 0; target < key->nr_color_regions; target++) { 525 /* Skip over outputs that weren't written. */ 526 if (this->outputs[target].file == BAD_FILE) 527 continue; 528 529 const fs_builder abld = bld.annotate( 530 ralloc_asprintf(this->mem_ctx, "FB write target %d", target)); 531 532 fs_reg src0_alpha; 533 if (devinfo->gen >= 6 && prog_data->replicate_alpha && target != 0) 534 src0_alpha = offset(outputs[0], bld, 3); 535 536 inst = emit_single_fb_write(abld, this->outputs[target], 537 this->dual_src_output, src0_alpha, 4); 538 inst->target = target; 539 } 540 541 prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE && 542 this->outputs[0].file != BAD_FILE); 543 assert(!prog_data->dual_src_blend || key->nr_color_regions == 1); 544 545 if (inst == NULL) { 546 /* Even if there's no color buffers enabled, we still need to send 547 * alpha out the pipeline to our null renderbuffer to support 548 * alpha-testing, alpha-to-coverage, and so on. 549 */ 550 /* FINISHME: Factor out this frequently recurring pattern into a 551 * helper function. 552 */ 553 const fs_reg srcs[] = { reg_undef, reg_undef, 554 reg_undef, offset(this->outputs[0], bld, 3) }; 555 const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4); 556 bld.LOAD_PAYLOAD(tmp, srcs, 4, 0); 557 558 inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4); 559 inst->target = 0; 560 } 561 562 inst->last_rt = true; 563 inst->eot = true; 564} 565 566void 567fs_visitor::setup_uniform_clipplane_values() 568{ 569 const struct brw_vs_prog_key *key = 570 (const struct brw_vs_prog_key *) this->key; 571 572 if (key->nr_userclip_plane_consts == 0) 573 return; 574 575 assert(stage_prog_data->nr_params == uniforms); 576 brw_stage_prog_data_add_params(stage_prog_data, 577 key->nr_userclip_plane_consts * 4); 578 579 for (int i = 0; i < key->nr_userclip_plane_consts; i++) { 580 this->userplane[i] = fs_reg(UNIFORM, uniforms); 581 for (int j = 0; j < 4; ++j) { 582 stage_prog_data->param[uniforms + j] = 583 BRW_PARAM_BUILTIN_CLIP_PLANE(i, j); 584 } 585 uniforms += 4; 586 } 587} 588 589/** 590 * Lower legacy fixed-function and gl_ClipVertex clipping to clip distances. 591 * 592 * This does nothing if the shader uses gl_ClipDistance or user clipping is 593 * disabled altogether. 594 */ 595void fs_visitor::compute_clip_distance() 596{ 597 struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data); 598 const struct brw_vs_prog_key *key = 599 (const struct brw_vs_prog_key *) this->key; 600 601 /* Bail unless some sort of legacy clipping is enabled */ 602 if (key->nr_userclip_plane_consts == 0) 603 return; 604 605 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables): 606 * 607 * "If a linked set of shaders forming the vertex stage contains no 608 * static write to gl_ClipVertex or gl_ClipDistance, but the 609 * application has requested clipping against user clip planes through 610 * the API, then the coordinate written to gl_Position is used for 611 * comparison against the user clip planes." 612 * 613 * This function is only called if the shader didn't write to 614 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping 615 * if the user wrote to it; otherwise we use gl_Position. 616 */ 617 618 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX; 619 if (!(vue_prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) 620 clip_vertex = VARYING_SLOT_POS; 621 622 /* If the clip vertex isn't written, skip this. Typically this means 623 * the GS will set up clipping. */ 624 if (outputs[clip_vertex].file == BAD_FILE) 625 return; 626 627 setup_uniform_clipplane_values(); 628 629 const fs_builder abld = bld.annotate("user clip distances"); 630 631 this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type); 632 this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type); 633 634 for (int i = 0; i < key->nr_userclip_plane_consts; i++) { 635 fs_reg u = userplane[i]; 636 const fs_reg output = offset(outputs[VARYING_SLOT_CLIP_DIST0 + i / 4], 637 bld, i & 3); 638 639 abld.MUL(output, outputs[clip_vertex], u); 640 for (int j = 1; j < 4; j++) { 641 u.nr = userplane[i].nr + j; 642 abld.MAD(output, output, offset(outputs[clip_vertex], bld, j), u); 643 } 644 } 645} 646 647void 648fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count) 649{ 650 int slot, urb_offset, length; 651 int starting_urb_offset = 0; 652 const struct brw_vue_prog_data *vue_prog_data = 653 brw_vue_prog_data(this->prog_data); 654 const struct brw_vs_prog_key *vs_key = 655 (const struct brw_vs_prog_key *) this->key; 656 const GLbitfield64 psiz_mask = 657 VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ; 658 const struct brw_vue_map *vue_map = &vue_prog_data->vue_map; 659 bool flush; 660 fs_reg sources[8]; 661 fs_reg urb_handle; 662 663 if (stage == MESA_SHADER_TESS_EVAL) 664 urb_handle = fs_reg(retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_UD)); 665 else 666 urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); 667 668 opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8; 669 int header_size = 1; 670 fs_reg per_slot_offsets; 671 672 if (stage == MESA_SHADER_GEOMETRY) { 673 const struct brw_gs_prog_data *gs_prog_data = 674 brw_gs_prog_data(this->prog_data); 675 676 /* We need to increment the Global Offset to skip over the control data 677 * header and the extra "Vertex Count" field (1 HWord) at the beginning 678 * of the VUE. We're counting in OWords, so the units are doubled. 679 */ 680 starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords; 681 if (gs_prog_data->static_vertex_count == -1) 682 starting_urb_offset += 2; 683 684 /* We also need to use per-slot offsets. The per-slot offset is the 685 * Vertex Count. SIMD8 mode processes 8 different primitives at a 686 * time; each may output a different number of vertices. 687 */ 688 opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT; 689 header_size++; 690 691 /* The URB offset is in 128-bit units, so we need to multiply by 2 */ 692 const int output_vertex_size_owords = 693 gs_prog_data->output_vertex_size_hwords * 2; 694 695 if (gs_vertex_count.file == IMM) { 696 per_slot_offsets = brw_imm_ud(output_vertex_size_owords * 697 gs_vertex_count.ud); 698 } else { 699 per_slot_offsets = vgrf(glsl_type::uint_type); 700 bld.MUL(per_slot_offsets, gs_vertex_count, 701 brw_imm_ud(output_vertex_size_owords)); 702 } 703 } 704 705 length = 0; 706 urb_offset = starting_urb_offset; 707 flush = false; 708 709 /* SSO shaders can have VUE slots allocated which are never actually 710 * written to, so ignore them when looking for the last (written) slot. 711 */ 712 int last_slot = vue_map->num_slots - 1; 713 while (last_slot > 0 && 714 (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD || 715 outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) { 716 last_slot--; 717 } 718 719 bool urb_written = false; 720 for (slot = 0; slot < vue_map->num_slots; slot++) { 721 int varying = vue_map->slot_to_varying[slot]; 722 switch (varying) { 723 case VARYING_SLOT_PSIZ: { 724 /* The point size varying slot is the vue header and is always in the 725 * vue map. But often none of the special varyings that live there 726 * are written and in that case we can skip writing to the vue 727 * header, provided the corresponding state properly clamps the 728 * values further down the pipeline. */ 729 if ((vue_map->slots_valid & psiz_mask) == 0) { 730 assert(length == 0); 731 urb_offset++; 732 break; 733 } 734 735 fs_reg zero(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); 736 bld.MOV(zero, brw_imm_ud(0u)); 737 738 sources[length++] = zero; 739 if (vue_map->slots_valid & VARYING_BIT_LAYER) 740 sources[length++] = this->outputs[VARYING_SLOT_LAYER]; 741 else 742 sources[length++] = zero; 743 744 if (vue_map->slots_valid & VARYING_BIT_VIEWPORT) 745 sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT]; 746 else 747 sources[length++] = zero; 748 749 if (vue_map->slots_valid & VARYING_BIT_PSIZ) 750 sources[length++] = this->outputs[VARYING_SLOT_PSIZ]; 751 else 752 sources[length++] = zero; 753 break; 754 } 755 case BRW_VARYING_SLOT_NDC: 756 case VARYING_SLOT_EDGE: 757 unreachable("unexpected scalar vs output"); 758 break; 759 760 default: 761 /* gl_Position is always in the vue map, but isn't always written by 762 * the shader. Other varyings (clip distances) get added to the vue 763 * map but don't always get written. In those cases, the 764 * corresponding this->output[] slot will be invalid we and can skip 765 * the urb write for the varying. If we've already queued up a vue 766 * slot for writing we flush a mlen 5 urb write, otherwise we just 767 * advance the urb_offset. 768 */ 769 if (varying == BRW_VARYING_SLOT_PAD || 770 this->outputs[varying].file == BAD_FILE) { 771 if (length > 0) 772 flush = true; 773 else 774 urb_offset++; 775 break; 776 } 777 778 if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color && 779 (varying == VARYING_SLOT_COL0 || 780 varying == VARYING_SLOT_COL1 || 781 varying == VARYING_SLOT_BFC0 || 782 varying == VARYING_SLOT_BFC1)) { 783 /* We need to clamp these guys, so do a saturating MOV into a 784 * temp register and use that for the payload. 785 */ 786 for (int i = 0; i < 4; i++) { 787 fs_reg reg = fs_reg(VGRF, alloc.allocate(1), outputs[varying].type); 788 fs_reg src = offset(this->outputs[varying], bld, i); 789 set_saturate(true, bld.MOV(reg, src)); 790 sources[length++] = reg; 791 } 792 } else { 793 for (unsigned i = 0; i < 4; i++) 794 sources[length++] = offset(this->outputs[varying], bld, i); 795 } 796 break; 797 } 798 799 const fs_builder abld = bld.annotate("URB write"); 800 801 /* If we've queued up 8 registers of payload (2 VUE slots), if this is 802 * the last slot or if we need to flush (see BAD_FILE varying case 803 * above), emit a URB write send now to flush out the data. 804 */ 805 if (length == 8 || (length > 0 && slot == last_slot)) 806 flush = true; 807 if (flush) { 808 fs_reg *payload_sources = 809 ralloc_array(mem_ctx, fs_reg, length + header_size); 810 fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size), 811 BRW_REGISTER_TYPE_F); 812 payload_sources[0] = urb_handle; 813 814 if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT) 815 payload_sources[1] = per_slot_offsets; 816 817 memcpy(&payload_sources[header_size], sources, 818 length * sizeof sources[0]); 819 820 abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size, 821 header_size); 822 823 fs_inst *inst = abld.emit(opcode, reg_undef, payload); 824 825 /* For ICL WA 1805992985 one needs additional write in the end. */ 826 if (devinfo->gen == 11 && stage == MESA_SHADER_TESS_EVAL) 827 inst->eot = false; 828 else 829 inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY; 830 831 inst->mlen = length + header_size; 832 inst->offset = urb_offset; 833 urb_offset = starting_urb_offset + slot + 1; 834 length = 0; 835 flush = false; 836 urb_written = true; 837 } 838 } 839 840 /* If we don't have any valid slots to write, just do a minimal urb write 841 * send to terminate the shader. This includes 1 slot of undefined data, 842 * because it's invalid to write 0 data: 843 * 844 * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions - 845 * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read > 846 * Write Data Payload: 847 * 848 * "The write data payload can be between 1 and 8 message phases long." 849 */ 850 if (!urb_written) { 851 /* For GS, just turn EmitVertex() into a no-op. We don't want it to 852 * end the thread, and emit_gs_thread_end() already emits a SEND with 853 * EOT at the end of the program for us. 854 */ 855 if (stage == MESA_SHADER_GEOMETRY) 856 return; 857 858 fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD); 859 bld.exec_all().MOV(payload, urb_handle); 860 861 fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload); 862 inst->eot = true; 863 inst->mlen = 2; 864 inst->offset = 1; 865 return; 866 } 867 868 /* ICL WA 1805992985: 869 * 870 * ICLLP GPU hangs on one of tessellation vkcts tests with DS not done. The 871 * send cycle, which is a urb write with an eot must be 4 phases long and 872 * all 8 lanes must valid. 873 */ 874 if (devinfo->gen == 11 && stage == MESA_SHADER_TESS_EVAL) { 875 fs_reg payload = fs_reg(VGRF, alloc.allocate(6), BRW_REGISTER_TYPE_UD); 876 877 /* Workaround requires all 8 channels (lanes) to be valid. This is 878 * understood to mean they all need to be alive. First trick is to find 879 * a live channel and copy its urb handle for all the other channels to 880 * make sure all handles are valid. 881 */ 882 bld.exec_all().MOV(payload, bld.emit_uniformize(urb_handle)); 883 884 /* Second trick is to use masked URB write where one can tell the HW to 885 * actually write data only for selected channels even though all are 886 * active. 887 * Third trick is to take advantage of the must-be-zero (MBZ) area in 888 * the very beginning of the URB. 889 * 890 * One masks data to be written only for the first channel and uses 891 * offset zero explicitly to land data to the MBZ area avoiding trashing 892 * any other part of the URB. 893 * 894 * Since the WA says that the write needs to be 4 phases long one uses 895 * 4 slots data. All are explicitly zeros in order to to keep the MBZ 896 * area written as zeros. 897 */ 898 bld.exec_all().MOV(offset(payload, bld, 1), brw_imm_ud(0x10000u)); 899 bld.exec_all().MOV(offset(payload, bld, 2), brw_imm_ud(0u)); 900 bld.exec_all().MOV(offset(payload, bld, 3), brw_imm_ud(0u)); 901 bld.exec_all().MOV(offset(payload, bld, 4), brw_imm_ud(0u)); 902 bld.exec_all().MOV(offset(payload, bld, 5), brw_imm_ud(0u)); 903 904 fs_inst *inst = bld.exec_all().emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED, 905 reg_undef, payload); 906 inst->eot = true; 907 inst->mlen = 6; 908 inst->offset = 0; 909 } 910} 911 912void 913fs_visitor::emit_cs_terminate() 914{ 915 assert(devinfo->gen >= 7); 916 917 /* We are getting the thread ID from the compute shader header */ 918 assert(stage == MESA_SHADER_COMPUTE); 919 920 /* We can't directly send from g0, since sends with EOT have to use 921 * g112-127. So, copy it to a virtual register, The register allocator will 922 * make sure it uses the appropriate register range. 923 */ 924 struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD); 925 fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); 926 bld.group(8, 0).exec_all().MOV(payload, g0); 927 928 /* Send a message to the thread spawner to terminate the thread. */ 929 fs_inst *inst = bld.exec_all() 930 .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload); 931 inst->eot = true; 932} 933 934void 935fs_visitor::emit_barrier() 936{ 937 uint32_t barrier_id_mask; 938 switch (devinfo->gen) { 939 case 7: 940 case 8: 941 barrier_id_mask = 0x0f000000u; break; 942 case 9: 943 case 10: 944 barrier_id_mask = 0x8f000000u; break; 945 case 11: 946 barrier_id_mask = 0x7f000000u; break; 947 default: 948 unreachable("barrier is only available on gen >= 7"); 949 } 950 951 /* We are getting the barrier ID from the compute shader header */ 952 assert(stage == MESA_SHADER_COMPUTE); 953 954 fs_reg payload = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD); 955 956 /* Clear the message payload */ 957 bld.exec_all().group(8, 0).MOV(payload, brw_imm_ud(0u)); 958 959 /* Copy the barrier id from r0.2 to the message payload reg.2 */ 960 fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)); 961 bld.exec_all().group(1, 0).AND(component(payload, 2), r0_2, 962 brw_imm_ud(barrier_id_mask)); 963 964 /* Emit a gateway "barrier" message using the payload we set up, followed 965 * by a wait instruction. 966 */ 967 bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload); 968} 969 970fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, 971 void *mem_ctx, 972 const void *key, 973 struct brw_stage_prog_data *prog_data, 974 struct gl_program *prog, 975 const nir_shader *shader, 976 unsigned dispatch_width, 977 int shader_time_index, 978 const struct brw_vue_map *input_vue_map) 979 : backend_shader(compiler, log_data, mem_ctx, shader, prog_data), 980 key(key), gs_compile(NULL), prog_data(prog_data), prog(prog), 981 input_vue_map(input_vue_map), 982 dispatch_width(dispatch_width), 983 shader_time_index(shader_time_index), 984 bld(fs_builder(this, dispatch_width).at_end()) 985{ 986 init(); 987} 988 989fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data, 990 void *mem_ctx, 991 struct brw_gs_compile *c, 992 struct brw_gs_prog_data *prog_data, 993 const nir_shader *shader, 994 int shader_time_index) 995 : backend_shader(compiler, log_data, mem_ctx, shader, 996 &prog_data->base.base), 997 key(&c->key), gs_compile(c), 998 prog_data(&prog_data->base.base), prog(NULL), 999 dispatch_width(8), 1000 shader_time_index(shader_time_index), 1001 bld(fs_builder(this, dispatch_width).at_end()) 1002{ 1003 init(); 1004} 1005 1006 1007void 1008fs_visitor::init() 1009{ 1010 switch (stage) { 1011 case MESA_SHADER_FRAGMENT: 1012 key_tex = &((const brw_wm_prog_key *) key)->tex; 1013 break; 1014 case MESA_SHADER_VERTEX: 1015 key_tex = &((const brw_vs_prog_key *) key)->tex; 1016 break; 1017 case MESA_SHADER_TESS_CTRL: 1018 key_tex = &((const brw_tcs_prog_key *) key)->tex; 1019 break; 1020 case MESA_SHADER_TESS_EVAL: 1021 key_tex = &((const brw_tes_prog_key *) key)->tex; 1022 break; 1023 case MESA_SHADER_GEOMETRY: 1024 key_tex = &((const brw_gs_prog_key *) key)->tex; 1025 break; 1026 case MESA_SHADER_COMPUTE: 1027 key_tex = &((const brw_cs_prog_key*) key)->tex; 1028 break; 1029 default: 1030 unreachable("unhandled shader stage"); 1031 } 1032 1033 this->max_dispatch_width = 32; 1034 this->prog_data = this->stage_prog_data; 1035 1036 this->failed = false; 1037 1038 this->nir_locals = NULL; 1039 this->nir_ssa_values = NULL; 1040 1041 memset(&this->payload, 0, sizeof(this->payload)); 1042 this->source_depth_to_render_target = false; 1043 this->runtime_check_aads_emit = false; 1044 this->first_non_payload_grf = 0; 1045 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF; 1046 1047 this->virtual_grf_start = NULL; 1048 this->virtual_grf_end = NULL; 1049 this->live_intervals = NULL; 1050 this->regs_live_at_ip = NULL; 1051 1052 this->uniforms = 0; 1053 this->last_scratch = 0; 1054 this->pull_constant_loc = NULL; 1055 this->push_constant_loc = NULL; 1056 1057 this->promoted_constants = 0, 1058 1059 this->grf_used = 0; 1060 this->spilled_any_registers = false; 1061} 1062 1063fs_visitor::~fs_visitor() 1064{ 1065} 1066