1/* 2 * Copyright © 2017 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include <assert.h> 25 26#include "dev/gen_device_info.h" 27#include "common/gen_sample_positions.h" 28#include "genxml/gen_macros.h" 29 30#include "main/bufferobj.h" 31#include "main/context.h" 32#include "main/enums.h" 33#include "main/macros.h" 34#include "main/state.h" 35 36#include "genX_boilerplate.h" 37 38#include "brw_context.h" 39#include "brw_draw.h" 40#include "brw_multisample_state.h" 41#include "brw_state.h" 42#include "brw_wm.h" 43#include "brw_util.h" 44 45#include "intel_batchbuffer.h" 46#include "intel_buffer_objects.h" 47#include "intel_fbo.h" 48 49#include "main/enums.h" 50#include "main/fbobject.h" 51#include "main/framebuffer.h" 52#include "main/glformats.h" 53#include "main/samplerobj.h" 54#include "main/shaderapi.h" 55#include "main/stencil.h" 56#include "main/transformfeedback.h" 57#include "main/varray.h" 58#include "main/viewport.h" 59#include "util/half_float.h" 60 61#if GEN_GEN == 4 62static struct brw_address 63KSP(struct brw_context *brw, uint32_t offset) 64{ 65 return ro_bo(brw->cache.bo, offset); 66} 67#else 68static uint32_t 69KSP(UNUSED struct brw_context *brw, uint32_t offset) 70{ 71 return offset; 72} 73#endif 74 75#if GEN_GEN >= 7 76MAYBE_UNUSED static void 77emit_lrm(struct brw_context *brw, uint32_t reg, struct brw_address addr) 78{ 79 brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_MEM), lrm) { 80 lrm.RegisterAddress = reg; 81 lrm.MemoryAddress = addr; 82 } 83} 84#endif 85 86MAYBE_UNUSED static void 87emit_lri(struct brw_context *brw, uint32_t reg, uint32_t imm) 88{ 89 brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_IMM), lri) { 90 lri.RegisterOffset = reg; 91 lri.DataDWord = imm; 92 } 93} 94 95#if GEN_IS_HASWELL || GEN_GEN >= 8 96MAYBE_UNUSED static void 97emit_lrr(struct brw_context *brw, uint32_t dst, uint32_t src) 98{ 99 brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_REG), lrr) { 100 lrr.SourceRegisterAddress = src; 101 lrr.DestinationRegisterAddress = dst; 102 } 103} 104#endif 105 106/** 107 * Polygon stipple packet 108 */ 109static void 110genX(upload_polygon_stipple)(struct brw_context *brw) 111{ 112 struct gl_context *ctx = &brw->ctx; 113 114 /* _NEW_POLYGON */ 115 if (!ctx->Polygon.StippleFlag) 116 return; 117 118 brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) { 119 /* Polygon stipple is provided in OpenGL order, i.e. bottom 120 * row first. 
If we're rendering to a window (i.e. the 121 * default frame buffer object, 0), then we need to invert 122 * it to match our pixel layout. But if we're rendering 123 * to a FBO (i.e. any named frame buffer object), we *don't* 124 * need to invert - we already match the layout. 125 */ 126 if (ctx->DrawBuffer->FlipY) { 127 for (unsigned i = 0; i < 32; i++) 128 poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */ 129 } else { 130 for (unsigned i = 0; i < 32; i++) 131 poly.PatternRow[i] = ctx->PolygonStipple[i]; 132 } 133 } 134} 135 136static const struct brw_tracked_state genX(polygon_stipple) = { 137 .dirty = { 138 .mesa = _NEW_POLYGON | 139 _NEW_POLYGONSTIPPLE, 140 .brw = BRW_NEW_CONTEXT, 141 }, 142 .emit = genX(upload_polygon_stipple), 143}; 144 145/** 146 * Polygon stipple offset packet 147 */ 148static void 149genX(upload_polygon_stipple_offset)(struct brw_context *brw) 150{ 151 struct gl_context *ctx = &brw->ctx; 152 153 /* _NEW_POLYGON */ 154 if (!ctx->Polygon.StippleFlag) 155 return; 156 157 brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) { 158 /* _NEW_BUFFERS 159 * 160 * If we're drawing to a system window we have to invert the Y axis 161 * in order to match the OpenGL pixel coordinate system, and our 162 * offset must be matched to the window position. If we're drawing 163 * to a user-created FBO then our native pixel coordinate system 164 * works just fine, and there's no window system to worry about. 165 */ 166 if (ctx->DrawBuffer->FlipY) { 167 poly.PolygonStippleYOffset = 168 (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31; 169 } 170 } 171} 172 173static const struct brw_tracked_state genX(polygon_stipple_offset) = { 174 .dirty = { 175 .mesa = _NEW_BUFFERS | 176 _NEW_POLYGON, 177 .brw = BRW_NEW_CONTEXT, 178 }, 179 .emit = genX(upload_polygon_stipple_offset), 180}; 181 182/** 183 * Line stipple packet 184 */ 185static void 186genX(upload_line_stipple)(struct brw_context *brw) 187{ 188 struct gl_context *ctx = &brw->ctx; 189 190 if (!ctx->Line.StippleFlag) 191 return; 192 193 brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) { 194 line.LineStipplePattern = ctx->Line.StipplePattern; 195 196 line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor; 197 line.LineStippleRepeatCount = ctx->Line.StippleFactor; 198 } 199} 200 201static const struct brw_tracked_state genX(line_stipple) = { 202 .dirty = { 203 .mesa = _NEW_LINE, 204 .brw = BRW_NEW_CONTEXT, 205 }, 206 .emit = genX(upload_line_stipple), 207}; 208 209/* Constant single cliprect for framebuffer object or DRI2 drawing */ 210static void 211genX(upload_drawing_rect)(struct brw_context *brw) 212{ 213 struct gl_context *ctx = &brw->ctx; 214 const struct gl_framebuffer *fb = ctx->DrawBuffer; 215 const unsigned int fb_width = _mesa_geometric_width(fb); 216 const unsigned int fb_height = _mesa_geometric_height(fb); 217 218 brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) { 219 rect.ClippedDrawingRectangleXMax = fb_width - 1; 220 rect.ClippedDrawingRectangleYMax = fb_height - 1; 221 } 222} 223 224static const struct brw_tracked_state genX(drawing_rect) = { 225 .dirty = { 226 .mesa = _NEW_BUFFERS, 227 .brw = BRW_NEW_BLORP | 228 BRW_NEW_CONTEXT, 229 }, 230 .emit = genX(upload_drawing_rect), 231}; 232 233static uint32_t * 234genX(emit_vertex_buffer_state)(struct brw_context *brw, 235 uint32_t *dw, 236 unsigned buffer_nr, 237 struct brw_bo *bo, 238 unsigned start_offset, 239 MAYBE_UNUSED unsigned end_offset, 240 unsigned stride, 241 MAYBE_UNUSED unsigned step_rate) 242{ 243 struct 
GENX(VERTEX_BUFFER_STATE) buf_state = { 244 .VertexBufferIndex = buffer_nr, 245 .BufferPitch = stride, 246 247 /* The VF cache designers apparently cut corners, and made the cache 248 * only consider the bottom 32 bits of memory addresses. If you happen 249 * to have two vertex buffers which get placed exactly 4 GiB apart and 250 * use them in back-to-back draw calls, you can get collisions. To work 251 * around this problem, we restrict vertex buffers to the low 32 bits of 252 * the address space. 253 */ 254 .BufferStartingAddress = ro_32_bo(bo, start_offset), 255#if GEN_GEN >= 8 256 .BufferSize = end_offset - start_offset, 257#endif 258 259#if GEN_GEN >= 7 260 .AddressModifyEnable = true, 261#endif 262 263#if GEN_GEN < 8 264 .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA, 265 .InstanceDataStepRate = step_rate, 266#if GEN_GEN >= 5 267 .EndAddress = ro_bo(bo, end_offset - 1), 268#endif 269#endif 270 271#if GEN_GEN == 11 272 .MOCS = ICL_MOCS_WB, 273#elif GEN_GEN == 10 274 .MOCS = CNL_MOCS_WB, 275#elif GEN_GEN == 9 276 .MOCS = SKL_MOCS_WB, 277#elif GEN_GEN == 8 278 .MOCS = BDW_MOCS_WB, 279#elif GEN_GEN == 7 280 .MOCS = GEN7_MOCS_L3, 281#endif 282 }; 283 284 GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state); 285 return dw + GENX(VERTEX_BUFFER_STATE_length); 286} 287 288UNUSED static bool 289is_passthru_format(uint32_t format) 290{ 291 switch (format) { 292 case ISL_FORMAT_R64_PASSTHRU: 293 case ISL_FORMAT_R64G64_PASSTHRU: 294 case ISL_FORMAT_R64G64B64_PASSTHRU: 295 case ISL_FORMAT_R64G64B64A64_PASSTHRU: 296 return true; 297 default: 298 return false; 299 } 300} 301 302UNUSED static int 303uploads_needed(uint32_t format, 304 bool is_dual_slot) 305{ 306 if (!is_passthru_format(format)) 307 return 1; 308 309 if (is_dual_slot) 310 return 2; 311 312 switch (format) { 313 case ISL_FORMAT_R64_PASSTHRU: 314 case ISL_FORMAT_R64G64_PASSTHRU: 315 return 1; 316 case ISL_FORMAT_R64G64B64_PASSTHRU: 317 case ISL_FORMAT_R64G64B64A64_PASSTHRU: 318 return 2; 319 default: 320 unreachable("not reached"); 321 } 322} 323 324/* 325 * Returns the format that we are finally going to use when upload a vertex 326 * element. It will only change if we are using *64*PASSTHRU formats, as for 327 * gen < 8 they need to be splitted on two *32*FLOAT formats. 328 * 329 * @upload points in which upload we are. Valid values are [0,1] 330 */ 331static uint32_t 332downsize_format_if_needed(uint32_t format, 333 int upload) 334{ 335 assert(upload == 0 || upload == 1); 336 337 if (!is_passthru_format(format)) 338 return format; 339 340 /* ISL_FORMAT_R64_PASSTHRU and ISL_FORMAT_R64G64_PASSTHRU with an upload == 341 * 1 means that we have been forced to do 2 uploads for a size <= 2. This 342 * happens with gen < 8 and dvec3 or dvec4 vertex shader input 343 * variables. In those cases, we return ISL_FORMAT_R32_FLOAT as a way of 344 * flagging that we want to fill with zeroes this second forced upload. 345 */ 346 switch (format) { 347 case ISL_FORMAT_R64_PASSTHRU: 348 return upload == 0 ? ISL_FORMAT_R32G32_FLOAT 349 : ISL_FORMAT_R32_FLOAT; 350 case ISL_FORMAT_R64G64_PASSTHRU: 351 return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT 352 : ISL_FORMAT_R32_FLOAT; 353 case ISL_FORMAT_R64G64B64_PASSTHRU: 354 return upload == 0 ? 
ISL_FORMAT_R32G32B32A32_FLOAT 355 : ISL_FORMAT_R32G32_FLOAT; 356 case ISL_FORMAT_R64G64B64A64_PASSTHRU: 357 return ISL_FORMAT_R32G32B32A32_FLOAT; 358 default: 359 unreachable("not reached"); 360 } 361} 362 363/* 364 * Returns the number of componentes associated with a format that is used on 365 * a 64 to 32 format split. See downsize_format() 366 */ 367static int 368upload_format_size(uint32_t upload_format) 369{ 370 switch (upload_format) { 371 case ISL_FORMAT_R32_FLOAT: 372 373 /* downsized_format has returned this one in order to flag that we are 374 * performing a second upload which we want to have filled with 375 * zeroes. This happens with gen < 8, a size <= 2, and dvec3 or dvec4 376 * vertex shader input variables. 377 */ 378 379 return 0; 380 case ISL_FORMAT_R32G32_FLOAT: 381 return 2; 382 case ISL_FORMAT_R32G32B32A32_FLOAT: 383 return 4; 384 default: 385 unreachable("not reached"); 386 } 387} 388 389static UNUSED uint16_t 390pinned_bo_high_bits(struct brw_bo *bo) 391{ 392 return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0; 393} 394 395/* The VF cache designers apparently cut corners, and made the cache key's 396 * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits 397 * of the address. If you happen to have two vertex buffers which get placed 398 * exactly 4 GiB apart and use them in back-to-back draw calls, you can get 399 * collisions. (These collisions can happen within a single batch.) 400 * 401 * In the soft-pin world, we'd like to assign addresses up front, and never 402 * move buffers. So, we need to do a VF cache invalidate if the buffer for 403 * a particular VB slot has different [48:32] address bits than the last one. 404 * 405 * In the relocation world, we have no idea what the addresses will be, so 406 * we can't apply this workaround. Instead, we tell the kernel to move it 407 * to the low 4GB regardless. 408 * 409 * This HW issue is gone on Gen11+. 
410 */ 411static void 412vf_invalidate_for_vb_48bit_transitions(struct brw_context *brw) 413{ 414#if GEN_GEN >= 8 && GEN_GEN < 11 415 bool need_invalidate = false; 416 417 for (unsigned i = 0; i < brw->vb.nr_buffers; i++) { 418 uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo); 419 420 if (high_bits != brw->vb.last_bo_high_bits[i]) { 421 need_invalidate = true; 422 brw->vb.last_bo_high_bits[i] = high_bits; 423 } 424 } 425 426 if (brw->draw.draw_params_bo) { 427 uint16_t high_bits = pinned_bo_high_bits(brw->draw.draw_params_bo); 428 429 if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers] != high_bits) { 430 need_invalidate = true; 431 brw->vb.last_bo_high_bits[brw->vb.nr_buffers] = high_bits; 432 } 433 } 434 435 if (brw->draw.derived_draw_params_bo) { 436 uint16_t high_bits = pinned_bo_high_bits(brw->draw.derived_draw_params_bo); 437 438 if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] != high_bits) { 439 need_invalidate = true; 440 brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] = high_bits; 441 } 442 } 443 444 if (need_invalidate) { 445 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL); 446 } 447#endif 448} 449 450static void 451vf_invalidate_for_ib_48bit_transition(struct brw_context *brw) 452{ 453#if GEN_GEN >= 8 454 uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo); 455 456 if (high_bits != brw->ib.last_bo_high_bits) { 457 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE); 458 brw->ib.last_bo_high_bits = high_bits; 459 } 460#endif 461} 462 463static void 464genX(emit_vertices)(struct brw_context *brw) 465{ 466 const struct gen_device_info *devinfo = &brw->screen->devinfo; 467 uint32_t *dw; 468 469 brw_prepare_vertices(brw); 470 brw_prepare_shader_draw_parameters(brw); 471 472#if GEN_GEN < 6 473 brw_emit_query_begin(brw); 474#endif 475 476 const struct brw_vs_prog_data *vs_prog_data = 477 brw_vs_prog_data(brw->vs.base.prog_data); 478 479#if GEN_GEN >= 8 480 struct gl_context *ctx = &brw->ctx; 481 const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL || 482 ctx->Polygon.BackMode != GL_FILL); 483 484 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) { 485 unsigned vue = brw->vb.nr_enabled; 486 487 /* The element for the edge flags must always be last, so we have to 488 * insert the SGVS before it in that case. 
489 */ 490 if (uses_edge_flag) { 491 assert(vue > 0); 492 vue--; 493 } 494 495 WARN_ONCE(vue >= 33, 496 "Trying to insert VID/IID past 33rd vertex element, " 497 "need to reorder the vertex attrbutes."); 498 499 brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) { 500 if (vs_prog_data->uses_vertexid) { 501 vfs.VertexIDEnable = true; 502 vfs.VertexIDComponentNumber = 2; 503 vfs.VertexIDElementOffset = vue; 504 } 505 506 if (vs_prog_data->uses_instanceid) { 507 vfs.InstanceIDEnable = true; 508 vfs.InstanceIDComponentNumber = 3; 509 vfs.InstanceIDElementOffset = vue; 510 } 511 } 512 513 brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) { 514 vfi.InstancingEnable = true; 515 vfi.VertexElementIndex = vue; 516 } 517 } else { 518 brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs); 519 } 520#endif 521 522 const bool uses_draw_params = 523 vs_prog_data->uses_firstvertex || 524 vs_prog_data->uses_baseinstance; 525 526 const bool uses_derived_draw_params = 527 vs_prog_data->uses_drawid || 528 vs_prog_data->uses_is_indexed_draw; 529 530 const bool needs_sgvs_element = (uses_draw_params || 531 vs_prog_data->uses_instanceid || 532 vs_prog_data->uses_vertexid); 533 534 unsigned nr_elements = 535 brw->vb.nr_enabled + needs_sgvs_element + uses_derived_draw_params; 536 537#if GEN_GEN < 8 538 /* If any of the formats of vb.enabled needs more that one upload, we need 539 * to add it to nr_elements 540 */ 541 for (unsigned i = 0; i < brw->vb.nr_enabled; i++) { 542 struct brw_vertex_element *input = brw->vb.enabled[i]; 543 const struct gl_array_attributes *glattrib = input->glattrib; 544 uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format); 545 546 if (uploads_needed(format, input->is_dual_slot) > 1) 547 nr_elements++; 548 } 549#endif 550 551 /* If the VS doesn't read any inputs (calculating vertex position from 552 * a state variable for some reason, for example), emit a single pad 553 * VERTEX_ELEMENT struct and bail. 554 * 555 * The stale VB state stays in place, but they don't do anything unless 556 * a VE loads from them. 557 */ 558 if (nr_elements == 0) { 559 dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS), 560 1 + GENX(VERTEX_ELEMENT_STATE_length)); 561 struct GENX(VERTEX_ELEMENT_STATE) elem = { 562 .Valid = true, 563 .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT, 564 .Component0Control = VFCOMP_STORE_0, 565 .Component1Control = VFCOMP_STORE_0, 566 .Component2Control = VFCOMP_STORE_0, 567 .Component3Control = VFCOMP_STORE_1_FP, 568 }; 569 GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem); 570 return; 571 } 572 573 /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */ 574 const unsigned nr_buffers = brw->vb.nr_buffers + 575 uses_draw_params + uses_derived_draw_params; 576 577 vf_invalidate_for_vb_48bit_transitions(brw); 578 579 if (nr_buffers) { 580 assert(nr_buffers <= (GEN_GEN >= 6 ? 33 : 17)); 581 582 dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS), 583 1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers); 584 585 for (unsigned i = 0; i < brw->vb.nr_buffers; i++) { 586 const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i]; 587 /* Prior to Haswell and Bay Trail we have to use 4-component formats 588 * to fake 3-component ones. In particular, we do this for 589 * half-float and 8 and 16-bit integer formats. This means that the 590 * vertex element may poke over the end of the buffer by 2 bytes. 
591 */ 592 const unsigned padding = 593 (GEN_GEN <= 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail) * 2; 594 const unsigned end = buffer->offset + buffer->size + padding; 595 dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo, 596 buffer->offset, 597 end, 598 buffer->stride, 599 buffer->step_rate); 600 } 601 602 if (uses_draw_params) { 603 dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers, 604 brw->draw.draw_params_bo, 605 brw->draw.draw_params_offset, 606 brw->draw.draw_params_bo->size, 607 0 /* stride */, 608 0 /* step rate */); 609 } 610 611 if (uses_derived_draw_params) { 612 dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1, 613 brw->draw.derived_draw_params_bo, 614 brw->draw.derived_draw_params_offset, 615 brw->draw.derived_draw_params_bo->size, 616 0 /* stride */, 617 0 /* step rate */); 618 } 619 } 620 621 /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS, 622 * presumably for VertexID/InstanceID. 623 */ 624#if GEN_GEN >= 6 625 assert(nr_elements <= 34); 626 const struct brw_vertex_element *gen6_edgeflag_input = NULL; 627#else 628 assert(nr_elements <= 18); 629#endif 630 631 dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS), 632 1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements); 633 unsigned i; 634 for (i = 0; i < brw->vb.nr_enabled; i++) { 635 const struct brw_vertex_element *input = brw->vb.enabled[i]; 636 const struct gl_array_attributes *glattrib = input->glattrib; 637 uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format); 638 uint32_t comp0 = VFCOMP_STORE_SRC; 639 uint32_t comp1 = VFCOMP_STORE_SRC; 640 uint32_t comp2 = VFCOMP_STORE_SRC; 641 uint32_t comp3 = VFCOMP_STORE_SRC; 642 const unsigned num_uploads = GEN_GEN < 8 ? 643 uploads_needed(format, input->is_dual_slot) : 1; 644 645#if GEN_GEN >= 8 646 /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE): 647 * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an 648 * element which has edge flag enabled." 649 */ 650 assert(!(is_passthru_format(format) && uses_edge_flag)); 651#endif 652 653 /* The gen4 driver expects edgeflag to come in as a float, and passes 654 * that float on to the tests in the clipper. Mesa's current vertex 655 * attribute value for EdgeFlag is stored as a float, which works out. 656 * glEdgeFlagPointer, on the other hand, gives us an unnormalized 657 * integer ubyte. Just rewrite that to convert to a float. 658 * 659 * Gen6+ passes edgeflag as sideband along with the vertex, instead 660 * of in the VUE. We have to upload it sideband as the last vertex 661 * element according to the B-Spec. 662 */ 663#if GEN_GEN >= 6 664 if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) { 665 gen6_edgeflag_input = input; 666 continue; 667 } 668#endif 669 670 for (unsigned c = 0; c < num_uploads; c++) { 671 const uint32_t upload_format = GEN_GEN >= 8 ? format : 672 downsize_format_if_needed(format, c); 673 /* If we need more that one upload, the offset stride would be 128 674 * bits (16 bytes), as for previous uploads we are using the full 675 * entry. */ 676 const unsigned offset = input->offset + c * 16; 677 678 const struct gl_array_attributes *glattrib = input->glattrib; 679 const int size = (GEN_GEN < 8 && is_passthru_format(format)) ? 
680 upload_format_size(upload_format) : glattrib->Format.Size; 681 682 switch (size) { 683 case 0: comp0 = VFCOMP_STORE_0; 684 case 1: comp1 = VFCOMP_STORE_0; 685 case 2: comp2 = VFCOMP_STORE_0; 686 case 3: 687 if (GEN_GEN >= 8 && glattrib->Format.Doubles) { 688 comp3 = VFCOMP_STORE_0; 689 } else if (glattrib->Format.Integer) { 690 comp3 = VFCOMP_STORE_1_INT; 691 } else { 692 comp3 = VFCOMP_STORE_1_FP; 693 } 694 695 break; 696 } 697 698#if GEN_GEN >= 8 699 /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE): 700 * 701 * "When SourceElementFormat is set to one of the *64*_PASSTHRU 702 * formats, 64-bit components are stored in the URB without any 703 * conversion. In this case, vertex elements must be written as 128 704 * or 256 bits, with VFCOMP_STORE_0 being used to pad the output as 705 * required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red 706 * component into the URB, Component 1 must be specified as 707 * VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in 708 * order to output a 128-bit vertex element, or Components 1-3 must 709 * be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex 710 * element. Likewise, use of R64G64B64_PASSTHRU requires Component 3 711 * to be specified as VFCOMP_STORE_0 in order to output a 256-bit 712 * vertex element." 713 */ 714 if (glattrib->Format.Doubles && !input->is_dual_slot) { 715 /* Store vertex elements which correspond to double and dvec2 vertex 716 * shader inputs as 128-bit vertex elements, instead of 256-bits. 717 */ 718 comp2 = VFCOMP_NOSTORE; 719 comp3 = VFCOMP_NOSTORE; 720 } 721#endif 722 723 struct GENX(VERTEX_ELEMENT_STATE) elem_state = { 724 .VertexBufferIndex = input->buffer, 725 .Valid = true, 726 .SourceElementFormat = upload_format, 727 .SourceElementOffset = offset, 728 .Component0Control = comp0, 729 .Component1Control = comp1, 730 .Component2Control = comp2, 731 .Component3Control = comp3, 732#if GEN_GEN < 5 733 .DestinationElementOffset = i * 4, 734#endif 735 }; 736 737 GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state); 738 dw += GENX(VERTEX_ELEMENT_STATE_length); 739 } 740 } 741 742 if (needs_sgvs_element) { 743 struct GENX(VERTEX_ELEMENT_STATE) elem_state = { 744 .Valid = true, 745 .Component0Control = VFCOMP_STORE_0, 746 .Component1Control = VFCOMP_STORE_0, 747 .Component2Control = VFCOMP_STORE_0, 748 .Component3Control = VFCOMP_STORE_0, 749#if GEN_GEN < 5 750 .DestinationElementOffset = i * 4, 751#endif 752 }; 753 754#if GEN_GEN >= 8 755 if (uses_draw_params) { 756 elem_state.VertexBufferIndex = brw->vb.nr_buffers; 757 elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT; 758 elem_state.Component0Control = VFCOMP_STORE_SRC; 759 elem_state.Component1Control = VFCOMP_STORE_SRC; 760 } 761#else 762 elem_state.VertexBufferIndex = brw->vb.nr_buffers; 763 elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT; 764 if (uses_draw_params) { 765 elem_state.Component0Control = VFCOMP_STORE_SRC; 766 elem_state.Component1Control = VFCOMP_STORE_SRC; 767 } 768 769 if (vs_prog_data->uses_vertexid) 770 elem_state.Component2Control = VFCOMP_STORE_VID; 771 772 if (vs_prog_data->uses_instanceid) 773 elem_state.Component3Control = VFCOMP_STORE_IID; 774#endif 775 776 GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state); 777 dw += GENX(VERTEX_ELEMENT_STATE_length); 778 } 779 780 if (uses_derived_draw_params) { 781 struct GENX(VERTEX_ELEMENT_STATE) elem_state = { 782 .Valid = true, 783 .VertexBufferIndex = brw->vb.nr_buffers + 1, 784 .SourceElementFormat = ISL_FORMAT_R32G32_UINT, 785 
.Component0Control = VFCOMP_STORE_SRC, 786 .Component1Control = VFCOMP_STORE_SRC, 787 .Component2Control = VFCOMP_STORE_0, 788 .Component3Control = VFCOMP_STORE_0, 789#if GEN_GEN < 5 790 .DestinationElementOffset = i * 4, 791#endif 792 }; 793 794 GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state); 795 dw += GENX(VERTEX_ELEMENT_STATE_length); 796 } 797 798#if GEN_GEN >= 6 799 if (gen6_edgeflag_input) { 800 const struct gl_array_attributes *glattrib = gen6_edgeflag_input->glattrib; 801 const uint32_t format = brw_get_vertex_surface_type(brw, &glattrib->Format); 802 803 struct GENX(VERTEX_ELEMENT_STATE) elem_state = { 804 .Valid = true, 805 .VertexBufferIndex = gen6_edgeflag_input->buffer, 806 .EdgeFlagEnable = true, 807 .SourceElementFormat = format, 808 .SourceElementOffset = gen6_edgeflag_input->offset, 809 .Component0Control = VFCOMP_STORE_SRC, 810 .Component1Control = VFCOMP_STORE_0, 811 .Component2Control = VFCOMP_STORE_0, 812 .Component3Control = VFCOMP_STORE_0, 813 }; 814 815 GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state); 816 dw += GENX(VERTEX_ELEMENT_STATE_length); 817 } 818#endif 819 820#if GEN_GEN >= 8 821 for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) { 822 const struct brw_vertex_element *input = brw->vb.enabled[i]; 823 const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer]; 824 unsigned element_index; 825 826 /* The edge flag element is reordered to be the last one in the code 827 * above so we need to compensate for that in the element indices used 828 * below. 829 */ 830 if (input == gen6_edgeflag_input) 831 element_index = nr_elements - 1; 832 else 833 element_index = j++; 834 835 brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) { 836 vfi.VertexElementIndex = element_index; 837 vfi.InstancingEnable = buffer->step_rate != 0; 838 vfi.InstanceDataStepRate = buffer->step_rate; 839 } 840 } 841 842 if (vs_prog_data->uses_drawid) { 843 const unsigned element = brw->vb.nr_enabled + needs_sgvs_element; 844 845 brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) { 846 vfi.VertexElementIndex = element; 847 } 848 } 849#endif 850} 851 852static const struct brw_tracked_state genX(vertices) = { 853 .dirty = { 854 .mesa = _NEW_POLYGON, 855 .brw = BRW_NEW_BATCH | 856 BRW_NEW_BLORP | 857 BRW_NEW_VERTEX_PROGRAM | 858 BRW_NEW_VERTICES | 859 BRW_NEW_VS_PROG_DATA, 860 }, 861 .emit = genX(emit_vertices), 862}; 863 864static void 865genX(emit_index_buffer)(struct brw_context *brw) 866{ 867 const struct _mesa_index_buffer *index_buffer = brw->ib.ib; 868 869 if (index_buffer == NULL) 870 return; 871 872 vf_invalidate_for_ib_48bit_transition(brw); 873 874 brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) { 875#if GEN_GEN < 8 && !GEN_IS_HASWELL 876 assert(brw->ib.enable_cut_index == brw->prim_restart.enable_cut_index); 877 ib.CutIndexEnable = brw->ib.enable_cut_index; 878#endif 879 ib.IndexFormat = brw_get_index_type(index_buffer->index_size); 880 881 /* The VF cache designers apparently cut corners, and made the cache 882 * only consider the bottom 32 bits of memory addresses. If you happen 883 * to have two index buffers which get placed exactly 4 GiB apart and 884 * use them in back-to-back draw calls, you can get collisions. To work 885 * around this problem, we restrict index buffers to the low 32 bits of 886 * the address space. 887 */ 888 ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0); 889#if GEN_GEN >= 8 890 ib.MOCS = GEN_GEN >= 9 ? 
SKL_MOCS_WB : BDW_MOCS_WB; 891 ib.BufferSize = brw->ib.size; 892#else 893 ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1); 894#endif 895 } 896} 897 898static const struct brw_tracked_state genX(index_buffer) = { 899 .dirty = { 900 .mesa = 0, 901 .brw = BRW_NEW_BATCH | 902 BRW_NEW_BLORP | 903 BRW_NEW_INDEX_BUFFER, 904 }, 905 .emit = genX(emit_index_buffer), 906}; 907 908#if GEN_IS_HASWELL || GEN_GEN >= 8 909static void 910genX(upload_cut_index)(struct brw_context *brw) 911{ 912 const struct gl_context *ctx = &brw->ctx; 913 914 brw_batch_emit(brw, GENX(3DSTATE_VF), vf) { 915 if (ctx->Array._PrimitiveRestart && brw->ib.ib) { 916 vf.IndexedDrawCutIndexEnable = true; 917 vf.CutIndex = _mesa_primitive_restart_index(ctx, brw->ib.index_size); 918 } 919 } 920} 921 922const struct brw_tracked_state genX(cut_index) = { 923 .dirty = { 924 .mesa = _NEW_TRANSFORM, 925 .brw = BRW_NEW_INDEX_BUFFER, 926 }, 927 .emit = genX(upload_cut_index), 928}; 929#endif 930 931static void 932genX(upload_vf_statistics)(struct brw_context *brw) 933{ 934 brw_batch_emit(brw, GENX(3DSTATE_VF_STATISTICS), vf) { 935 vf.StatisticsEnable = true; 936 } 937} 938 939const struct brw_tracked_state genX(vf_statistics) = { 940 .dirty = { 941 .mesa = 0, 942 .brw = BRW_NEW_BLORP | BRW_NEW_CONTEXT, 943 }, 944 .emit = genX(upload_vf_statistics), 945}; 946 947#if GEN_GEN >= 6 948/** 949 * Determine the appropriate attribute override value to store into the 950 * 3DSTATE_SF structure for a given fragment shader attribute. The attribute 951 * override value contains two pieces of information: the location of the 952 * attribute in the VUE (relative to urb_entry_read_offset, see below), and a 953 * flag indicating whether to "swizzle" the attribute based on the direction 954 * the triangle is facing. 955 * 956 * If an attribute is "swizzled", then the given VUE location is used for 957 * front-facing triangles, and the VUE location that immediately follows is 958 * used for back-facing triangles. We use this to implement the mapping from 959 * gl_FrontColor/gl_BackColor to gl_Color. 960 * 961 * urb_entry_read_offset is the offset into the VUE at which the SF unit is 962 * being instructed to begin reading attribute data. It can be set to a 963 * nonzero value to prevent the SF unit from wasting time reading elements of 964 * the VUE that are not needed by the fragment shader. It is measured in 965 * 256-bit increments. 966 */ 967static void 968genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr, 969 const struct brw_vue_map *vue_map, 970 int urb_entry_read_offset, int fs_attr, 971 bool two_side_color, uint32_t *max_source_attr) 972{ 973 /* Find the VUE slot for this attribute. */ 974 int slot = vue_map->varying_to_slot[fs_attr]; 975 976 /* Viewport and Layer are stored in the VUE header. We need to override 977 * them to zero if earlier stages didn't write them, as GL requires that 978 * they read back as zero when not explicitly set. 
979 */ 980 if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) { 981 attr->ComponentOverrideX = true; 982 attr->ComponentOverrideW = true; 983 attr->ConstantSource = CONST_0000; 984 985 if (!(vue_map->slots_valid & VARYING_BIT_LAYER)) 986 attr->ComponentOverrideY = true; 987 if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT)) 988 attr->ComponentOverrideZ = true; 989 990 return; 991 } 992 993 /* If there was only a back color written but not front, use back 994 * as the color instead of undefined 995 */ 996 if (slot == -1 && fs_attr == VARYING_SLOT_COL0) 997 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0]; 998 if (slot == -1 && fs_attr == VARYING_SLOT_COL1) 999 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1]; 1000 1001 if (slot == -1) { 1002 /* This attribute does not exist in the VUE--that means that the vertex 1003 * shader did not write to it. This means that either: 1004 * 1005 * (a) This attribute is a texture coordinate, and it is going to be 1006 * replaced with point coordinates (as a consequence of a call to 1007 * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the 1008 * hardware will ignore whatever attribute override we supply. 1009 * 1010 * (b) This attribute is read by the fragment shader but not written by 1011 * the vertex shader, so its value is undefined. Therefore the 1012 * attribute override we supply doesn't matter. 1013 * 1014 * (c) This attribute is gl_PrimitiveID, and it wasn't written by the 1015 * previous shader stage. 1016 * 1017 * Note that we don't have to worry about the cases where the attribute 1018 * is gl_PointCoord or is undergoing point sprite coordinate 1019 * replacement, because in those cases, this function isn't called. 1020 * 1021 * In case (c), we need to program the attribute overrides so that the 1022 * primitive ID will be stored in this slot. In every other case, the 1023 * attribute override we supply doesn't matter. So just go ahead and 1024 * program primitive ID in every case. 1025 */ 1026 attr->ComponentOverrideW = true; 1027 attr->ComponentOverrideX = true; 1028 attr->ComponentOverrideY = true; 1029 attr->ComponentOverrideZ = true; 1030 attr->ConstantSource = PRIM_ID; 1031 return; 1032 } 1033 1034 /* Compute the location of the attribute relative to urb_entry_read_offset. 1035 * Each increment of urb_entry_read_offset represents a 256-bit value, so 1036 * it counts for two 128-bit VUE slots. 1037 */ 1038 int source_attr = slot - 2 * urb_entry_read_offset; 1039 assert(source_attr >= 0 && source_attr < 32); 1040 1041 /* If we are doing two-sided color, and the VUE slot following this one 1042 * represents a back-facing color, then we need to instruct the SF unit to 1043 * do back-facing swizzling. 1044 */ 1045 bool swizzling = two_side_color && 1046 ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 && 1047 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) || 1048 (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 && 1049 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1)); 1050 1051 /* Update max_source_attr. If swizzling, the SF will read this slot + 1. 
*/ 1052 if (*max_source_attr < source_attr + swizzling) 1053 *max_source_attr = source_attr + swizzling; 1054 1055 attr->SourceAttribute = source_attr; 1056 if (swizzling) 1057 attr->SwizzleSelect = INPUTATTR_FACING; 1058} 1059 1060 1061static void 1062genX(calculate_attr_overrides)(const struct brw_context *brw, 1063 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides, 1064 uint32_t *point_sprite_enables, 1065 uint32_t *urb_entry_read_length, 1066 uint32_t *urb_entry_read_offset) 1067{ 1068 const struct gl_context *ctx = &brw->ctx; 1069 1070 /* _NEW_POINT */ 1071 const struct gl_point_attrib *point = &ctx->Point; 1072 1073 /* BRW_NEW_FRAGMENT_PROGRAM */ 1074 const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT]; 1075 1076 /* BRW_NEW_FS_PROG_DATA */ 1077 const struct brw_wm_prog_data *wm_prog_data = 1078 brw_wm_prog_data(brw->wm.base.prog_data); 1079 uint32_t max_source_attr = 0; 1080 1081 *point_sprite_enables = 0; 1082 1083 int first_slot = 1084 brw_compute_first_urb_slot_required(fp->info.inputs_read, 1085 &brw->vue_map_geom_out); 1086 1087 /* Each URB offset packs two varying slots */ 1088 assert(first_slot % 2 == 0); 1089 *urb_entry_read_offset = first_slot / 2; 1090 1091 /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE, 1092 * description of dw10 Point Sprite Texture Coordinate Enable: 1093 * 1094 * "This field must be programmed to zero when non-point primitives 1095 * are rendered." 1096 * 1097 * The SandyBridge PRM doesn't explicitly say that point sprite enables 1098 * must be programmed to zero when rendering non-point primitives, but 1099 * the IvyBridge PRM does, and if we don't, we get garbage. 1100 * 1101 * This is not required on Haswell, as the hardware ignores this state 1102 * when drawing non-points -- although we do still need to be careful to 1103 * correctly set the attr overrides. 1104 * 1105 * _NEW_POLYGON 1106 * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA 1107 */ 1108 bool drawing_points = brw_is_drawing_points(brw); 1109 1110 for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) { 1111 int input_index = wm_prog_data->urb_setup[attr]; 1112 1113 if (input_index < 0) 1114 continue; 1115 1116 /* _NEW_POINT */ 1117 bool point_sprite = false; 1118 if (drawing_points) { 1119 if (point->PointSprite && 1120 (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) && 1121 (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) { 1122 point_sprite = true; 1123 } 1124 1125 if (attr == VARYING_SLOT_PNTC) 1126 point_sprite = true; 1127 1128 if (point_sprite) 1129 *point_sprite_enables |= (1 << input_index); 1130 } 1131 1132 /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */ 1133 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 }; 1134 1135 if (!point_sprite) { 1136 genX(get_attr_override)(&attribute, 1137 &brw->vue_map_geom_out, 1138 *urb_entry_read_offset, attr, 1139 _mesa_vertex_program_two_side_enabled(ctx), 1140 &max_source_attr); 1141 } 1142 1143 /* The hardware can only do the overrides on 16 overrides at a 1144 * time, and the other up to 16 have to be lined up so that the 1145 * input index = the output index. We'll need to do some 1146 * tweaking to make sure that's the case. 
1147 */ 1148 if (input_index < 16) 1149 attr_overrides[input_index] = attribute; 1150 else 1151 assert(attribute.SourceAttribute == input_index); 1152 } 1153 1154 /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for 1155 * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length": 1156 * 1157 * "This field should be set to the minimum length required to read the 1158 * maximum source attribute. The maximum source attribute is indicated 1159 * by the maximum value of the enabled Attribute # Source Attribute if 1160 * Attribute Swizzle Enable is set, Number of Output Attributes-1 if 1161 * enable is not set. 1162 * read_length = ceiling((max_source_attr + 1) / 2) 1163 * 1164 * [errata] Corruption/Hang possible if length programmed larger than 1165 * recommended" 1166 * 1167 * Similar text exists for Ivy Bridge. 1168 */ 1169 *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2); 1170} 1171#endif 1172 1173/* ---------------------------------------------------------------------- */ 1174 1175#if GEN_GEN >= 8 1176typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML; 1177#elif GEN_GEN >= 6 1178typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML; 1179#else 1180typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML; 1181#endif 1182 1183static inline void 1184set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds) 1185{ 1186 struct gl_context *ctx = &brw->ctx; 1187 1188 /* _NEW_BUFFERS */ 1189 struct intel_renderbuffer *depth_irb = 1190 intel_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH); 1191 1192 /* _NEW_DEPTH */ 1193 struct gl_depthbuffer_attrib *depth = &ctx->Depth; 1194 1195 /* _NEW_STENCIL */ 1196 struct gl_stencil_attrib *stencil = &ctx->Stencil; 1197 const int b = stencil->_BackFace; 1198 1199 if (depth->Test && depth_irb) { 1200 ds->DepthTestEnable = true; 1201 ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw); 1202 ds->DepthTestFunction = intel_translate_compare_func(depth->Func); 1203 } 1204 1205 if (brw->stencil_enabled) { 1206 ds->StencilTestEnable = true; 1207 ds->StencilWriteMask = stencil->WriteMask[0] & 0xff; 1208 ds->StencilTestMask = stencil->ValueMask[0] & 0xff; 1209 1210 ds->StencilTestFunction = 1211 intel_translate_compare_func(stencil->Function[0]); 1212 ds->StencilFailOp = 1213 intel_translate_stencil_op(stencil->FailFunc[0]); 1214 ds->StencilPassDepthPassOp = 1215 intel_translate_stencil_op(stencil->ZPassFunc[0]); 1216 ds->StencilPassDepthFailOp = 1217 intel_translate_stencil_op(stencil->ZFailFunc[0]); 1218 1219 ds->StencilBufferWriteEnable = brw->stencil_write_enabled; 1220 1221 if (brw->stencil_two_sided) { 1222 ds->DoubleSidedStencilEnable = true; 1223 ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff; 1224 ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff; 1225 1226 ds->BackfaceStencilTestFunction = 1227 intel_translate_compare_func(stencil->Function[b]); 1228 ds->BackfaceStencilFailOp = 1229 intel_translate_stencil_op(stencil->FailFunc[b]); 1230 ds->BackfaceStencilPassDepthPassOp = 1231 intel_translate_stencil_op(stencil->ZPassFunc[b]); 1232 ds->BackfaceStencilPassDepthFailOp = 1233 intel_translate_stencil_op(stencil->ZFailFunc[b]); 1234 } 1235 1236#if GEN_GEN <= 5 || GEN_GEN >= 9 1237 ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0); 1238 ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b); 1239#endif 1240 } 1241} 1242 1243#if GEN_GEN >= 6 1244static void 1245genX(upload_depth_stencil_state)(struct brw_context *brw) 1246{ 1247#if GEN_GEN >= 8 1248 
brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) { 1249 set_depth_stencil_bits(brw, &wmds); 1250 } 1251#else 1252 uint32_t ds_offset; 1253 brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) { 1254 set_depth_stencil_bits(brw, &ds); 1255 } 1256 1257 /* Now upload a pointer to the indirect state */ 1258#if GEN_GEN == 6 1259 brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { 1260 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset; 1261 ptr.DEPTH_STENCIL_STATEChange = true; 1262 } 1263#else 1264 brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) { 1265 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset; 1266 } 1267#endif 1268#endif 1269} 1270 1271static const struct brw_tracked_state genX(depth_stencil_state) = { 1272 .dirty = { 1273 .mesa = _NEW_BUFFERS | 1274 _NEW_DEPTH | 1275 _NEW_STENCIL, 1276 .brw = BRW_NEW_BLORP | 1277 (GEN_GEN >= 8 ? BRW_NEW_CONTEXT 1278 : BRW_NEW_BATCH | 1279 BRW_NEW_STATE_BASE_ADDRESS), 1280 }, 1281 .emit = genX(upload_depth_stencil_state), 1282}; 1283#endif 1284 1285/* ---------------------------------------------------------------------- */ 1286 1287#if GEN_GEN <= 5 1288 1289static void 1290genX(upload_clip_state)(struct brw_context *brw) 1291{ 1292 struct gl_context *ctx = &brw->ctx; 1293 1294 ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE; 1295 brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) { 1296 clip.KernelStartPointer = KSP(brw, brw->clip.prog_offset); 1297 clip.GRFRegisterCount = 1298 DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1; 1299 clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 1300 clip.SingleProgramFlow = true; 1301 clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length; 1302 clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length; 1303 1304 /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */ 1305 clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2; 1306 clip.DispatchGRFStartRegisterForURBData = 1; 1307 clip.VertexURBEntryReadOffset = 0; 1308 1309 /* BRW_NEW_URB_FENCE */ 1310 clip.NumberofURBEntries = brw->urb.nr_clip_entries; 1311 clip.URBEntryAllocationSize = brw->urb.vsize - 1; 1312 1313 if (brw->urb.nr_clip_entries >= 10) { 1314 /* Half of the URB entries go to each thread, and it has to be an 1315 * even number. 1316 */ 1317 assert(brw->urb.nr_clip_entries % 2 == 0); 1318 1319 /* Although up to 16 concurrent Clip threads are allowed on Ironlake, 1320 * only 2 threads can output VUEs at a time. 1321 */ 1322 clip.MaximumNumberofThreads = (GEN_GEN == 5 ? 16 : 2) - 1; 1323 } else { 1324 assert(brw->urb.nr_clip_entries >= 5); 1325 clip.MaximumNumberofThreads = 1 - 1; 1326 } 1327 1328 clip.VertexPositionSpace = VPOS_NDCSPACE; 1329 clip.UserClipFlagsMustClipEnable = true; 1330 clip.GuardbandClipTestEnable = true; 1331 1332 clip.ClipperViewportStatePointer = 1333 ro_bo(brw->batch.state.bo, brw->clip.vp_offset); 1334 1335 clip.ScreenSpaceViewportXMin = -1; 1336 clip.ScreenSpaceViewportXMax = 1; 1337 clip.ScreenSpaceViewportYMin = -1; 1338 clip.ScreenSpaceViewportYMax = 1; 1339 1340 clip.ViewportXYClipTestEnable = true; 1341 clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear && 1342 ctx->Transform.DepthClampFar); 1343 1344 /* _NEW_TRANSFORM */ 1345 if (GEN_GEN == 5 || GEN_IS_G4X) { 1346 clip.UserClipDistanceClipTestEnableBitmask = 1347 ctx->Transform.ClipPlanesEnabled; 1348 } else { 1349 /* Up to 6 actual clip flags, plus the 7th for the negative RHW 1350 * workaround. 
1351 */ 1352 clip.UserClipDistanceClipTestEnableBitmask = 1353 (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40; 1354 } 1355 1356 if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE) 1357 clip.APIMode = APIMODE_D3D; 1358 else 1359 clip.APIMode = APIMODE_OGL; 1360 1361 clip.GuardbandClipTestEnable = true; 1362 1363 clip.ClipMode = brw->clip.prog_data->clip_mode; 1364 1365#if GEN_IS_G4X 1366 clip.NegativeWClipTestEnable = true; 1367#endif 1368 } 1369} 1370 1371const struct brw_tracked_state genX(clip_state) = { 1372 .dirty = { 1373 .mesa = _NEW_TRANSFORM | 1374 _NEW_VIEWPORT, 1375 .brw = BRW_NEW_BATCH | 1376 BRW_NEW_BLORP | 1377 BRW_NEW_CLIP_PROG_DATA | 1378 BRW_NEW_PUSH_CONSTANT_ALLOCATION | 1379 BRW_NEW_PROGRAM_CACHE | 1380 BRW_NEW_URB_FENCE, 1381 }, 1382 .emit = genX(upload_clip_state), 1383}; 1384 1385#else 1386 1387static void 1388genX(upload_clip_state)(struct brw_context *brw) 1389{ 1390 struct gl_context *ctx = &brw->ctx; 1391 1392 /* _NEW_BUFFERS */ 1393 struct gl_framebuffer *fb = ctx->DrawBuffer; 1394 1395 /* BRW_NEW_FS_PROG_DATA */ 1396 struct brw_wm_prog_data *wm_prog_data = 1397 brw_wm_prog_data(brw->wm.base.prog_data); 1398 1399 brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) { 1400 clip.StatisticsEnable = !brw->meta_in_progress; 1401 1402 if (wm_prog_data->barycentric_interp_modes & 1403 BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) 1404 clip.NonPerspectiveBarycentricEnable = true; 1405 1406#if GEN_GEN >= 7 1407 clip.EarlyCullEnable = true; 1408#endif 1409 1410#if GEN_GEN == 7 1411 clip.FrontWinding = brw->polygon_front_bit != fb->FlipY; 1412 1413 if (ctx->Polygon.CullFlag) { 1414 switch (ctx->Polygon.CullFaceMode) { 1415 case GL_FRONT: 1416 clip.CullMode = CULLMODE_FRONT; 1417 break; 1418 case GL_BACK: 1419 clip.CullMode = CULLMODE_BACK; 1420 break; 1421 case GL_FRONT_AND_BACK: 1422 clip.CullMode = CULLMODE_BOTH; 1423 break; 1424 default: 1425 unreachable("Should not get here: invalid CullFlag"); 1426 } 1427 } else { 1428 clip.CullMode = CULLMODE_NONE; 1429 } 1430#endif 1431 1432#if GEN_GEN < 8 1433 clip.UserClipDistanceCullTestEnableBitmask = 1434 brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask; 1435 1436 clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear && 1437 ctx->Transform.DepthClampFar); 1438#endif 1439 1440 /* _NEW_LIGHT */ 1441 if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) { 1442 clip.TriangleStripListProvokingVertexSelect = 0; 1443 clip.TriangleFanProvokingVertexSelect = 1; 1444 clip.LineStripListProvokingVertexSelect = 0; 1445 } else { 1446 clip.TriangleStripListProvokingVertexSelect = 2; 1447 clip.TriangleFanProvokingVertexSelect = 2; 1448 clip.LineStripListProvokingVertexSelect = 1; 1449 } 1450 1451 /* _NEW_TRANSFORM */ 1452 clip.UserClipDistanceClipTestEnableBitmask = 1453 ctx->Transform.ClipPlanesEnabled; 1454 1455#if GEN_GEN >= 8 1456 clip.ForceUserClipDistanceClipTestEnableBitmask = true; 1457#endif 1458 1459 if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE) 1460 clip.APIMode = APIMODE_D3D; 1461 else 1462 clip.APIMode = APIMODE_OGL; 1463 1464 clip.GuardbandClipTestEnable = true; 1465 1466 /* BRW_NEW_VIEWPORT_COUNT */ 1467 const unsigned viewport_count = brw->clip.viewport_count; 1468 1469 if (ctx->RasterDiscard) { 1470 clip.ClipMode = CLIPMODE_REJECT_ALL; 1471#if GEN_GEN == 6 1472 perf_debug("Rasterizer discard is currently implemented via the " 1473 "clipper; having the GS not write primitives would " 1474 "likely be faster.\n"); 1475#endif 1476 } else { 1477 clip.ClipMode = CLIPMODE_NORMAL; 1478 } 1479 1480 clip.ClipEnable = 
true; 1481 1482 /* _NEW_POLYGON, 1483 * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE 1484 */ 1485 if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw)) 1486 clip.ViewportXYClipTestEnable = true; 1487 1488 clip.MinimumPointWidth = 0.125; 1489 clip.MaximumPointWidth = 255.875; 1490 clip.MaximumVPIndex = viewport_count - 1; 1491 if (_mesa_geometric_layers(fb) == 0) 1492 clip.ForceZeroRTAIndexEnable = true; 1493 } 1494} 1495 1496static const struct brw_tracked_state genX(clip_state) = { 1497 .dirty = { 1498 .mesa = _NEW_BUFFERS | 1499 _NEW_LIGHT | 1500 _NEW_POLYGON | 1501 _NEW_TRANSFORM, 1502 .brw = BRW_NEW_BLORP | 1503 BRW_NEW_CONTEXT | 1504 BRW_NEW_FS_PROG_DATA | 1505 BRW_NEW_GS_PROG_DATA | 1506 BRW_NEW_VS_PROG_DATA | 1507 BRW_NEW_META_IN_PROGRESS | 1508 BRW_NEW_PRIMITIVE | 1509 BRW_NEW_RASTERIZER_DISCARD | 1510 BRW_NEW_TES_PROG_DATA | 1511 BRW_NEW_VIEWPORT_COUNT, 1512 }, 1513 .emit = genX(upload_clip_state), 1514}; 1515#endif 1516 1517/* ---------------------------------------------------------------------- */ 1518 1519static void 1520genX(upload_sf)(struct brw_context *brw) 1521{ 1522 struct gl_context *ctx = &brw->ctx; 1523 float point_size; 1524 1525#if GEN_GEN <= 7 1526 /* _NEW_BUFFERS */ 1527 bool flip_y = ctx->DrawBuffer->FlipY; 1528 UNUSED const bool multisampled_fbo = 1529 _mesa_geometric_samples(ctx->DrawBuffer) > 1; 1530#endif 1531 1532#if GEN_GEN < 6 1533 const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data; 1534 1535 ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE; 1536 1537 brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) { 1538 sf.KernelStartPointer = KSP(brw, brw->sf.prog_offset); 1539 sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 1540 sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1; 1541 sf.DispatchGRFStartRegisterForURBData = 3; 1542 sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET; 1543 sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length; 1544 sf.NumberofURBEntries = brw->urb.nr_sf_entries; 1545 sf.URBEntryAllocationSize = brw->urb.sfsize - 1; 1546 1547 /* STATE_PREFETCH command description describes this state as being 1548 * something loaded through the GPE (L2 ISC), so it's INSTRUCTION 1549 * domain. 1550 */ 1551 sf.SetupViewportStateOffset = 1552 ro_bo(brw->batch.state.bo, brw->sf.vp_offset); 1553 1554 sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT; 1555 1556 /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */ 1557 /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */ 1558 1559 sf.MaximumNumberofThreads = 1560 MIN2(GEN_GEN == 5 ? 
48 : 24, brw->urb.nr_sf_entries) - 1; 1561 1562 sf.SpritePointEnable = ctx->Point.PointSprite; 1563 1564 sf.DestinationOriginHorizontalBias = 0.5; 1565 sf.DestinationOriginVerticalBias = 0.5; 1566#else 1567 brw_batch_emit(brw, GENX(3DSTATE_SF), sf) { 1568 sf.StatisticsEnable = true; 1569#endif 1570 sf.ViewportTransformEnable = true; 1571 1572#if GEN_GEN == 7 1573 /* _NEW_BUFFERS */ 1574 sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw); 1575#endif 1576 1577#if GEN_GEN <= 7 1578 /* _NEW_POLYGON */ 1579 sf.FrontWinding = brw->polygon_front_bit != flip_y; 1580#if GEN_GEN >= 6 1581 sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill; 1582 sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine; 1583 sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint; 1584 1585 switch (ctx->Polygon.FrontMode) { 1586 case GL_FILL: 1587 sf.FrontFaceFillMode = FILL_MODE_SOLID; 1588 break; 1589 case GL_LINE: 1590 sf.FrontFaceFillMode = FILL_MODE_WIREFRAME; 1591 break; 1592 case GL_POINT: 1593 sf.FrontFaceFillMode = FILL_MODE_POINT; 1594 break; 1595 default: 1596 unreachable("not reached"); 1597 } 1598 1599 switch (ctx->Polygon.BackMode) { 1600 case GL_FILL: 1601 sf.BackFaceFillMode = FILL_MODE_SOLID; 1602 break; 1603 case GL_LINE: 1604 sf.BackFaceFillMode = FILL_MODE_WIREFRAME; 1605 break; 1606 case GL_POINT: 1607 sf.BackFaceFillMode = FILL_MODE_POINT; 1608 break; 1609 default: 1610 unreachable("not reached"); 1611 } 1612 1613 if (multisampled_fbo && ctx->Multisample.Enabled) 1614 sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN; 1615 1616 sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2; 1617 sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor; 1618 sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp; 1619#endif 1620 1621 sf.ScissorRectangleEnable = true; 1622 1623 if (ctx->Polygon.CullFlag) { 1624 switch (ctx->Polygon.CullFaceMode) { 1625 case GL_FRONT: 1626 sf.CullMode = CULLMODE_FRONT; 1627 break; 1628 case GL_BACK: 1629 sf.CullMode = CULLMODE_BACK; 1630 break; 1631 case GL_FRONT_AND_BACK: 1632 sf.CullMode = CULLMODE_BOTH; 1633 break; 1634 default: 1635 unreachable("not reached"); 1636 } 1637 } else { 1638 sf.CullMode = CULLMODE_NONE; 1639 } 1640 1641#if GEN_IS_HASWELL 1642 sf.LineStippleEnable = ctx->Line.StippleFlag; 1643#endif 1644 1645#endif 1646 1647 /* _NEW_LINE */ 1648#if GEN_GEN == 8 1649 const struct gen_device_info *devinfo = &brw->screen->devinfo; 1650 1651 if (devinfo->is_cherryview) 1652 sf.CHVLineWidth = brw_get_line_width(brw); 1653 else 1654 sf.LineWidth = brw_get_line_width(brw); 1655#else 1656 sf.LineWidth = brw_get_line_width(brw); 1657#endif 1658 1659 if (ctx->Line.SmoothFlag) { 1660 sf.LineEndCapAntialiasingRegionWidth = _10pixels; 1661#if GEN_GEN <= 7 1662 sf.AntiAliasingEnable = true; 1663#endif 1664 } 1665 1666 /* _NEW_POINT - Clamp to ARB_point_parameters user limits */ 1667 point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize); 1668 /* Clamp to the hardware limits */ 1669 sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f); 1670 1671 /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */ 1672 if (use_state_point_size(brw)) 1673 sf.PointWidthSource = State; 1674 1675#if GEN_GEN >= 8 1676 /* _NEW_POINT | _NEW_MULTISAMPLE */ 1677 if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) && 1678 !ctx->Point.PointSprite) 1679 sf.SmoothPointEnable = true; 1680#endif 1681 1682#if GEN_GEN == 10 1683 /* _NEW_BUFFERS 1684 * Smooth Point Enable bit MUST not be set when NUM_MULTISAMPLES > 1. 
1685 */ 1686 const bool multisampled_fbo = 1687 _mesa_geometric_samples(ctx->DrawBuffer) > 1; 1688 if (multisampled_fbo) 1689 sf.SmoothPointEnable = false; 1690#endif 1691 1692#if GEN_IS_G4X || GEN_GEN >= 5 1693 sf.AALineDistanceMode = AALINEDISTANCE_TRUE; 1694#endif 1695 1696 /* _NEW_LIGHT */ 1697 if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) { 1698 sf.TriangleStripListProvokingVertexSelect = 2; 1699 sf.TriangleFanProvokingVertexSelect = 2; 1700 sf.LineStripListProvokingVertexSelect = 1; 1701 } else { 1702 sf.TriangleFanProvokingVertexSelect = 1; 1703 } 1704 1705#if GEN_GEN == 6 1706 /* BRW_NEW_FS_PROG_DATA */ 1707 const struct brw_wm_prog_data *wm_prog_data = 1708 brw_wm_prog_data(brw->wm.base.prog_data); 1709 1710 sf.AttributeSwizzleEnable = true; 1711 sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; 1712 1713 /* 1714 * Window coordinates in an FBO are inverted, which means point 1715 * sprite origin must be inverted, too. 1716 */ 1717 if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) { 1718 sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT; 1719 } else { 1720 sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT; 1721 } 1722 1723 /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM | 1724 * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA 1725 */ 1726 uint32_t urb_entry_read_length; 1727 uint32_t urb_entry_read_offset; 1728 uint32_t point_sprite_enables; 1729 genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables, 1730 &urb_entry_read_length, 1731 &urb_entry_read_offset); 1732 sf.VertexURBEntryReadLength = urb_entry_read_length; 1733 sf.VertexURBEntryReadOffset = urb_entry_read_offset; 1734 sf.PointSpriteTextureCoordinateEnable = point_sprite_enables; 1735 sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs; 1736#endif 1737 } 1738} 1739 1740static const struct brw_tracked_state genX(sf_state) = { 1741 .dirty = { 1742 .mesa = _NEW_LIGHT | 1743 _NEW_LINE | 1744 _NEW_POINT | 1745 _NEW_PROGRAM | 1746 (GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0) | 1747 (GEN_GEN <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0) | 1748 (GEN_GEN == 10 ? _NEW_BUFFERS : 0), 1749 .brw = BRW_NEW_BLORP | 1750 BRW_NEW_VUE_MAP_GEOM_OUT | 1751 (GEN_GEN <= 5 ? BRW_NEW_BATCH | 1752 BRW_NEW_PROGRAM_CACHE | 1753 BRW_NEW_SF_PROG_DATA | 1754 BRW_NEW_SF_VP | 1755 BRW_NEW_URB_FENCE 1756 : 0) | 1757 (GEN_GEN >= 6 ? BRW_NEW_CONTEXT : 0) | 1758 (GEN_GEN >= 6 && GEN_GEN <= 7 ? 1759 BRW_NEW_GS_PROG_DATA | 1760 BRW_NEW_PRIMITIVE | 1761 BRW_NEW_TES_PROG_DATA 1762 : 0) | 1763 (GEN_GEN == 6 ? 
BRW_NEW_FS_PROG_DATA | 1764 BRW_NEW_FRAGMENT_PROGRAM 1765 : 0), 1766 }, 1767 .emit = genX(upload_sf), 1768}; 1769 1770/* ---------------------------------------------------------------------- */ 1771 1772static bool 1773brw_color_buffer_write_enabled(struct brw_context *brw) 1774{ 1775 struct gl_context *ctx = &brw->ctx; 1776 /* BRW_NEW_FRAGMENT_PROGRAM */ 1777 const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT]; 1778 unsigned i; 1779 1780 /* _NEW_BUFFERS */ 1781 for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) { 1782 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i]; 1783 uint64_t outputs_written = fp->info.outputs_written; 1784 1785 /* _NEW_COLOR */ 1786 if (rb && (outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR) || 1787 outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA0 + i)) && 1788 GET_COLORMASK(ctx->Color.ColorMask, i)) { 1789 return true; 1790 } 1791 } 1792 1793 return false; 1794} 1795 1796static void 1797genX(upload_wm)(struct brw_context *brw) 1798{ 1799 struct gl_context *ctx = &brw->ctx; 1800 1801 /* BRW_NEW_FS_PROG_DATA */ 1802 const struct brw_wm_prog_data *wm_prog_data = 1803 brw_wm_prog_data(brw->wm.base.prog_data); 1804 1805 UNUSED bool writes_depth = 1806 wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF; 1807 UNUSED struct brw_stage_state *stage_state = &brw->wm.base; 1808 UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo; 1809 1810#if GEN_GEN == 6 1811 /* We can't fold this into gen6_upload_wm_push_constants(), because 1812 * according to the SNB PRM, vol 2 part 1 section 7.2.2 1813 * (3DSTATE_CONSTANT_PS [DevSNB]): 1814 * 1815 * "[DevSNB]: This packet must be followed by WM_STATE." 1816 */ 1817 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) { 1818 if (wm_prog_data->base.nr_params != 0) { 1819 wmcp.Buffer0Valid = true; 1820 /* Pointer to the WM constant buffer. Covered by the set of 1821 * state flags from gen6_upload_wm_push_constants. 
1822 */ 1823 wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset; 1824 wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1; 1825 } 1826 } 1827#endif 1828 1829#if GEN_GEN >= 6 1830 brw_batch_emit(brw, GENX(3DSTATE_WM), wm) { 1831#else 1832 ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE; 1833 brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) { 1834#endif 1835 1836#if GEN_GEN <= 6 1837 wm._8PixelDispatchEnable = wm_prog_data->dispatch_8; 1838 wm._16PixelDispatchEnable = wm_prog_data->dispatch_16; 1839 wm._32PixelDispatchEnable = wm_prog_data->dispatch_32; 1840#endif 1841 1842#if GEN_GEN == 4 1843 /* On gen4, we only have one shader kernel */ 1844 if (brw_wm_state_has_ksp(wm, 0)) { 1845 assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0); 1846 wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset); 1847 wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0); 1848 wm.DispatchGRFStartRegisterForConstantSetupData0 = 1849 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0); 1850 } 1851#elif GEN_GEN == 5 1852 /* On gen5, we have multiple shader kernels but only one GRF start 1853 * register for all kernels 1854 */ 1855 wm.KernelStartPointer0 = stage_state->prog_offset + 1856 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0); 1857 wm.KernelStartPointer1 = stage_state->prog_offset + 1858 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1); 1859 wm.KernelStartPointer2 = stage_state->prog_offset + 1860 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2); 1861 1862 wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0); 1863 wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1); 1864 wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2); 1865 1866 wm.DispatchGRFStartRegisterForConstantSetupData0 = 1867 wm_prog_data->base.dispatch_grf_start_reg; 1868 1869 /* Dispatch GRF Start should be the same for all shaders on gen5 */ 1870 if (brw_wm_state_has_ksp(wm, 1)) { 1871 assert(wm_prog_data->base.dispatch_grf_start_reg == 1872 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1)); 1873 } 1874 if (brw_wm_state_has_ksp(wm, 2)) { 1875 assert(wm_prog_data->base.dispatch_grf_start_reg == 1876 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2)); 1877 } 1878#elif GEN_GEN == 6 1879 /* On gen6, we have multiple shader kernels and we no longer specify a 1880 * register count for each one. 
1881 */ 1882 wm.KernelStartPointer0 = stage_state->prog_offset + 1883 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0); 1884 wm.KernelStartPointer1 = stage_state->prog_offset + 1885 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1); 1886 wm.KernelStartPointer2 = stage_state->prog_offset + 1887 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2); 1888 1889 wm.DispatchGRFStartRegisterForConstantSetupData0 = 1890 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0); 1891 wm.DispatchGRFStartRegisterForConstantSetupData1 = 1892 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1); 1893 wm.DispatchGRFStartRegisterForConstantSetupData2 = 1894 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2); 1895#endif 1896 1897#if GEN_GEN <= 5 1898 wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length; 1899 /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */ 1900 wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2; 1901 wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2; 1902 wm.SetupURBEntryReadOffset = 0; 1903 wm.EarlyDepthTestEnable = true; 1904#endif 1905 1906#if GEN_GEN >= 6 1907 wm.LineAntialiasingRegionWidth = _10pixels; 1908 wm.LineEndCapAntialiasingRegionWidth = _05pixels; 1909 1910 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT; 1911 wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes; 1912#else 1913 if (stage_state->sampler_count) 1914 wm.SamplerStatePointer = 1915 ro_bo(brw->batch.state.bo, stage_state->sampler_offset); 1916 1917 wm.LineAntialiasingRegionWidth = _05pixels; 1918 wm.LineEndCapAntialiasingRegionWidth = _10pixels; 1919 1920 /* _NEW_POLYGON */ 1921 if (ctx->Polygon.OffsetFill) { 1922 wm.GlobalDepthOffsetEnable = true; 1923 /* Something weird going on with legacy_global_depth_bias, 1924 * offset_constant, scaling and MRD. This value passes glean 1925 * but gives some odd results elsewere (eg. the 1926 * quad-offset-units test). 1927 */ 1928 wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2; 1929 1930 /* This is the only value that passes glean: 1931 */ 1932 wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor; 1933 } 1934 1935 wm.DepthCoefficientURBReadOffset = 1; 1936#endif 1937 1938 /* BRW_NEW_STATS_WM */ 1939 wm.StatisticsEnable = GEN_GEN >= 6 || brw->stats_wm; 1940 1941#if GEN_GEN < 7 1942 if (wm_prog_data->base.use_alt_mode) 1943 wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 1944 1945 /* WA_1606682166 */ 1946 wm.SamplerCount = (GEN_GEN == 5 || GEN_GEN == 11) ? 1947 0 : DIV_ROUND_UP(stage_state->sampler_count, 4); 1948 1949 wm.BindingTableEntryCount = 1950 wm_prog_data->base.binding_table.size_bytes / 4; 1951 wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1; 1952 1953#if GEN_GEN == 6 1954 wm.DualSourceBlendEnable = 1955 wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) && 1956 ctx->Color.Blend[0]._UsesDualSrc; 1957 wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; 1958 wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; 1959 1960 /* From the SNB PRM, volume 2 part 1, page 281: 1961 * "If the PS kernel does not need the Position XY Offsets 1962 * to compute a Position XY value, then this field should be 1963 * programmed to POSOFFSET_NONE." 1964 * 1965 * "SW Recommendation: If the PS kernel needs the Position Offsets 1966 * to compute a Position XY value, this field should match Position 1967 * ZW Interpolation Mode to ensure a consistent position.xyzw 1968 * computation." 1969 * We only require XY sample offsets. 
So, this recommendation doesn't 1970 * look useful at the moment. We might need this in future. 1971 */ 1972 if (wm_prog_data->uses_pos_offset) 1973 wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE; 1974 else 1975 wm.PositionXYOffsetSelect = POSOFFSET_NONE; 1976#endif 1977 1978 if (wm_prog_data->base.total_scratch) { 1979 wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); 1980 wm.PerThreadScratchSpace = 1981 ffs(stage_state->per_thread_scratch) - 11; 1982 } 1983 1984 wm.PixelShaderComputedDepth = writes_depth; 1985#endif 1986 1987 /* _NEW_LINE */ 1988 wm.LineStippleEnable = ctx->Line.StippleFlag; 1989 1990 /* _NEW_POLYGON */ 1991 wm.PolygonStippleEnable = ctx->Polygon.StippleFlag; 1992 1993#if GEN_GEN < 8 1994 1995#if GEN_GEN >= 6 1996 wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w; 1997 1998 /* _NEW_BUFFERS */ 1999 const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1; 2000 2001 if (multisampled_fbo) { 2002 /* _NEW_MULTISAMPLE */ 2003 if (ctx->Multisample.Enabled) 2004 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN; 2005 else 2006 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL; 2007 2008 if (wm_prog_data->persample_dispatch) 2009 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; 2010 else 2011 wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL; 2012 } else { 2013 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL; 2014 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; 2015 } 2016#endif 2017 wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth; 2018 if (wm_prog_data->uses_kill || 2019 _mesa_is_alpha_test_enabled(ctx) || 2020 _mesa_is_alpha_to_coverage_enabled(ctx) || 2021 (GEN_GEN >= 6 && wm_prog_data->uses_omask)) { 2022 wm.PixelShaderKillsPixel = true; 2023 } 2024 2025 /* _NEW_BUFFERS | _NEW_COLOR */ 2026 if (brw_color_buffer_write_enabled(brw) || writes_depth || 2027 wm.PixelShaderKillsPixel || 2028 (GEN_GEN >= 6 && wm_prog_data->has_side_effects)) { 2029 wm.ThreadDispatchEnable = true; 2030 } 2031 2032#if GEN_GEN >= 7 2033 wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; 2034 wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask; 2035#endif 2036 2037 /* The "UAV access enable" bits are unnecessary on HSW because they only 2038 * seem to have an effect on the HW-assisted coherency mechanism which we 2039 * don't need, and the rasterization-related UAV_ONLY flag and the 2040 * DISPATCH_ENABLE bit can be set independently from it. 2041 * C.f. gen8_upload_ps_extra(). 2042 * 2043 * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS | 2044 * _NEW_COLOR 2045 */ 2046#if GEN_IS_HASWELL 2047 if (!(brw_color_buffer_write_enabled(brw) || writes_depth) && 2048 wm_prog_data->has_side_effects) 2049 wm.PSUAVonly = ON; 2050#endif 2051#endif 2052 2053#if GEN_GEN >= 7 2054 /* BRW_NEW_FS_PROG_DATA */ 2055 if (wm_prog_data->early_fragment_tests) 2056 wm.EarlyDepthStencilControl = EDSC_PREPS; 2057 else if (wm_prog_data->has_side_effects) 2058 wm.EarlyDepthStencilControl = EDSC_PSEXEC; 2059#endif 2060 } 2061 2062#if GEN_GEN <= 5 2063 if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) { 2064 brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) { 2065 clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp; 2066 } 2067 2068 brw->wm.offset_clamp = ctx->Polygon.OffsetClamp; 2069 } 2070#endif 2071} 2072 2073static const struct brw_tracked_state genX(wm_state) = { 2074 .dirty = { 2075 .mesa = _NEW_LINE | 2076 _NEW_POLYGON | 2077 (GEN_GEN < 8 ? 
_NEW_BUFFERS | 2078 _NEW_COLOR : 2079 0) | 2080 (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0) | 2081 (GEN_GEN < 6 ? _NEW_POLYGONSTIPPLE : 0) | 2082 (GEN_GEN < 8 && GEN_GEN >= 6 ? _NEW_MULTISAMPLE : 0), 2083 .brw = BRW_NEW_BLORP | 2084 BRW_NEW_FS_PROG_DATA | 2085 (GEN_GEN < 6 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION | 2086 BRW_NEW_FRAGMENT_PROGRAM | 2087 BRW_NEW_PROGRAM_CACHE | 2088 BRW_NEW_SAMPLER_STATE_TABLE | 2089 BRW_NEW_STATS_WM 2090 : 0) | 2091 (GEN_GEN < 7 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT), 2092 }, 2093 .emit = genX(upload_wm), 2094}; 2095 2096/* ---------------------------------------------------------------------- */ 2097 2098/* We restrict scratch buffers to the bottom 32 bits of the address space 2099 * by using rw_32_bo(). 2100 * 2101 * General State Base Address is a bit broken. If the address + size as 2102 * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat 2103 * all accesses to the buffer as being out of bounds and returns zero. 2104 */ 2105 2106#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \ 2107 pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset); \ 2108 /* WA_1606682166 */ \ 2109 pkt.SamplerCount = \ 2110 GEN_GEN == 11 ? \ 2111 0 : \ 2112 DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); \ 2113 /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to \ 2114 * disable prefetching of binding tables in A0 and B0 steppings. \ 2115 * TODO: Revisit this WA on C0 stepping. \ 2116 */ \ 2117 pkt.BindingTableEntryCount = \ 2118 GEN_GEN == 11 ? \ 2119 0 : \ 2120 stage_prog_data->binding_table.size_bytes / 4; \ 2121 pkt.FloatingPointMode = stage_prog_data->use_alt_mode; \ 2122 \ 2123 if (stage_prog_data->total_scratch) { \ 2124 pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \ 2125 pkt.PerThreadScratchSpace = \ 2126 ffs(stage_state->per_thread_scratch) - 11; \ 2127 } \ 2128 \ 2129 pkt.DispatchGRFStartRegisterForURBData = \ 2130 stage_prog_data->dispatch_grf_start_reg; \ 2131 pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length; \ 2132 pkt.prefix##URBEntryReadOffset = 0; \ 2133 \ 2134 pkt.StatisticsEnable = true; \ 2135 pkt.Enable = true; 2136 2137static void 2138genX(upload_vs_state)(struct brw_context *brw) 2139{ 2140 UNUSED struct gl_context *ctx = &brw->ctx; 2141 const struct gen_device_info *devinfo = &brw->screen->devinfo; 2142 struct brw_stage_state *stage_state = &brw->vs.base; 2143 2144 /* BRW_NEW_VS_PROG_DATA */ 2145 const struct brw_vue_prog_data *vue_prog_data = 2146 brw_vue_prog_data(brw->vs.base.prog_data); 2147 const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base; 2148 2149 assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 || 2150 vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT); 2151 assert(GEN_GEN < 11 || 2152 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8); 2153 2154#if GEN_GEN == 6 2155 /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State, 2156 * 3DSTATE_VS, Dword 5.0 "VS Function Enable": 2157 * 2158 * [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS 2159 * command that causes the VS Function Enable to toggle. Pipeline 2160 * flush can be executed by sending a PIPE_CONTROL command with CS 2161 * stall bit set and a post sync operation. 2162 * 2163 * We've already done such a flush at the start of state upload, so we 2164 * don't need to do another one here. 
2165 */ 2166 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) { 2167 if (stage_state->push_const_size != 0) { 2168 cvs.Buffer0Valid = true; 2169 cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset; 2170 cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1; 2171 } 2172 } 2173#endif 2174 2175 if (GEN_GEN == 7 && devinfo->is_ivybridge) 2176 gen7_emit_vs_workaround_flush(brw); 2177 2178#if GEN_GEN >= 6 2179 brw_batch_emit(brw, GENX(3DSTATE_VS), vs) { 2180#else 2181 ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE; 2182 brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) { 2183#endif 2184 INIT_THREAD_DISPATCH_FIELDS(vs, Vertex); 2185 2186 vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1; 2187 2188#if GEN_GEN < 6 2189 vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1; 2190 vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; 2191 vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; 2192 2193 vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GEN_GEN == 5 ? 2 : 0); 2194 vs.URBEntryAllocationSize = brw->urb.vsize - 1; 2195 2196 vs.MaximumNumberofThreads = 2197 CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1; 2198 2199 vs.StatisticsEnable = false; 2200 vs.SamplerStatePointer = 2201 ro_bo(brw->batch.state.bo, stage_state->sampler_offset); 2202#endif 2203 2204#if GEN_GEN == 5 2205 /* Force single program flow on Ironlake. We cannot reliably get 2206 * all applications working without it. See: 2207 * https://bugs.freedesktop.org/show_bug.cgi?id=29172 2208 * 2209 * The most notable and reliably failing application is the Humus 2210 * demo "CelShading" 2211 */ 2212 vs.SingleProgramFlow = true; 2213 vs.SamplerCount = 0; /* hardware requirement */ 2214#endif 2215 2216#if GEN_GEN >= 8 2217 vs.SIMD8DispatchEnable = 2218 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8; 2219 2220 vs.UserClipDistanceCullTestEnableBitmask = 2221 vue_prog_data->cull_distance_mask; 2222#endif 2223 } 2224 2225#if GEN_GEN == 6 2226 /* Based on my reading of the simulator, the VS constants don't get 2227 * pulled into the VS FF unit until an appropriate pipeline flush 2228 * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds 2229 * references to them into a little FIFO. The flushes are common, 2230 * but don't reliably happen between this and a 3DPRIMITIVE, causing 2231 * the primitive to use the wrong constants. Then the FIFO 2232 * containing the constant setup gets added to again on the next 2233 * constants change, and eventually when a flush does happen the 2234 * unit is overwhelmed by constant changes and dies. 2235 * 2236 * To avoid this, send a PIPE_CONTROL down the line that will 2237 * update the unit immediately loading the constants. The flush 2238 * type bits here were those set by the STATE_BASE_ADDRESS whose 2239 * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the 2240 * bug reports that led to this workaround, and may be more than 2241 * what is strictly required to avoid the issue. 2242 */ 2243 brw_emit_pipe_control_flush(brw, 2244 PIPE_CONTROL_DEPTH_STALL | 2245 PIPE_CONTROL_INSTRUCTION_INVALIDATE | 2246 PIPE_CONTROL_STATE_CACHE_INVALIDATE); 2247#endif 2248} 2249 2250static const struct brw_tracked_state genX(vs_state) = { 2251 .dirty = { 2252 .mesa = (GEN_GEN == 6 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0), 2253 .brw = BRW_NEW_BATCH | 2254 BRW_NEW_BLORP | 2255 BRW_NEW_CONTEXT | 2256 BRW_NEW_VS_PROG_DATA | 2257 (GEN_GEN == 6 ? 
BRW_NEW_VERTEX_PROGRAM : 0) | 2258 (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION | 2259 BRW_NEW_PROGRAM_CACHE | 2260 BRW_NEW_SAMPLER_STATE_TABLE | 2261 BRW_NEW_URB_FENCE 2262 : 0), 2263 }, 2264 .emit = genX(upload_vs_state), 2265}; 2266 2267/* ---------------------------------------------------------------------- */ 2268 2269static void 2270genX(upload_cc_viewport)(struct brw_context *brw) 2271{ 2272 struct gl_context *ctx = &brw->ctx; 2273 2274 /* BRW_NEW_VIEWPORT_COUNT */ 2275 const unsigned viewport_count = brw->clip.viewport_count; 2276 2277 struct GENX(CC_VIEWPORT) ccv; 2278 uint32_t cc_vp_offset; 2279 uint32_t *cc_map = 2280 brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * viewport_count, 2281 32, &cc_vp_offset); 2282 2283 for (unsigned i = 0; i < viewport_count; i++) { 2284 /* _NEW_VIEWPORT | _NEW_TRANSFORM */ 2285 const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i]; 2286 if (ctx->Transform.DepthClampNear && ctx->Transform.DepthClampFar) { 2287 ccv.MinimumDepth = MIN2(vp->Near, vp->Far); 2288 ccv.MaximumDepth = MAX2(vp->Near, vp->Far); 2289 } else if (ctx->Transform.DepthClampNear) { 2290 ccv.MinimumDepth = MIN2(vp->Near, vp->Far); 2291 ccv.MaximumDepth = 0.0; 2292 } else if (ctx->Transform.DepthClampFar) { 2293 ccv.MinimumDepth = 0.0; 2294 ccv.MaximumDepth = MAX2(vp->Near, vp->Far); 2295 } else { 2296 ccv.MinimumDepth = 0.0; 2297 ccv.MaximumDepth = 1.0; 2298 } 2299 GENX(CC_VIEWPORT_pack)(NULL, cc_map, &ccv); 2300 cc_map += GENX(CC_VIEWPORT_length); 2301 } 2302 2303#if GEN_GEN >= 7 2304 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) { 2305 ptr.CCViewportPointer = cc_vp_offset; 2306 } 2307#elif GEN_GEN == 6 2308 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) { 2309 vp.CCViewportStateChange = 1; 2310 vp.PointertoCC_VIEWPORT = cc_vp_offset; 2311 } 2312#else 2313 brw->cc.vp_offset = cc_vp_offset; 2314 ctx->NewDriverState |= BRW_NEW_CC_VP; 2315#endif 2316} 2317 2318const struct brw_tracked_state genX(cc_vp) = { 2319 .dirty = { 2320 .mesa = _NEW_TRANSFORM | 2321 _NEW_VIEWPORT, 2322 .brw = BRW_NEW_BATCH | 2323 BRW_NEW_BLORP | 2324 BRW_NEW_VIEWPORT_COUNT, 2325 }, 2326 .emit = genX(upload_cc_viewport) 2327}; 2328 2329/* ---------------------------------------------------------------------- */ 2330 2331static void 2332set_scissor_bits(const struct gl_context *ctx, int i, 2333 bool flip_y, unsigned fb_width, unsigned fb_height, 2334 struct GENX(SCISSOR_RECT) *sc) 2335{ 2336 int bbox[4]; 2337 2338 bbox[0] = MAX2(ctx->ViewportArray[i].X, 0); 2339 bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width); 2340 bbox[2] = CLAMP(ctx->ViewportArray[i].Y, 0, fb_height); 2341 bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height); 2342 _mesa_intersect_scissor_bounding_box(ctx, i, bbox); 2343 2344 if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) { 2345 /* If the scissor was out of bounds and got clamped to 0 width/height 2346 * at the bounds, the subtraction of 1 from maximums could produce a 2347 * negative number and thus not clip anything. Instead, just provide 2348 * a min > max scissor inside the bounds, which produces the expected 2349 * no rendering. 
2350 */ 2351 sc->ScissorRectangleXMin = 1; 2352 sc->ScissorRectangleXMax = 0; 2353 sc->ScissorRectangleYMin = 1; 2354 sc->ScissorRectangleYMax = 0; 2355 } else if (!flip_y) { 2356 /* texmemory: Y=0=bottom */ 2357 sc->ScissorRectangleXMin = bbox[0]; 2358 sc->ScissorRectangleXMax = bbox[1] - 1; 2359 sc->ScissorRectangleYMin = bbox[2]; 2360 sc->ScissorRectangleYMax = bbox[3] - 1; 2361 } else { 2362 /* memory: Y=0=top */ 2363 sc->ScissorRectangleXMin = bbox[0]; 2364 sc->ScissorRectangleXMax = bbox[1] - 1; 2365 sc->ScissorRectangleYMin = fb_height - bbox[3]; 2366 sc->ScissorRectangleYMax = fb_height - bbox[2] - 1; 2367 } 2368} 2369 2370#if GEN_GEN >= 6 2371static void 2372genX(upload_scissor_state)(struct brw_context *brw) 2373{ 2374 struct gl_context *ctx = &brw->ctx; 2375 const bool flip_y = ctx->DrawBuffer->FlipY; 2376 struct GENX(SCISSOR_RECT) scissor; 2377 uint32_t scissor_state_offset; 2378 const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer); 2379 const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer); 2380 uint32_t *scissor_map; 2381 2382 /* BRW_NEW_VIEWPORT_COUNT */ 2383 const unsigned viewport_count = brw->clip.viewport_count; 2384 2385 scissor_map = brw_state_batch( 2386 brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count, 2387 32, &scissor_state_offset); 2388 2389 /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */ 2390 2391 /* The scissor only needs to handle the intersection of drawable and 2392 * scissor rect. Clipping to the boundaries of static shared buffers 2393 * for front/back/depth is covered by looping over cliprects in brw_draw.c. 2394 * 2395 * Note that the hardware's coordinates are inclusive, while Mesa's min is 2396 * inclusive but max is exclusive. 2397 */ 2398 for (unsigned i = 0; i < viewport_count; i++) { 2399 set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor); 2400 GENX(SCISSOR_RECT_pack)( 2401 NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor); 2402 } 2403 2404 brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) { 2405 ptr.ScissorRectPointer = scissor_state_offset; 2406 } 2407} 2408 2409static const struct brw_tracked_state genX(scissor_state) = { 2410 .dirty = { 2411 .mesa = _NEW_BUFFERS | 2412 _NEW_SCISSOR | 2413 _NEW_VIEWPORT, 2414 .brw = BRW_NEW_BATCH | 2415 BRW_NEW_BLORP | 2416 BRW_NEW_VIEWPORT_COUNT, 2417 }, 2418 .emit = genX(upload_scissor_state), 2419}; 2420#endif 2421 2422/* ---------------------------------------------------------------------- */ 2423 2424static void 2425brw_calculate_guardband_size(uint32_t fb_width, uint32_t fb_height, 2426 float m00, float m11, float m30, float m31, 2427 float *xmin, float *xmax, 2428 float *ymin, float *ymax) 2429{ 2430 /* According to the "Vertex X,Y Clamping and Quantization" section of the 2431 * Strips and Fans documentation: 2432 * 2433 * "The vertex X and Y screen-space coordinates are also /clamped/ to the 2434 * fixed-point "guardband" range supported by the rasterization hardware" 2435 * 2436 * and 2437 * 2438 * "In almost all circumstances, if an object’s vertices are actually 2439 * modified by this clamping (i.e., had X or Y coordinates outside of 2440 * the guardband extent the rendered object will not match the intended 2441 * result. Therefore software should take steps to ensure that this does 2442 * not happen - e.g., by clipping objects such that they do not exceed 2443 * these limits after the Drawing Rectangle is applied." 
2444 * 2445 * I believe the fundamental restriction is that the rasterizer (in 2446 * the SF/WM stages) have a limit on the number of pixels that can be 2447 * rasterized. We need to ensure any coordinates beyond the rasterizer 2448 * limit are handled by the clipper. So effectively that limit becomes 2449 * the clipper's guardband size. 2450 * 2451 * It goes on to say: 2452 * 2453 * "In addition, in order to be correctly rendered, objects must have a 2454 * screenspace bounding box not exceeding 8K in the X or Y direction. 2455 * This additional restriction must also be comprehended by software, 2456 * i.e., enforced by use of clipping." 2457 * 2458 * This makes no sense. Gen7+ hardware supports 16K render targets, 2459 * and you definitely need to be able to draw polygons that fill the 2460 * surface. Our assumption is that the rasterizer was limited to 8K 2461 * on Sandybridge, which only supports 8K surfaces, and it was actually 2462 * increased to 16K on Ivybridge and later. 2463 * 2464 * So, limit the guardband to 16K on Gen7+ and 8K on Sandybridge. 2465 */ 2466 const float gb_size = GEN_GEN >= 7 ? 16384.0f : 8192.0f; 2467 2468 /* Workaround: prevent gpu hangs on SandyBridge 2469 * by disabling guardband clipping for odd dimensions. 2470 */ 2471 if (GEN_GEN == 6 && (fb_width & 1 || fb_height & 1)) { 2472 *xmin = -1.0f; 2473 *xmax = 1.0f; 2474 *ymin = -1.0f; 2475 *ymax = 1.0f; 2476 return; 2477 } 2478 2479 if (m00 != 0 && m11 != 0) { 2480 /* First, we compute the screen-space render area */ 2481 const float ss_ra_xmin = MIN3( 0, m30 + m00, m30 - m00); 2482 const float ss_ra_xmax = MAX3( fb_width, m30 + m00, m30 - m00); 2483 const float ss_ra_ymin = MIN3( 0, m31 + m11, m31 - m11); 2484 const float ss_ra_ymax = MAX3(fb_height, m31 + m11, m31 - m11); 2485 2486 /* We want the guardband to be centered on that */ 2487 const float ss_gb_xmin = (ss_ra_xmin + ss_ra_xmax) / 2 - gb_size; 2488 const float ss_gb_xmax = (ss_ra_xmin + ss_ra_xmax) / 2 + gb_size; 2489 const float ss_gb_ymin = (ss_ra_ymin + ss_ra_ymax) / 2 - gb_size; 2490 const float ss_gb_ymax = (ss_ra_ymin + ss_ra_ymax) / 2 + gb_size; 2491 2492 /* Now we need it in native device coordinates */ 2493 const float ndc_gb_xmin = (ss_gb_xmin - m30) / m00; 2494 const float ndc_gb_xmax = (ss_gb_xmax - m30) / m00; 2495 const float ndc_gb_ymin = (ss_gb_ymin - m31) / m11; 2496 const float ndc_gb_ymax = (ss_gb_ymax - m31) / m11; 2497 2498 /* Thanks to Y-flipping and ORIGIN_UPPER_LEFT, the Y coordinates may be 2499 * flipped upside-down. X should be fine though. 2500 */ 2501 assert(ndc_gb_xmin <= ndc_gb_xmax); 2502 *xmin = ndc_gb_xmin; 2503 *xmax = ndc_gb_xmax; 2504 *ymin = MIN2(ndc_gb_ymin, ndc_gb_ymax); 2505 *ymax = MAX2(ndc_gb_ymin, ndc_gb_ymax); 2506 } else { 2507 /* The viewport scales to 0, so nothing will be rendered. 
*/ 2508 *xmin = 0.0f; 2509 *xmax = 0.0f; 2510 *ymin = 0.0f; 2511 *ymax = 0.0f; 2512 } 2513} 2514 2515static void 2516genX(upload_sf_clip_viewport)(struct brw_context *brw) 2517{ 2518 struct gl_context *ctx = &brw->ctx; 2519 float y_scale, y_bias; 2520 2521 /* BRW_NEW_VIEWPORT_COUNT */ 2522 const unsigned viewport_count = brw->clip.viewport_count; 2523 2524 /* _NEW_BUFFERS */ 2525 const bool flip_y = ctx->DrawBuffer->FlipY; 2526 const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer); 2527 const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer); 2528 2529#if GEN_GEN >= 7 2530#define clv sfv 2531 struct GENX(SF_CLIP_VIEWPORT) sfv; 2532 uint32_t sf_clip_vp_offset; 2533 uint32_t *sf_clip_map = 2534 brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count, 2535 64, &sf_clip_vp_offset); 2536#else 2537 struct GENX(SF_VIEWPORT) sfv; 2538 struct GENX(CLIP_VIEWPORT) clv; 2539 uint32_t sf_vp_offset, clip_vp_offset; 2540 uint32_t *sf_map = 2541 brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count, 2542 32, &sf_vp_offset); 2543 uint32_t *clip_map = 2544 brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count, 2545 32, &clip_vp_offset); 2546#endif 2547 2548 /* _NEW_BUFFERS */ 2549 if (flip_y) { 2550 y_scale = -1.0; 2551 y_bias = (float)fb_height; 2552 } else { 2553 y_scale = 1.0; 2554 y_bias = 0; 2555 } 2556 2557 for (unsigned i = 0; i < brw->clip.viewport_count; i++) { 2558 /* _NEW_VIEWPORT: Guardband Clipping */ 2559 float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax; 2560 _mesa_get_viewport_xform(ctx, i, scale, translate); 2561 2562 sfv.ViewportMatrixElementm00 = scale[0]; 2563 sfv.ViewportMatrixElementm11 = scale[1] * y_scale, 2564 sfv.ViewportMatrixElementm22 = scale[2], 2565 sfv.ViewportMatrixElementm30 = translate[0], 2566 sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias, 2567 sfv.ViewportMatrixElementm32 = translate[2], 2568 brw_calculate_guardband_size(fb_width, fb_height, 2569 sfv.ViewportMatrixElementm00, 2570 sfv.ViewportMatrixElementm11, 2571 sfv.ViewportMatrixElementm30, 2572 sfv.ViewportMatrixElementm31, 2573 &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax); 2574 2575 2576 clv.XMinClipGuardband = gb_xmin; 2577 clv.XMaxClipGuardband = gb_xmax; 2578 clv.YMinClipGuardband = gb_ymin; 2579 clv.YMaxClipGuardband = gb_ymax; 2580 2581#if GEN_GEN < 6 2582 set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, 2583 &sfv.ScissorRectangle); 2584#elif GEN_GEN >= 8 2585 /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport 2586 * The hardware will take the intersection of the drawing rectangle, 2587 * scissor rectangle, and the viewport extents. However, emitting 2588 * 3DSTATE_DRAWING_RECTANGLE is expensive since it requires a full 2589 * pipeline stall so we're better off just being a little more clever 2590 * with our viewport so we can emit it once at context creation time. 
2591 */ 2592 const float viewport_Xmin = MAX2(ctx->ViewportArray[i].X, 0); 2593 const float viewport_Ymin = MAX2(ctx->ViewportArray[i].Y, 0); 2594 const float viewport_Xmax = 2595 MIN2(ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width, fb_width); 2596 const float viewport_Ymax = 2597 MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height); 2598 2599 if (flip_y) { 2600 sfv.XMinViewPort = viewport_Xmin; 2601 sfv.XMaxViewPort = viewport_Xmax - 1; 2602 sfv.YMinViewPort = fb_height - viewport_Ymax; 2603 sfv.YMaxViewPort = fb_height - viewport_Ymin - 1; 2604 } else { 2605 sfv.XMinViewPort = viewport_Xmin; 2606 sfv.XMaxViewPort = viewport_Xmax - 1; 2607 sfv.YMinViewPort = viewport_Ymin; 2608 sfv.YMaxViewPort = viewport_Ymax - 1; 2609 } 2610#endif 2611 2612#if GEN_GEN >= 7 2613 GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv); 2614 sf_clip_map += GENX(SF_CLIP_VIEWPORT_length); 2615#else 2616 GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv); 2617 GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv); 2618 sf_map += GENX(SF_VIEWPORT_length); 2619 clip_map += GENX(CLIP_VIEWPORT_length); 2620#endif 2621 } 2622 2623#if GEN_GEN >= 7 2624 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) { 2625 ptr.SFClipViewportPointer = sf_clip_vp_offset; 2626 } 2627#elif GEN_GEN == 6 2628 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) { 2629 vp.SFViewportStateChange = 1; 2630 vp.CLIPViewportStateChange = 1; 2631 vp.PointertoCLIP_VIEWPORT = clip_vp_offset; 2632 vp.PointertoSF_VIEWPORT = sf_vp_offset; 2633 } 2634#else 2635 brw->sf.vp_offset = sf_vp_offset; 2636 brw->clip.vp_offset = clip_vp_offset; 2637 brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP; 2638#endif 2639} 2640 2641static const struct brw_tracked_state genX(sf_clip_viewport) = { 2642 .dirty = { 2643 .mesa = _NEW_BUFFERS | 2644 _NEW_VIEWPORT | 2645 (GEN_GEN <= 5 ? 
_NEW_SCISSOR : 0), 2646 .brw = BRW_NEW_BATCH | 2647 BRW_NEW_BLORP | 2648 BRW_NEW_VIEWPORT_COUNT, 2649 }, 2650 .emit = genX(upload_sf_clip_viewport), 2651}; 2652 2653/* ---------------------------------------------------------------------- */ 2654 2655static void 2656genX(upload_gs_state)(struct brw_context *brw) 2657{ 2658 UNUSED struct gl_context *ctx = &brw->ctx; 2659 UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo; 2660 const struct brw_stage_state *stage_state = &brw->gs.base; 2661 const struct gl_program *gs_prog = brw->programs[MESA_SHADER_GEOMETRY]; 2662 /* BRW_NEW_GEOMETRY_PROGRAM */ 2663 bool active = GEN_GEN >= 6 && gs_prog; 2664 2665 /* BRW_NEW_GS_PROG_DATA */ 2666 struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data; 2667 UNUSED const struct brw_vue_prog_data *vue_prog_data = 2668 brw_vue_prog_data(stage_prog_data); 2669#if GEN_GEN >= 7 2670 const struct brw_gs_prog_data *gs_prog_data = 2671 brw_gs_prog_data(stage_prog_data); 2672#endif 2673 2674#if GEN_GEN == 6 2675 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) { 2676 if (active && stage_state->push_const_size != 0) { 2677 cgs.Buffer0Valid = true; 2678 cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset; 2679 cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1; 2680 } 2681 } 2682#endif 2683 2684#if GEN_GEN == 7 && !GEN_IS_HASWELL 2685 /** 2686 * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages > 2687 * Geometry > Geometry Shader > State: 2688 * 2689 * "Note: Because of corruption in IVB:GT2, software needs to flush the 2690 * whole fixed function pipeline when the GS enable changes value in 2691 * the 3DSTATE_GS." 2692 * 2693 * The hardware architects have clarified that in this context "flush the 2694 * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS 2695 * Stall" bit set. 2696 */ 2697 if (devinfo->gt == 2 && brw->gs.enabled != active) 2698 gen7_emit_cs_stall_flush(brw); 2699#endif 2700 2701#if GEN_GEN >= 6 2702 brw_batch_emit(brw, GENX(3DSTATE_GS), gs) { 2703#else 2704 ctx->NewDriverState |= BRW_NEW_GEN4_UNIT_STATE; 2705 brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) { 2706#endif 2707 2708#if GEN_GEN >= 6 2709 if (active) { 2710 INIT_THREAD_DISPATCH_FIELDS(gs, Vertex); 2711 2712#if GEN_GEN >= 7 2713 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1; 2714 gs.OutputTopology = gs_prog_data->output_topology; 2715 gs.ControlDataHeaderSize = 2716 gs_prog_data->control_data_header_size_hwords; 2717 2718 gs.InstanceControl = gs_prog_data->invocations - 1; 2719 gs.DispatchMode = vue_prog_data->dispatch_mode; 2720 2721 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id; 2722 2723 gs.ControlDataFormat = gs_prog_data->control_data_format; 2724#endif 2725 2726 /* Note: the meaning of the GEN7_GS_REORDER_TRAILING bit changes between 2727 * Ivy Bridge and Haswell. 2728 * 2729 * On Ivy Bridge, setting this bit causes the vertices of a triangle 2730 * strip to be delivered to the geometry shader in an order that does 2731 * not strictly follow the OpenGL spec, but preserves triangle 2732 * orientation. For example, if the vertices are (1, 2, 3, 4, 5), then 2733 * the geometry shader sees triangles: 2734 * 2735 * (1, 2, 3), (2, 4, 3), (3, 4, 5) 2736 * 2737 * (Clearing the bit is even worse, because it fails to preserve 2738 * orientation). 
2739 * 2740 * Triangle strips with adjacency always ordered in a way that preserves 2741 * triangle orientation but does not strictly follow the OpenGL spec, 2742 * regardless of the setting of this bit. 2743 * 2744 * On Haswell, both triangle strips and triangle strips with adjacency 2745 * are always ordered in a way that preserves triangle orientation. 2746 * Setting this bit causes the ordering to strictly follow the OpenGL 2747 * spec. 2748 * 2749 * So in either case we want to set the bit. Unfortunately on Ivy 2750 * Bridge this will get the order close to correct but not perfect. 2751 */ 2752 gs.ReorderMode = TRAILING; 2753 gs.MaximumNumberofThreads = 2754 GEN_GEN == 8 ? (devinfo->max_gs_threads / 2 - 1) 2755 : (devinfo->max_gs_threads - 1); 2756 2757#if GEN_GEN < 7 2758 gs.SOStatisticsEnable = true; 2759 if (gs_prog->info.has_transform_feedback_varyings) 2760 gs.SVBIPayloadEnable = _mesa_is_xfb_active_and_unpaused(ctx); 2761 2762 /* GEN6_GS_SPF_MODE and GEN6_GS_VECTOR_MASK_ENABLE are enabled as it 2763 * was previously done for gen6. 2764 * 2765 * TODO: test with both disabled to see if the HW is behaving 2766 * as expected, like in gen7. 2767 */ 2768 gs.SingleProgramFlow = true; 2769 gs.VectorMaskEnable = true; 2770#endif 2771 2772#if GEN_GEN >= 8 2773 gs.ExpectedVertexCount = gs_prog_data->vertices_in; 2774 2775 if (gs_prog_data->static_vertex_count != -1) { 2776 gs.StaticOutput = true; 2777 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count; 2778 } 2779 gs.IncludeVertexHandles = vue_prog_data->include_vue_handles; 2780 2781 gs.UserClipDistanceCullTestEnableBitmask = 2782 vue_prog_data->cull_distance_mask; 2783 2784 const int urb_entry_write_offset = 1; 2785 const uint32_t urb_entry_output_length = 2786 DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) - 2787 urb_entry_write_offset; 2788 2789 gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset; 2790 gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1); 2791#endif 2792 } 2793#endif 2794 2795#if GEN_GEN <= 6 2796 if (!active && brw->ff_gs.prog_active) { 2797 /* In gen6, transform feedback for the VS stage is done with an 2798 * ad-hoc GS program. This function provides the needed 3DSTATE_GS 2799 * for this. 2800 */ 2801 gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset); 2802 gs.SingleProgramFlow = true; 2803 gs.DispatchGRFStartRegisterForURBData = GEN_GEN == 6 ? 2 : 1; 2804 gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length; 2805 2806#if GEN_GEN <= 5 2807 gs.GRFRegisterCount = 2808 DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1; 2809 /* BRW_NEW_URB_FENCE */ 2810 gs.NumberofURBEntries = brw->urb.nr_gs_entries; 2811 gs.URBEntryAllocationSize = brw->urb.vsize - 1; 2812 gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 
1 : 0; 2813 gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 2814#else 2815 gs.Enable = true; 2816 gs.VectorMaskEnable = true; 2817 gs.SVBIPayloadEnable = true; 2818 gs.SVBIPostIncrementEnable = true; 2819 gs.SVBIPostIncrementValue = 2820 brw->ff_gs.prog_data->svbi_postincrement_value; 2821 gs.SOStatisticsEnable = true; 2822 gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1; 2823#endif 2824 } 2825#endif 2826 if (!active && !brw->ff_gs.prog_active) { 2827#if GEN_GEN < 8 2828 gs.DispatchGRFStartRegisterForURBData = 1; 2829#if GEN_GEN >= 7 2830 gs.IncludeVertexHandles = true; 2831#endif 2832#endif 2833 } 2834 2835#if GEN_GEN >= 6 2836 gs.StatisticsEnable = true; 2837#endif 2838#if GEN_GEN == 5 || GEN_GEN == 6 2839 gs.RenderingEnabled = true; 2840#endif 2841#if GEN_GEN <= 5 2842 gs.MaximumVPIndex = brw->clip.viewport_count - 1; 2843#endif 2844 } 2845 2846#if GEN_GEN == 6 2847 brw->gs.enabled = active; 2848#endif 2849} 2850 2851static const struct brw_tracked_state genX(gs_state) = { 2852 .dirty = { 2853 .mesa = (GEN_GEN == 6 ? _NEW_PROGRAM_CONSTANTS : 0), 2854 .brw = BRW_NEW_BATCH | 2855 BRW_NEW_BLORP | 2856 (GEN_GEN <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION | 2857 BRW_NEW_PROGRAM_CACHE | 2858 BRW_NEW_URB_FENCE | 2859 BRW_NEW_VIEWPORT_COUNT 2860 : 0) | 2861 (GEN_GEN >= 6 ? BRW_NEW_CONTEXT | 2862 BRW_NEW_GEOMETRY_PROGRAM | 2863 BRW_NEW_GS_PROG_DATA 2864 : 0) | 2865 (GEN_GEN < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0), 2866 }, 2867 .emit = genX(upload_gs_state), 2868}; 2869 2870/* ---------------------------------------------------------------------- */ 2871 2872UNUSED static GLenum 2873fix_dual_blend_alpha_to_one(GLenum function) 2874{ 2875 switch (function) { 2876 case GL_SRC1_ALPHA: 2877 return GL_ONE; 2878 2879 case GL_ONE_MINUS_SRC1_ALPHA: 2880 return GL_ZERO; 2881 } 2882 2883 return function; 2884} 2885 2886#define blend_factor(x) brw_translate_blend_factor(x) 2887#define blend_eqn(x) brw_translate_blend_equation(x) 2888 2889/** 2890 * Modify blend function to force destination alpha to 1.0 2891 * 2892 * If \c function specifies a blend function that uses destination alpha, 2893 * replace it with a function that hard-wires destination alpha to 1.0. This 2894 * is used when rendering to xRGB targets. 2895 */ 2896static GLenum 2897brw_fix_xRGB_alpha(GLenum function) 2898{ 2899 switch (function) { 2900 case GL_DST_ALPHA: 2901 return GL_ONE; 2902 2903 case GL_ONE_MINUS_DST_ALPHA: 2904 case GL_SRC_ALPHA_SATURATE: 2905 return GL_ZERO; 2906 } 2907 2908 return function; 2909} 2910 2911#if GEN_GEN >= 6 2912typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML; 2913#else 2914typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML; 2915#endif 2916 2917UNUSED static bool 2918set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i, 2919 bool alpha_to_one) 2920{ 2921 struct gl_context *ctx = &brw->ctx; 2922 2923 /* _NEW_BUFFERS */ 2924 const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i]; 2925 2926 bool independent_alpha_blend = false; 2927 2928 /* Used for implementing the following bit of GL_EXT_texture_integer: 2929 * "Per-fragment operations that require floating-point color 2930 * components, including multisample alpha operations, alpha test, 2931 * blending, and dithering, have no effect when the corresponding 2932 * colors are written to an integer color buffer." 2933 */ 2934 const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i); 2935 2936 const unsigned blend_enabled = GEN_GEN >= 6 ? 
2937 ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled; 2938 2939 /* _NEW_COLOR */ 2940 if (ctx->Color.ColorLogicOpEnabled) { 2941 GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format) 2942 : GL_UNSIGNED_NORMALIZED; 2943 WARN_ONCE(ctx->Color.LogicOp != GL_COPY && 2944 rb_type != GL_UNSIGNED_NORMALIZED && 2945 rb_type != GL_FLOAT, "Ignoring %s logic op on %s " 2946 "renderbuffer\n", 2947 _mesa_enum_to_string(ctx->Color.LogicOp), 2948 _mesa_enum_to_string(rb_type)); 2949 if (GEN_GEN >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) { 2950 entry->LogicOpEnable = true; 2951 entry->LogicOpFunction = ctx->Color._LogicOp; 2952 } 2953 } else if (blend_enabled && !ctx->Color._AdvancedBlendMode 2954 && (GEN_GEN <= 5 || !integer)) { 2955 GLenum eqRGB = ctx->Color.Blend[i].EquationRGB; 2956 GLenum eqA = ctx->Color.Blend[i].EquationA; 2957 GLenum srcRGB = ctx->Color.Blend[i].SrcRGB; 2958 GLenum dstRGB = ctx->Color.Blend[i].DstRGB; 2959 GLenum srcA = ctx->Color.Blend[i].SrcA; 2960 GLenum dstA = ctx->Color.Blend[i].DstA; 2961 2962 if (eqRGB == GL_MIN || eqRGB == GL_MAX) 2963 srcRGB = dstRGB = GL_ONE; 2964 2965 if (eqA == GL_MIN || eqA == GL_MAX) 2966 srcA = dstA = GL_ONE; 2967 2968 /* Due to hardware limitations, the destination may have information 2969 * in an alpha channel even when the format specifies no alpha 2970 * channel. In order to avoid getting any incorrect blending due to 2971 * that alpha channel, coerce the blend factors to values that will 2972 * not read the alpha channel, but will instead use the correct 2973 * implicit value for alpha. 2974 */ 2975 if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat, 2976 GL_TEXTURE_ALPHA_TYPE)) { 2977 srcRGB = brw_fix_xRGB_alpha(srcRGB); 2978 srcA = brw_fix_xRGB_alpha(srcA); 2979 dstRGB = brw_fix_xRGB_alpha(dstRGB); 2980 dstA = brw_fix_xRGB_alpha(dstA); 2981 } 2982 2983 /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable): 2984 * "If Dual Source Blending is enabled, this bit must be disabled." 2985 * 2986 * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO, 2987 * and leave it enabled anyway. 2988 */ 2989 if (GEN_GEN >= 6 && ctx->Color.Blend[i]._UsesDualSrc && alpha_to_one) { 2990 srcRGB = fix_dual_blend_alpha_to_one(srcRGB); 2991 srcA = fix_dual_blend_alpha_to_one(srcA); 2992 dstRGB = fix_dual_blend_alpha_to_one(dstRGB); 2993 dstA = fix_dual_blend_alpha_to_one(dstA); 2994 } 2995 2996 /* BRW_NEW_FS_PROG_DATA */ 2997 const struct brw_wm_prog_data *wm_prog_data = 2998 brw_wm_prog_data(brw->wm.base.prog_data); 2999 3000 /* The Dual Source Blending documentation says: 3001 * 3002 * "If SRC1 is included in a src/dst blend factor and 3003 * a DualSource RT Write message is not used, results 3004 * are UNDEFINED. (This reflects the same restriction in DX APIs, 3005 * where undefined results are produced if “o1” is not written 3006 * by a PS – there are no default values defined). 3007 * If SRC1 is not included in a src/dst blend factor, 3008 * dual source blending must be disabled." 3009 * 3010 * There is no way to gracefully fix this undefined situation 3011 * so we just disable the blending to prevent possible issues. 
3012 */ 3013 entry->ColorBufferBlendEnable = 3014 !ctx->Color.Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend; 3015 3016 entry->DestinationBlendFactor = blend_factor(dstRGB); 3017 entry->SourceBlendFactor = blend_factor(srcRGB); 3018 entry->DestinationAlphaBlendFactor = blend_factor(dstA); 3019 entry->SourceAlphaBlendFactor = blend_factor(srcA); 3020 entry->ColorBlendFunction = blend_eqn(eqRGB); 3021 entry->AlphaBlendFunction = blend_eqn(eqA); 3022 3023 if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) 3024 independent_alpha_blend = true; 3025 } 3026 3027 return independent_alpha_blend; 3028} 3029 3030#if GEN_GEN >= 6 3031static void 3032genX(upload_blend_state)(struct brw_context *brw) 3033{ 3034 struct gl_context *ctx = &brw->ctx; 3035 int size; 3036 3037 /* We need at least one BLEND_STATE written, because we might do 3038 * thread dispatch even if _NumColorDrawBuffers is 0 (for example 3039 * for computed depth or alpha test), which will do an FB write 3040 * with render target 0, which will reference BLEND_STATE[0] for 3041 * alpha test enable. 3042 */ 3043 int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers; 3044 if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled) 3045 nr_draw_buffers = 1; 3046 3047 size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers; 3048#if GEN_GEN >= 8 3049 size += GENX(BLEND_STATE_length) * 4; 3050#endif 3051 3052 uint32_t *blend_map; 3053 blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset); 3054 3055#if GEN_GEN >= 8 3056 struct GENX(BLEND_STATE) blend = { 0 }; 3057 { 3058#else 3059 for (int i = 0; i < nr_draw_buffers; i++) { 3060 struct GENX(BLEND_STATE_ENTRY) entry = { 0 }; 3061#define blend entry 3062#endif 3063 /* OpenGL specification 3.3 (page 196), section 4.1.3 says: 3064 * "If drawbuffer zero is not NONE and the buffer it references has an 3065 * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE 3066 * operations are skipped." 3067 */ 3068 if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) { 3069 /* _NEW_MULTISAMPLE */ 3070 if (_mesa_is_multisample_enabled(ctx)) { 3071 if (ctx->Multisample.SampleAlphaToCoverage) { 3072 blend.AlphaToCoverageEnable = true; 3073 blend.AlphaToCoverageDitherEnable = GEN_GEN >= 7; 3074 } 3075 if (ctx->Multisample.SampleAlphaToOne) 3076 blend.AlphaToOneEnable = true; 3077 } 3078 3079 /* _NEW_COLOR */ 3080 if (ctx->Color.AlphaEnabled) { 3081 blend.AlphaTestEnable = true; 3082 blend.AlphaTestFunction = 3083 intel_translate_compare_func(ctx->Color.AlphaFunc); 3084 } 3085 3086 if (ctx->Color.DitherFlag) { 3087 blend.ColorDitherEnable = true; 3088 } 3089 } 3090 3091#if GEN_GEN >= 8 3092 for (int i = 0; i < nr_draw_buffers; i++) { 3093 struct GENX(BLEND_STATE_ENTRY) entry = { 0 }; 3094#else 3095 { 3096#endif 3097 blend.IndependentAlphaBlendEnable = 3098 set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) || 3099 blend.IndependentAlphaBlendEnable; 3100 3101 /* See section 8.1.6 "Pre-Blend Color Clamping" of the 3102 * SandyBridge PRM Volume 2 Part 1 for HW requirements. 3103 * 3104 * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR 3105 * clamping in the fragment shader. For its clamping of 3106 * blending, the spec says: 3107 * 3108 * "RESOLVED: For fixed-point color buffers, the inputs and 3109 * the result of the blending equation are clamped. For 3110 * floating-point color buffers, no clamping occurs." 3111 * 3112 * So, generally, we want clamping to the render target's range. 
3113 * And, good news, the hardware tables for both pre- and 3114 * post-blend color clamping are either ignored, or any are 3115 * allowed, or clamping is required but RT range clamping is a 3116 * valid option. 3117 */ 3118 entry.PreBlendColorClampEnable = true; 3119 entry.PostBlendColorClampEnable = true; 3120 entry.ColorClampRange = COLORCLAMP_RTFORMAT; 3121 3122 entry.WriteDisableRed = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 0); 3123 entry.WriteDisableGreen = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 1); 3124 entry.WriteDisableBlue = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 2); 3125 entry.WriteDisableAlpha = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 3); 3126 3127#if GEN_GEN >= 8 3128 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry); 3129#else 3130 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry); 3131#endif 3132 } 3133 } 3134 3135#if GEN_GEN >= 8 3136 GENX(BLEND_STATE_pack)(NULL, blend_map, &blend); 3137#endif 3138 3139#if GEN_GEN < 7 3140 brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { 3141 ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset; 3142 ptr.BLEND_STATEChange = true; 3143 } 3144#else 3145 brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) { 3146 ptr.BlendStatePointer = brw->cc.blend_state_offset; 3147#if GEN_GEN >= 8 3148 ptr.BlendStatePointerValid = true; 3149#endif 3150 } 3151#endif 3152} 3153 3154static const struct brw_tracked_state genX(blend_state) = { 3155 .dirty = { 3156 .mesa = _NEW_BUFFERS | 3157 _NEW_COLOR | 3158 _NEW_MULTISAMPLE, 3159 .brw = BRW_NEW_BATCH | 3160 BRW_NEW_BLORP | 3161 BRW_NEW_FS_PROG_DATA | 3162 BRW_NEW_STATE_BASE_ADDRESS, 3163 }, 3164 .emit = genX(upload_blend_state), 3165}; 3166#endif 3167 3168/* ---------------------------------------------------------------------- */ 3169 3170#if GEN_GEN >= 7 3171UNUSED static const uint32_t push_constant_opcodes[] = { 3172 [MESA_SHADER_VERTEX] = 21, 3173 [MESA_SHADER_TESS_CTRL] = 25, /* HS */ 3174 [MESA_SHADER_TESS_EVAL] = 26, /* DS */ 3175 [MESA_SHADER_GEOMETRY] = 22, 3176 [MESA_SHADER_FRAGMENT] = 23, 3177 [MESA_SHADER_COMPUTE] = 0, 3178}; 3179 3180static void 3181genX(upload_push_constant_packets)(struct brw_context *brw) 3182{ 3183 const struct gen_device_info *devinfo = &brw->screen->devinfo; 3184 struct gl_context *ctx = &brw->ctx; 3185 3186 UNUSED uint32_t mocs = GEN_GEN < 8 ? GEN7_MOCS_L3 : 0; 3187 3188 struct brw_stage_state *stage_states[] = { 3189 &brw->vs.base, 3190 &brw->tcs.base, 3191 &brw->tes.base, 3192 &brw->gs.base, 3193 &brw->wm.base, 3194 }; 3195 3196 if (GEN_GEN == 7 && !GEN_IS_HASWELL && !devinfo->is_baytrail && 3197 stage_states[MESA_SHADER_VERTEX]->push_constants_dirty) 3198 gen7_emit_vs_workaround_flush(brw); 3199 3200 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { 3201 struct brw_stage_state *stage_state = stage_states[stage]; 3202 UNUSED struct gl_program *prog = ctx->_Shader->CurrentProgram[stage]; 3203 3204 if (!stage_state->push_constants_dirty) 3205 continue; 3206 3207 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) { 3208 pkt._3DCommandSubOpcode = push_constant_opcodes[stage]; 3209 if (stage_state->prog_data) { 3210#if GEN_GEN >= 8 || GEN_IS_HASWELL 3211 /* The Skylake PRM contains the following restriction: 3212 * 3213 * "The driver must ensure The following case does not occur 3214 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with 3215 * buffer 3 read length equal to zero committed followed by a 3216 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to 3217 * zero committed." 
3218 * 3219 * To avoid this, we program the buffers in the highest slots. 3220 * This way, slot 0 is only used if slot 3 is also used. 3221 */ 3222 int n = 3; 3223 3224 for (int i = 3; i >= 0; i--) { 3225 const struct brw_ubo_range *range = 3226 &stage_state->prog_data->ubo_ranges[i]; 3227 3228 if (range->length == 0) 3229 continue; 3230 3231 const struct gl_uniform_block *block = 3232 prog->sh.UniformBlocks[range->block]; 3233 const struct gl_buffer_binding *binding = 3234 &ctx->UniformBufferBindings[block->Binding]; 3235 3236 if (binding->BufferObject == ctx->Shared->NullBufferObj) { 3237 static unsigned msg_id = 0; 3238 _mesa_gl_debugf(ctx, &msg_id, MESA_DEBUG_SOURCE_API, 3239 MESA_DEBUG_TYPE_UNDEFINED, 3240 MESA_DEBUG_SEVERITY_HIGH, 3241 "UBO %d unbound, %s shader uniform data " 3242 "will be undefined.", 3243 range->block, 3244 _mesa_shader_stage_to_string(stage)); 3245 continue; 3246 } 3247 3248 assert(binding->Offset % 32 == 0); 3249 3250 struct brw_bo *bo = intel_bufferobj_buffer(brw, 3251 intel_buffer_object(binding->BufferObject), 3252 binding->Offset, range->length * 32, false); 3253 3254 pkt.ConstantBody.ReadLength[n] = range->length; 3255 pkt.ConstantBody.Buffer[n] = 3256 ro_bo(bo, range->start * 32 + binding->Offset); 3257 n--; 3258 } 3259 3260 if (stage_state->push_const_size > 0) { 3261 assert(n >= 0); 3262 pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size; 3263 pkt.ConstantBody.Buffer[n] = 3264 ro_bo(stage_state->push_const_bo, 3265 stage_state->push_const_offset); 3266 } 3267#else 3268 pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size; 3269 pkt.ConstantBody.Buffer[0].offset = 3270 stage_state->push_const_offset | mocs; 3271#endif 3272 } 3273 } 3274 3275 stage_state->push_constants_dirty = false; 3276 brw->ctx.NewDriverState |= GEN_GEN >= 9 ? 
BRW_NEW_SURFACES : 0; 3277 } 3278} 3279 3280const struct brw_tracked_state genX(push_constant_packets) = { 3281 .dirty = { 3282 .mesa = 0, 3283 .brw = BRW_NEW_DRAW_CALL, 3284 }, 3285 .emit = genX(upload_push_constant_packets), 3286}; 3287#endif 3288 3289#if GEN_GEN >= 6 3290static void 3291genX(upload_vs_push_constants)(struct brw_context *brw) 3292{ 3293 struct brw_stage_state *stage_state = &brw->vs.base; 3294 3295 /* BRW_NEW_VERTEX_PROGRAM */ 3296 const struct gl_program *vp = brw->programs[MESA_SHADER_VERTEX]; 3297 /* BRW_NEW_VS_PROG_DATA */ 3298 const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data; 3299 3300 gen6_upload_push_constants(brw, vp, prog_data, stage_state); 3301} 3302 3303static const struct brw_tracked_state genX(vs_push_constants) = { 3304 .dirty = { 3305 .mesa = _NEW_PROGRAM_CONSTANTS | 3306 _NEW_TRANSFORM, 3307 .brw = BRW_NEW_BATCH | 3308 BRW_NEW_BLORP | 3309 BRW_NEW_VERTEX_PROGRAM | 3310 BRW_NEW_VS_PROG_DATA, 3311 }, 3312 .emit = genX(upload_vs_push_constants), 3313}; 3314 3315static void 3316genX(upload_gs_push_constants)(struct brw_context *brw) 3317{ 3318 struct brw_stage_state *stage_state = &brw->gs.base; 3319 3320 /* BRW_NEW_GEOMETRY_PROGRAM */ 3321 const struct gl_program *gp = brw->programs[MESA_SHADER_GEOMETRY]; 3322 3323 /* BRW_NEW_GS_PROG_DATA */ 3324 struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data; 3325 3326 gen6_upload_push_constants(brw, gp, prog_data, stage_state); 3327} 3328 3329static const struct brw_tracked_state genX(gs_push_constants) = { 3330 .dirty = { 3331 .mesa = _NEW_PROGRAM_CONSTANTS | 3332 _NEW_TRANSFORM, 3333 .brw = BRW_NEW_BATCH | 3334 BRW_NEW_BLORP | 3335 BRW_NEW_GEOMETRY_PROGRAM | 3336 BRW_NEW_GS_PROG_DATA, 3337 }, 3338 .emit = genX(upload_gs_push_constants), 3339}; 3340 3341static void 3342genX(upload_wm_push_constants)(struct brw_context *brw) 3343{ 3344 struct brw_stage_state *stage_state = &brw->wm.base; 3345 /* BRW_NEW_FRAGMENT_PROGRAM */ 3346 const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT]; 3347 /* BRW_NEW_FS_PROG_DATA */ 3348 const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data; 3349 3350 gen6_upload_push_constants(brw, fp, prog_data, stage_state); 3351} 3352 3353static const struct brw_tracked_state genX(wm_push_constants) = { 3354 .dirty = { 3355 .mesa = _NEW_PROGRAM_CONSTANTS, 3356 .brw = BRW_NEW_BATCH | 3357 BRW_NEW_BLORP | 3358 BRW_NEW_FRAGMENT_PROGRAM | 3359 BRW_NEW_FS_PROG_DATA, 3360 }, 3361 .emit = genX(upload_wm_push_constants), 3362}; 3363#endif 3364 3365/* ---------------------------------------------------------------------- */ 3366 3367#if GEN_GEN >= 6 3368static unsigned 3369genX(determine_sample_mask)(struct brw_context *brw) 3370{ 3371 struct gl_context *ctx = &brw->ctx; 3372 float coverage = 1.0f; 3373 float coverage_invert = false; 3374 unsigned sample_mask = ~0u; 3375 3376 /* BRW_NEW_NUM_SAMPLES */ 3377 unsigned num_samples = brw->num_samples; 3378 3379 if (_mesa_is_multisample_enabled(ctx)) { 3380 if (ctx->Multisample.SampleCoverage) { 3381 coverage = ctx->Multisample.SampleCoverageValue; 3382 coverage_invert = ctx->Multisample.SampleCoverageInvert; 3383 } 3384 if (ctx->Multisample.SampleMask) { 3385 sample_mask = ctx->Multisample.SampleMaskValue; 3386 } 3387 } 3388 3389 if (num_samples > 1) { 3390 int coverage_int = (int) (num_samples * coverage + 0.5f); 3391 uint32_t coverage_bits = (1 << coverage_int) - 1; 3392 if (coverage_invert) 3393 coverage_bits ^= (1 << num_samples) - 1; 3394 return coverage_bits & sample_mask; 3395 } else { 3396 return 
1; 3397 } 3398} 3399 3400static void 3401genX(emit_3dstate_multisample2)(struct brw_context *brw, 3402 unsigned num_samples) 3403{ 3404 unsigned log2_samples = ffs(num_samples) - 1; 3405 3406 brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) { 3407 multi.PixelLocation = CENTER; 3408 multi.NumberofMultisamples = log2_samples; 3409#if GEN_GEN == 6 3410 GEN_SAMPLE_POS_4X(multi.Sample); 3411#elif GEN_GEN == 7 3412 switch (num_samples) { 3413 case 1: 3414 GEN_SAMPLE_POS_1X(multi.Sample); 3415 break; 3416 case 2: 3417 GEN_SAMPLE_POS_2X(multi.Sample); 3418 break; 3419 case 4: 3420 GEN_SAMPLE_POS_4X(multi.Sample); 3421 break; 3422 case 8: 3423 GEN_SAMPLE_POS_8X(multi.Sample); 3424 break; 3425 default: 3426 break; 3427 } 3428#endif 3429 } 3430} 3431 3432static void 3433genX(upload_multisample_state)(struct brw_context *brw) 3434{ 3435 assert(brw->num_samples > 0 && brw->num_samples <= 16); 3436 3437 genX(emit_3dstate_multisample2)(brw, brw->num_samples); 3438 3439 brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) { 3440 sm.SampleMask = genX(determine_sample_mask)(brw); 3441 } 3442} 3443 3444static const struct brw_tracked_state genX(multisample_state) = { 3445 .dirty = { 3446 .mesa = _NEW_MULTISAMPLE | 3447 (GEN_GEN == 10 ? _NEW_BUFFERS : 0), 3448 .brw = BRW_NEW_BLORP | 3449 BRW_NEW_CONTEXT | 3450 BRW_NEW_NUM_SAMPLES, 3451 }, 3452 .emit = genX(upload_multisample_state) 3453}; 3454#endif 3455 3456/* ---------------------------------------------------------------------- */ 3457 3458static void 3459genX(upload_color_calc_state)(struct brw_context *brw) 3460{ 3461 struct gl_context *ctx = &brw->ctx; 3462 3463 brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) { 3464#if GEN_GEN <= 5 3465 cc.IndependentAlphaBlendEnable = 3466 set_blend_entry_bits(brw, &cc, 0, false); 3467 set_depth_stencil_bits(brw, &cc); 3468 3469 if (ctx->Color.AlphaEnabled && 3470 ctx->DrawBuffer->_NumColorDrawBuffers <= 1) { 3471 cc.AlphaTestEnable = true; 3472 cc.AlphaTestFunction = 3473 intel_translate_compare_func(ctx->Color.AlphaFunc); 3474 } 3475 3476 cc.ColorDitherEnable = ctx->Color.DitherFlag; 3477 3478 cc.StatisticsEnable = brw->stats_wm; 3479 3480 cc.CCViewportStatePointer = 3481 ro_bo(brw->batch.state.bo, brw->cc.vp_offset); 3482#else 3483 /* _NEW_COLOR */ 3484 cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0]; 3485 cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1]; 3486 cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2]; 3487 cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3]; 3488 3489#if GEN_GEN < 9 3490 /* _NEW_STENCIL */ 3491 cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0); 3492 cc.BackfaceStencilReferenceValue = 3493 _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace); 3494#endif 3495 3496#endif 3497 3498 /* _NEW_COLOR */ 3499 UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8, 3500 ctx->Color.AlphaRef); 3501 } 3502 3503#if GEN_GEN >= 6 3504 brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { 3505 ptr.ColorCalcStatePointer = brw->cc.state_offset; 3506#if GEN_GEN != 7 3507 ptr.ColorCalcStatePointerValid = true; 3508#endif 3509 } 3510#else 3511 brw->ctx.NewDriverState |= BRW_NEW_GEN4_UNIT_STATE; 3512#endif 3513} 3514 3515static const struct brw_tracked_state genX(color_calc_state) = { 3516 .dirty = { 3517 .mesa = _NEW_COLOR | 3518 _NEW_STENCIL | 3519 (GEN_GEN <= 5 ? _NEW_BUFFERS | 3520 _NEW_DEPTH 3521 : 0), 3522 .brw = BRW_NEW_BATCH | 3523 BRW_NEW_BLORP | 3524 (GEN_GEN <= 5 ? 
BRW_NEW_CC_VP | 3525 BRW_NEW_STATS_WM 3526 : BRW_NEW_CC_STATE | 3527 BRW_NEW_STATE_BASE_ADDRESS), 3528 }, 3529 .emit = genX(upload_color_calc_state), 3530}; 3531 3532 3533/* ---------------------------------------------------------------------- */ 3534 3535#if GEN_GEN >= 7 3536static void 3537genX(upload_sbe)(struct brw_context *brw) 3538{ 3539 struct gl_context *ctx = &brw->ctx; 3540 /* BRW_NEW_FRAGMENT_PROGRAM */ 3541 UNUSED const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT]; 3542 /* BRW_NEW_FS_PROG_DATA */ 3543 const struct brw_wm_prog_data *wm_prog_data = 3544 brw_wm_prog_data(brw->wm.base.prog_data); 3545#if GEN_GEN >= 8 3546 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } }; 3547#else 3548#define attr_overrides sbe.Attribute 3549#endif 3550 uint32_t urb_entry_read_length; 3551 uint32_t urb_entry_read_offset; 3552 uint32_t point_sprite_enables; 3553 3554 brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) { 3555 sbe.AttributeSwizzleEnable = true; 3556 sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; 3557 3558 /* _NEW_BUFFERS */ 3559 bool flip_y = ctx->DrawBuffer->FlipY; 3560 3561 /* _NEW_POINT 3562 * 3563 * Window coordinates in an FBO are inverted, which means point 3564 * sprite origin must be inverted. 3565 */ 3566 if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) 3567 sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT; 3568 else 3569 sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT; 3570 3571 /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM, 3572 * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | 3573 * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA | 3574 * BRW_NEW_VUE_MAP_GEOM_OUT 3575 */ 3576 genX(calculate_attr_overrides)(brw, 3577 attr_overrides, 3578 &point_sprite_enables, 3579 &urb_entry_read_length, 3580 &urb_entry_read_offset); 3581 3582 /* Typically, the URB entry read length and offset should be programmed 3583 * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active 3584 * stage which produces geometry. However, we don't know the proper 3585 * value until we call calculate_attr_overrides(). 3586 * 3587 * To fit with our existing code, we override the inherited values and 3588 * specify it here directly, as we did on previous generations. 3589 */ 3590 sbe.VertexURBEntryReadLength = urb_entry_read_length; 3591 sbe.VertexURBEntryReadOffset = urb_entry_read_offset; 3592 sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables; 3593 sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs; 3594 3595#if GEN_GEN >= 8 3596 sbe.ForceVertexURBEntryReadLength = true; 3597 sbe.ForceVertexURBEntryReadOffset = true; 3598#endif 3599 3600#if GEN_GEN >= 9 3601 /* prepare the active component dwords */ 3602 for (int i = 0; i < 32; i++) 3603 sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW; 3604#endif 3605 } 3606 3607#if GEN_GEN >= 8 3608 brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) { 3609 for (int i = 0; i < 16; i++) 3610 sbes.Attribute[i] = attr_overrides[i]; 3611 } 3612#endif 3613 3614#undef attr_overrides 3615} 3616 3617static const struct brw_tracked_state genX(sbe_state) = { 3618 .dirty = { 3619 .mesa = _NEW_BUFFERS | 3620 _NEW_LIGHT | 3621 _NEW_POINT | 3622 _NEW_POLYGON | 3623 _NEW_PROGRAM, 3624 .brw = BRW_NEW_BLORP | 3625 BRW_NEW_CONTEXT | 3626 BRW_NEW_FRAGMENT_PROGRAM | 3627 BRW_NEW_FS_PROG_DATA | 3628 BRW_NEW_GS_PROG_DATA | 3629 BRW_NEW_TES_PROG_DATA | 3630 BRW_NEW_VUE_MAP_GEOM_OUT | 3631 (GEN_GEN == 7 ? 
BRW_NEW_PRIMITIVE 3632 : 0), 3633 }, 3634 .emit = genX(upload_sbe), 3635}; 3636#endif 3637 3638/* ---------------------------------------------------------------------- */ 3639 3640#if GEN_GEN >= 7 3641/** 3642 * Outputs the 3DSTATE_SO_DECL_LIST command. 3643 * 3644 * The data output is a series of 64-bit entries containing a SO_DECL per 3645 * stream. We only have one stream of rendering coming out of the GS unit, so 3646 * we only emit stream 0 (low 16 bits) SO_DECLs. 3647 */ 3648static void 3649genX(upload_3dstate_so_decl_list)(struct brw_context *brw, 3650 const struct brw_vue_map *vue_map) 3651{ 3652 struct gl_context *ctx = &brw->ctx; 3653 /* BRW_NEW_TRANSFORM_FEEDBACK */ 3654 struct gl_transform_feedback_object *xfb_obj = 3655 ctx->TransformFeedback.CurrentObject; 3656 const struct gl_transform_feedback_info *linked_xfb_info = 3657 xfb_obj->program->sh.LinkedTransformFeedback; 3658 struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128]; 3659 int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; 3660 int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; 3661 int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; 3662 int max_decls = 0; 3663 STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS); 3664 3665 memset(so_decl, 0, sizeof(so_decl)); 3666 3667 /* Construct the list of SO_DECLs to be emitted. The formatting of the 3668 * command feels strange -- each dword pair contains a SO_DECL per stream. 3669 */ 3670 for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) { 3671 const struct gl_transform_feedback_output *output = 3672 &linked_xfb_info->Outputs[i]; 3673 const int buffer = output->OutputBuffer; 3674 const int varying = output->OutputRegister; 3675 const unsigned stream_id = output->StreamId; 3676 assert(stream_id < MAX_VERTEX_STREAMS); 3677 3678 buffer_mask[stream_id] |= 1 << buffer; 3679 3680 assert(vue_map->varying_to_slot[varying] >= 0); 3681 3682 /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[] 3683 * array. Instead, it simply increments DstOffset for the following 3684 * input by the number of components that should be skipped. 3685 * 3686 * Our hardware is unusual in that it requires us to program SO_DECLs 3687 * for fake "hole" components, rather than simply taking the offset 3688 * for each real varying. Each hole can have size 1, 2, 3, or 4; we 3689 * program as many size = 4 holes as we can, then a final hole to 3690 * accommodate the final 1, 2, or 3 remaining. 
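       * For example, a gl_SkipComponents gap of seven components becomes
       * one size-4 hole (ComponentMask 0xf) followed by a size-3 hole
       * (ComponentMask 0x7).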
3691 */ 3692 int skip_components = output->DstOffset - next_offset[buffer]; 3693 3694 while (skip_components > 0) { 3695 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) { 3696 .HoleFlag = 1, 3697 .OutputBufferSlot = output->OutputBuffer, 3698 .ComponentMask = (1 << MIN2(skip_components, 4)) - 1, 3699 }; 3700 skip_components -= 4; 3701 } 3702 3703 next_offset[buffer] = output->DstOffset + output->NumComponents; 3704 3705 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) { 3706 .OutputBufferSlot = output->OutputBuffer, 3707 .RegisterIndex = vue_map->varying_to_slot[varying], 3708 .ComponentMask = 3709 ((1 << output->NumComponents) - 1) << output->ComponentOffset, 3710 }; 3711 3712 if (decls[stream_id] > max_decls) 3713 max_decls = decls[stream_id]; 3714 } 3715 3716 uint32_t *dw; 3717 dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls, 3718 .StreamtoBufferSelects0 = buffer_mask[0], 3719 .StreamtoBufferSelects1 = buffer_mask[1], 3720 .StreamtoBufferSelects2 = buffer_mask[2], 3721 .StreamtoBufferSelects3 = buffer_mask[3], 3722 .NumEntries0 = decls[0], 3723 .NumEntries1 = decls[1], 3724 .NumEntries2 = decls[2], 3725 .NumEntries3 = decls[3]); 3726 3727 for (int i = 0; i < max_decls; i++) { 3728 GENX(SO_DECL_ENTRY_pack)( 3729 brw, dw + 2 + i * 2, 3730 &(struct GENX(SO_DECL_ENTRY)) { 3731 .Stream0Decl = so_decl[0][i], 3732 .Stream1Decl = so_decl[1][i], 3733 .Stream2Decl = so_decl[2][i], 3734 .Stream3Decl = so_decl[3][i], 3735 }); 3736 } 3737} 3738 3739static void 3740genX(upload_3dstate_so_buffers)(struct brw_context *brw) 3741{ 3742 struct gl_context *ctx = &brw->ctx; 3743 /* BRW_NEW_TRANSFORM_FEEDBACK */ 3744 struct gl_transform_feedback_object *xfb_obj = 3745 ctx->TransformFeedback.CurrentObject; 3746#if GEN_GEN < 8 3747 const struct gl_transform_feedback_info *linked_xfb_info = 3748 xfb_obj->program->sh.LinkedTransformFeedback; 3749#else 3750 struct brw_transform_feedback_object *brw_obj = 3751 (struct brw_transform_feedback_object *) xfb_obj; 3752 uint32_t mocs_wb = GEN_GEN >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB; 3753#endif 3754 3755 /* Set up the up to 4 output buffers. These are the ranges defined in the 3756 * gl_transform_feedback_object. 
3757 */ 3758 for (int i = 0; i < 4; i++) { 3759 struct intel_buffer_object *bufferobj = 3760 intel_buffer_object(xfb_obj->Buffers[i]); 3761 uint32_t start = xfb_obj->Offset[i]; 3762 uint32_t end = ALIGN(start + xfb_obj->Size[i], 4); 3763 uint32_t const size = end - start; 3764 3765 if (!bufferobj || !size) { 3766 brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) { 3767 sob.SOBufferIndex = i; 3768 } 3769 continue; 3770 } 3771 3772 assert(start % 4 == 0); 3773 struct brw_bo *bo = 3774 intel_bufferobj_buffer(brw, bufferobj, start, size, true); 3775 assert(end <= bo->size); 3776 3777 brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) { 3778 sob.SOBufferIndex = i; 3779 3780 sob.SurfaceBaseAddress = rw_bo(bo, start); 3781#if GEN_GEN < 8 3782 sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4; 3783 sob.SurfaceEndAddress = rw_bo(bo, end); 3784#else 3785 sob.SOBufferEnable = true; 3786 sob.StreamOffsetWriteEnable = true; 3787 sob.StreamOutputBufferOffsetAddressEnable = true; 3788 sob.MOCS = mocs_wb; 3789 3790 sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1; 3791 sob.StreamOutputBufferOffsetAddress = 3792 rw_bo(brw_obj->offset_bo, i * sizeof(uint32_t)); 3793 3794 if (brw_obj->zero_offsets) { 3795 /* Zero out the offset and write that to offset_bo */ 3796 sob.StreamOffset = 0; 3797 } else { 3798 /* Use offset_bo as the "Stream Offset." */ 3799 sob.StreamOffset = 0xFFFFFFFF; 3800 } 3801#endif 3802 } 3803 } 3804 3805#if GEN_GEN >= 8 3806 brw_obj->zero_offsets = false; 3807#endif 3808} 3809 3810static bool 3811query_active(struct gl_query_object *q) 3812{ 3813 return q && q->Active; 3814} 3815 3816static void 3817genX(upload_3dstate_streamout)(struct brw_context *brw, bool active, 3818 const struct brw_vue_map *vue_map) 3819{ 3820 struct gl_context *ctx = &brw->ctx; 3821 /* BRW_NEW_TRANSFORM_FEEDBACK */ 3822 struct gl_transform_feedback_object *xfb_obj = 3823 ctx->TransformFeedback.CurrentObject; 3824 3825 brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) { 3826 if (active) { 3827 int urb_entry_read_offset = 0; 3828 int urb_entry_read_length = (vue_map->num_slots + 1) / 2 - 3829 urb_entry_read_offset; 3830 3831 sos.SOFunctionEnable = true; 3832 sos.SOStatisticsEnable = true; 3833 3834 /* BRW_NEW_RASTERIZER_DISCARD */ 3835 if (ctx->RasterDiscard) { 3836 if (!query_active(ctx->Query.PrimitivesGenerated[0])) { 3837 sos.RenderingDisable = true; 3838 } else { 3839 perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED " 3840 "query active relies on the clipper.\n"); 3841 } 3842 } 3843 3844 /* _NEW_LIGHT */ 3845 if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) 3846 sos.ReorderMode = TRAILING; 3847 3848#if GEN_GEN < 8 3849 sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL; 3850 sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL; 3851 sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL; 3852 sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL; 3853#else 3854 const struct gl_transform_feedback_info *linked_xfb_info = 3855 xfb_obj->program->sh.LinkedTransformFeedback; 3856 /* Set buffer pitches; 0 means unbound. */ 3857 if (xfb_obj->Buffers[0]) 3858 sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4; 3859 if (xfb_obj->Buffers[1]) 3860 sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4; 3861 if (xfb_obj->Buffers[2]) 3862 sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4; 3863 if (xfb_obj->Buffers[3]) 3864 sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4; 3865#endif 3866 3867 /* We always read the whole vertex. 
This could be reduced at some
       * point by reading less and offsetting the register index in the
       * SO_DECLs.
       */
         sos.Stream0VertexReadOffset = urb_entry_read_offset;
         sos.Stream0VertexReadLength = urb_entry_read_length - 1;
         sos.Stream1VertexReadOffset = urb_entry_read_offset;
         sos.Stream1VertexReadLength = urb_entry_read_length - 1;
         sos.Stream2VertexReadOffset = urb_entry_read_offset;
         sos.Stream2VertexReadLength = urb_entry_read_length - 1;
         sos.Stream3VertexReadOffset = urb_entry_read_offset;
         sos.Stream3VertexReadLength = urb_entry_read_length - 1;
      }
   }
}

static void
genX(upload_sol)(struct brw_context *brw)
{
   struct gl_context *ctx = &brw->ctx;
   /* BRW_NEW_TRANSFORM_FEEDBACK */
   bool active = _mesa_is_xfb_active_and_unpaused(ctx);

   if (active) {
      genX(upload_3dstate_so_buffers)(brw);

      /* BRW_NEW_VUE_MAP_GEOM_OUT */
      genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out);
   }

   /* Finally, set up the SOL stage. This command must always follow updates
    * to the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST)
    * or MMIO register updates (currently performed by the kernel at each
    * batch emit).
    */
   genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out);
}

static const struct brw_tracked_state genX(sol_state) = {
   .dirty = {
      .mesa  = _NEW_LIGHT,
      .brw   = BRW_NEW_BATCH |
               BRW_NEW_BLORP |
               BRW_NEW_RASTERIZER_DISCARD |
               BRW_NEW_VUE_MAP_GEOM_OUT |
               BRW_NEW_TRANSFORM_FEEDBACK,
   },
   .emit = genX(upload_sol),
};
#endif

/* ---------------------------------------------------------------------- */

#if GEN_GEN >= 7
static void
genX(upload_ps)(struct brw_context *brw)
{
   UNUSED const struct gl_context *ctx = &brw->ctx;
   UNUSED const struct gen_device_info *devinfo = &brw->screen->devinfo;

   /* BRW_NEW_FS_PROG_DATA */
   const struct brw_wm_prog_data *prog_data =
      brw_wm_prog_data(brw->wm.base.prog_data);
   const struct brw_stage_state *stage_state = &brw->wm.base;

   brw_batch_emit(brw, GENX(3DSTATE_PS), ps) {
      /* Initialize the execution mask with VMask. Otherwise, derivatives
       * are incorrect for subspans where some of the pixels are unlit. We
       * believe the bit just didn't take effect in previous generations.
       */
      ps.VectorMaskEnable = GEN_GEN >= 8;

      /* WA_1606682166:
       * "Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
       *  Disable the Sampler state prefetch functionality in the SARB by
       *  programming 0xB000[30] to '1'."
       */
      ps.SamplerCount = GEN_GEN == 11 ?
         0 : DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4);

      /* BRW_NEW_FS_PROG_DATA */
      /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests
       * disabling prefetching of binding tables in A0 and B0 steppings.
       * TODO: Revisit this workaround on C0 stepping.
       */
      ps.BindingTableEntryCount = GEN_GEN == 11 ?
         0 :
         prog_data->base.binding_table.size_bytes / 4;

      if (prog_data->base.use_alt_mode)
         ps.FloatingPointMode = Alternate;

      /* Haswell requires the sample mask to be set in this packet as well
       * as in 3DSTATE_SAMPLE_MASK; the values should match.
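       * Both values are taken from genX(determine_sample_mask), so they
       * agree by construction.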
3964 */ 3965 3966 /* _NEW_BUFFERS, _NEW_MULTISAMPLE */ 3967#if GEN_IS_HASWELL 3968 ps.SampleMask = genX(determine_sample_mask(brw)); 3969#endif 3970 3971 /* 3DSTATE_PS expects the number of threads per PSD, which is always 64 3972 * for pre Gen11 and 128 for gen11+; On gen11+ If a programmed value is 3973 * k, it implies 2(k+1) threads. It implicitly scales for different GT 3974 * levels (which have some # of PSDs). 3975 * 3976 * In Gen8 the format is U8-2 whereas in Gen9+ it is U9-1. 3977 */ 3978#if GEN_GEN >= 9 3979 ps.MaximumNumberofThreadsPerPSD = 64 - 1; 3980#elif GEN_GEN >= 8 3981 ps.MaximumNumberofThreadsPerPSD = 64 - 2; 3982#else 3983 ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1; 3984#endif 3985 3986 if (prog_data->base.nr_params > 0 || 3987 prog_data->base.ubo_ranges[0].length > 0) 3988 ps.PushConstantEnable = true; 3989 3990#if GEN_GEN < 8 3991 /* From the IVB PRM, volume 2 part 1, page 287: 3992 * "This bit is inserted in the PS payload header and made available to 3993 * the DataPort (either via the message header or via header bypass) to 3994 * indicate that oMask data (one or two phases) is included in Render 3995 * Target Write messages. If present, the oMask data is used to mask off 3996 * samples." 3997 */ 3998 ps.oMaskPresenttoRenderTarget = prog_data->uses_omask; 3999 4000 /* The hardware wedges if you have this bit set but don't turn on any 4001 * dual source blend factors. 4002 * 4003 * BRW_NEW_FS_PROG_DATA | _NEW_COLOR 4004 */ 4005 ps.DualSourceBlendEnable = prog_data->dual_src_blend && 4006 (ctx->Color.BlendEnabled & 1) && 4007 ctx->Color.Blend[0]._UsesDualSrc; 4008 4009 /* BRW_NEW_FS_PROG_DATA */ 4010 ps.AttributeEnable = (prog_data->num_varying_inputs != 0); 4011#endif 4012 4013 /* From the documentation for this packet: 4014 * "If the PS kernel does not need the Position XY Offsets to 4015 * compute a Position Value, then this field should be programmed 4016 * to POSOFFSET_NONE." 4017 * 4018 * "SW Recommendation: If the PS kernel needs the Position Offsets 4019 * to compute a Position XY value, this field should match Position 4020 * ZW Interpolation Mode to ensure a consistent position.xyzw 4021 * computation." 4022 * 4023 * We only require XY sample offsets. So, this recommendation doesn't 4024 * look useful at the moment. We might need this in future. 4025 */ 4026 if (prog_data->uses_pos_offset) 4027 ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE; 4028 else 4029 ps.PositionXYOffsetSelect = POSOFFSET_NONE; 4030 4031 ps._8PixelDispatchEnable = prog_data->dispatch_8; 4032 ps._16PixelDispatchEnable = prog_data->dispatch_16; 4033 ps._32PixelDispatchEnable = prog_data->dispatch_32; 4034 4035 /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable: 4036 * 4037 * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32 4038 * Dispatch must not be enabled for PER_PIXEL dispatch mode." 4039 * 4040 * Since 16x MSAA is first introduced on SKL, we don't need to apply 4041 * the workaround on any older hardware. 
4042 * 4043 * BRW_NEW_NUM_SAMPLES 4044 */ 4045 if (GEN_GEN >= 9 && !prog_data->persample_dispatch && 4046 brw->num_samples == 16) { 4047 assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable); 4048 ps._32PixelDispatchEnable = false; 4049 } 4050 4051 ps.DispatchGRFStartRegisterForConstantSetupData0 = 4052 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0); 4053 ps.DispatchGRFStartRegisterForConstantSetupData1 = 4054 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1); 4055 ps.DispatchGRFStartRegisterForConstantSetupData2 = 4056 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2); 4057 4058 ps.KernelStartPointer0 = stage_state->prog_offset + 4059 brw_wm_prog_data_prog_offset(prog_data, ps, 0); 4060 ps.KernelStartPointer1 = stage_state->prog_offset + 4061 brw_wm_prog_data_prog_offset(prog_data, ps, 1); 4062 ps.KernelStartPointer2 = stage_state->prog_offset + 4063 brw_wm_prog_data_prog_offset(prog_data, ps, 2); 4064 4065 if (prog_data->base.total_scratch) { 4066 ps.ScratchSpaceBasePointer = 4067 rw_32_bo(stage_state->scratch_bo, 4068 ffs(stage_state->per_thread_scratch) - 11); 4069 } 4070 } 4071} 4072 4073static const struct brw_tracked_state genX(ps_state) = { 4074 .dirty = { 4075 .mesa = _NEW_MULTISAMPLE | 4076 (GEN_GEN < 8 ? _NEW_BUFFERS | 4077 _NEW_COLOR 4078 : 0), 4079 .brw = BRW_NEW_BATCH | 4080 BRW_NEW_BLORP | 4081 BRW_NEW_FS_PROG_DATA | 4082 (GEN_GEN >= 9 ? BRW_NEW_NUM_SAMPLES : 0), 4083 }, 4084 .emit = genX(upload_ps), 4085}; 4086#endif 4087 4088/* ---------------------------------------------------------------------- */ 4089 4090#if GEN_GEN >= 7 4091static void 4092genX(upload_hs_state)(struct brw_context *brw) 4093{ 4094 const struct gen_device_info *devinfo = &brw->screen->devinfo; 4095 struct brw_stage_state *stage_state = &brw->tcs.base; 4096 struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data; 4097 const struct brw_vue_prog_data *vue_prog_data = 4098 brw_vue_prog_data(stage_prog_data); 4099 4100 /* BRW_NEW_TES_PROG_DATA */ 4101 struct brw_tcs_prog_data *tcs_prog_data = 4102 brw_tcs_prog_data(stage_prog_data); 4103 4104 if (!tcs_prog_data) { 4105 brw_batch_emit(brw, GENX(3DSTATE_HS), hs); 4106 } else { 4107 brw_batch_emit(brw, GENX(3DSTATE_HS), hs) { 4108 INIT_THREAD_DISPATCH_FIELDS(hs, Vertex); 4109 4110 hs.InstanceCount = tcs_prog_data->instances - 1; 4111 hs.IncludeVertexHandles = true; 4112 4113 hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1; 4114 } 4115 } 4116} 4117 4118static const struct brw_tracked_state genX(hs_state) = { 4119 .dirty = { 4120 .mesa = 0, 4121 .brw = BRW_NEW_BATCH | 4122 BRW_NEW_BLORP | 4123 BRW_NEW_TCS_PROG_DATA | 4124 BRW_NEW_TESS_PROGRAMS, 4125 }, 4126 .emit = genX(upload_hs_state), 4127}; 4128 4129static void 4130genX(upload_ds_state)(struct brw_context *brw) 4131{ 4132 const struct gen_device_info *devinfo = &brw->screen->devinfo; 4133 const struct brw_stage_state *stage_state = &brw->tes.base; 4134 struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data; 4135 4136 /* BRW_NEW_TES_PROG_DATA */ 4137 const struct brw_tes_prog_data *tes_prog_data = 4138 brw_tes_prog_data(stage_prog_data); 4139 const struct brw_vue_prog_data *vue_prog_data = 4140 brw_vue_prog_data(stage_prog_data); 4141 4142 if (!tes_prog_data) { 4143 brw_batch_emit(brw, GENX(3DSTATE_DS), ds); 4144 } else { 4145 assert(GEN_GEN < 11 || 4146 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8); 4147 4148 brw_batch_emit(brw, GENX(3DSTATE_DS), ds) { 4149 INIT_THREAD_DISPATCH_FIELDS(ds, Patch); 4150 4151 ds.MaximumNumberofThreads 
= devinfo->max_tes_threads - 1; 4152 ds.ComputeWCoordinateEnable = 4153 tes_prog_data->domain == BRW_TESS_DOMAIN_TRI; 4154 4155#if GEN_GEN >= 8 4156 if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8) 4157 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH; 4158 ds.UserClipDistanceCullTestEnableBitmask = 4159 vue_prog_data->cull_distance_mask; 4160#endif 4161 } 4162 } 4163} 4164 4165static const struct brw_tracked_state genX(ds_state) = { 4166 .dirty = { 4167 .mesa = 0, 4168 .brw = BRW_NEW_BATCH | 4169 BRW_NEW_BLORP | 4170 BRW_NEW_TESS_PROGRAMS | 4171 BRW_NEW_TES_PROG_DATA, 4172 }, 4173 .emit = genX(upload_ds_state), 4174}; 4175 4176/* ---------------------------------------------------------------------- */ 4177 4178static void 4179upload_te_state(struct brw_context *brw) 4180{ 4181 /* BRW_NEW_TESS_PROGRAMS */ 4182 bool active = brw->programs[MESA_SHADER_TESS_EVAL]; 4183 4184 /* BRW_NEW_TES_PROG_DATA */ 4185 const struct brw_tes_prog_data *tes_prog_data = 4186 brw_tes_prog_data(brw->tes.base.prog_data); 4187 4188 if (active) { 4189 brw_batch_emit(brw, GENX(3DSTATE_TE), te) { 4190 te.Partitioning = tes_prog_data->partitioning; 4191 te.OutputTopology = tes_prog_data->output_topology; 4192 te.TEDomain = tes_prog_data->domain; 4193 te.TEEnable = true; 4194 te.MaximumTessellationFactorOdd = 63.0; 4195 te.MaximumTessellationFactorNotOdd = 64.0; 4196 } 4197 } else { 4198 brw_batch_emit(brw, GENX(3DSTATE_TE), te); 4199 } 4200} 4201 4202static const struct brw_tracked_state genX(te_state) = { 4203 .dirty = { 4204 .mesa = 0, 4205 .brw = BRW_NEW_BLORP | 4206 BRW_NEW_CONTEXT | 4207 BRW_NEW_TES_PROG_DATA | 4208 BRW_NEW_TESS_PROGRAMS, 4209 }, 4210 .emit = upload_te_state, 4211}; 4212 4213/* ---------------------------------------------------------------------- */ 4214 4215static void 4216genX(upload_tes_push_constants)(struct brw_context *brw) 4217{ 4218 struct brw_stage_state *stage_state = &brw->tes.base; 4219 /* BRW_NEW_TESS_PROGRAMS */ 4220 const struct gl_program *tep = brw->programs[MESA_SHADER_TESS_EVAL]; 4221 4222 /* BRW_NEW_TES_PROG_DATA */ 4223 const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data; 4224 gen6_upload_push_constants(brw, tep, prog_data, stage_state); 4225} 4226 4227static const struct brw_tracked_state genX(tes_push_constants) = { 4228 .dirty = { 4229 .mesa = _NEW_PROGRAM_CONSTANTS, 4230 .brw = BRW_NEW_BATCH | 4231 BRW_NEW_BLORP | 4232 BRW_NEW_TESS_PROGRAMS | 4233 BRW_NEW_TES_PROG_DATA, 4234 }, 4235 .emit = genX(upload_tes_push_constants), 4236}; 4237 4238static void 4239genX(upload_tcs_push_constants)(struct brw_context *brw) 4240{ 4241 struct brw_stage_state *stage_state = &brw->tcs.base; 4242 /* BRW_NEW_TESS_PROGRAMS */ 4243 const struct gl_program *tcp = brw->programs[MESA_SHADER_TESS_CTRL]; 4244 4245 /* BRW_NEW_TCS_PROG_DATA */ 4246 const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data; 4247 4248 gen6_upload_push_constants(brw, tcp, prog_data, stage_state); 4249} 4250 4251static const struct brw_tracked_state genX(tcs_push_constants) = { 4252 .dirty = { 4253 .mesa = _NEW_PROGRAM_CONSTANTS, 4254 .brw = BRW_NEW_BATCH | 4255 BRW_NEW_BLORP | 4256 BRW_NEW_DEFAULT_TESS_LEVELS | 4257 BRW_NEW_TESS_PROGRAMS | 4258 BRW_NEW_TCS_PROG_DATA, 4259 }, 4260 .emit = genX(upload_tcs_push_constants), 4261}; 4262 4263#endif 4264 4265/* ---------------------------------------------------------------------- */ 4266 4267#if GEN_GEN >= 7 4268static void 4269genX(upload_cs_push_constants)(struct brw_context *brw) 4270{ 4271 struct brw_stage_state *stage_state = 
&brw->cs.base; 4272 4273 /* BRW_NEW_COMPUTE_PROGRAM */ 4274 const struct gl_program *cp = brw->programs[MESA_SHADER_COMPUTE]; 4275 4276 if (cp) { 4277 /* BRW_NEW_CS_PROG_DATA */ 4278 struct brw_cs_prog_data *cs_prog_data = 4279 brw_cs_prog_data(brw->cs.base.prog_data); 4280 4281 _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE); 4282 brw_upload_cs_push_constants(brw, cp, cs_prog_data, stage_state); 4283 } 4284} 4285 4286const struct brw_tracked_state genX(cs_push_constants) = { 4287 .dirty = { 4288 .mesa = _NEW_PROGRAM_CONSTANTS, 4289 .brw = BRW_NEW_BATCH | 4290 BRW_NEW_BLORP | 4291 BRW_NEW_COMPUTE_PROGRAM | 4292 BRW_NEW_CS_PROG_DATA, 4293 }, 4294 .emit = genX(upload_cs_push_constants), 4295}; 4296 4297/** 4298 * Creates a new CS constant buffer reflecting the current CS program's 4299 * constants, if needed by the CS program. 4300 */ 4301static void 4302genX(upload_cs_pull_constants)(struct brw_context *brw) 4303{ 4304 struct brw_stage_state *stage_state = &brw->cs.base; 4305 4306 /* BRW_NEW_COMPUTE_PROGRAM */ 4307 struct brw_program *cp = 4308 (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE]; 4309 4310 /* BRW_NEW_CS_PROG_DATA */ 4311 const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data; 4312 4313 _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE); 4314 /* _NEW_PROGRAM_CONSTANTS */ 4315 brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program, 4316 stage_state, prog_data); 4317} 4318 4319const struct brw_tracked_state genX(cs_pull_constants) = { 4320 .dirty = { 4321 .mesa = _NEW_PROGRAM_CONSTANTS, 4322 .brw = BRW_NEW_BATCH | 4323 BRW_NEW_BLORP | 4324 BRW_NEW_COMPUTE_PROGRAM | 4325 BRW_NEW_CS_PROG_DATA, 4326 }, 4327 .emit = genX(upload_cs_pull_constants), 4328}; 4329 4330static void 4331genX(upload_cs_state)(struct brw_context *brw) 4332{ 4333 if (!brw->cs.base.prog_data) 4334 return; 4335 4336 uint32_t offset; 4337 uint32_t *desc = (uint32_t*) brw_state_batch( 4338 brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64, 4339 &offset); 4340 4341 struct brw_stage_state *stage_state = &brw->cs.base; 4342 struct brw_stage_prog_data *prog_data = stage_state->prog_data; 4343 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); 4344 const struct gen_device_info *devinfo = &brw->screen->devinfo; 4345 4346 if (INTEL_DEBUG & DEBUG_SHADER_TIME) { 4347 brw_emit_buffer_surface_state( 4348 brw, &stage_state->surf_offset[ 4349 prog_data->binding_table.shader_time_start], 4350 brw->shader_time.bo, 0, ISL_FORMAT_RAW, 4351 brw->shader_time.bo->size, 1, 4352 RELOC_WRITE); 4353 } 4354 4355 uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes, 4356 32, &stage_state->bind_bo_offset); 4357 4358 /* The MEDIA_VFE_STATE documentation for Gen8+ says: 4359 * 4360 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless 4361 * the only bits that are changed are scoreboard related: Scoreboard 4362 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For 4363 * these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient." 4364 * 4365 * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL", 4366 * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL. 
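    * Hence the unconditional CS-stall PIPE_CONTROL below.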
4367 */ 4368 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL); 4369 4370 brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) { 4371 if (prog_data->total_scratch) { 4372 uint32_t per_thread_scratch_value; 4373 4374 if (GEN_GEN >= 8) { 4375 /* Broadwell's Per Thread Scratch Space is in the range [0, 11] 4376 * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. 4377 */ 4378 per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 11; 4379 } else if (GEN_IS_HASWELL) { 4380 /* Haswell's Per Thread Scratch Space is in the range [0, 10] 4381 * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M. 4382 */ 4383 per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 12; 4384 } else { 4385 /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB] 4386 * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB. 4387 */ 4388 per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1; 4389 } 4390 vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); 4391 vfe.PerThreadScratchSpace = per_thread_scratch_value; 4392 } 4393 4394 /* If brw->screen->subslice_total is greater than one, then 4395 * devinfo->max_cs_threads stores number of threads per sub-slice; 4396 * thus we need to multiply by that number by subslices to get 4397 * the actual maximum number of threads; the -1 is because the HW 4398 * has a bias of 1 (would not make sense to say the maximum number 4399 * of threads is 0). 4400 */ 4401 const uint32_t subslices = MAX2(brw->screen->subslice_total, 1); 4402 vfe.MaximumNumberofThreads = devinfo->max_cs_threads * subslices - 1; 4403 vfe.NumberofURBEntries = GEN_GEN >= 8 ? 2 : 0; 4404#if GEN_GEN < 11 4405 vfe.ResetGatewayTimer = 4406 Resettingrelativetimerandlatchingtheglobaltimestamp; 4407#endif 4408#if GEN_GEN < 9 4409 vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol; 4410#endif 4411#if GEN_GEN == 7 4412 vfe.GPGPUMode = 1; 4413#endif 4414 4415 /* We are uploading duplicated copies of push constant uniforms for each 4416 * thread. Although the local id data needs to vary per thread, it won't 4417 * change for other uniform data. Unfortunately this duplication is 4418 * required for gen7. As of Haswell, this duplication can be avoided, 4419 * but this older mechanism with duplicated data continues to work. 4420 * 4421 * FINISHME: As of Haswell, we could make use of the 4422 * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length" 4423 * field to only store one copy of uniform data. 4424 * 4425 * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage" 4426 * which is described in the GPGPU_WALKER command and in the Broadwell 4427 * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of 4428 * Operations => GPGPU Mode => Indirect Payload Storage. 4429 * 4430 * Note: The constant data is built in brw_upload_cs_push_constants 4431 * below. 4432 */ 4433 vfe.URBEntryAllocationSize = GEN_GEN >= 8 ? 
2 : 0; 4434 4435 const uint32_t vfe_curbe_allocation = 4436 ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads + 4437 cs_prog_data->push.cross_thread.regs, 2); 4438 vfe.CURBEAllocationSize = vfe_curbe_allocation; 4439 } 4440 4441 if (cs_prog_data->push.total.size > 0) { 4442 brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) { 4443 curbe.CURBETotalDataLength = 4444 ALIGN(cs_prog_data->push.total.size, 64); 4445 curbe.CURBEDataStartAddress = stage_state->push_const_offset; 4446 } 4447 } 4448 4449 /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */ 4450 memcpy(bind, stage_state->surf_offset, 4451 prog_data->binding_table.size_bytes); 4452 const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = { 4453 .KernelStartPointer = brw->cs.base.prog_offset, 4454 .SamplerStatePointer = stage_state->sampler_offset, 4455 .SamplerCount = DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), 4456 .BindingTablePointer = stage_state->bind_bo_offset, 4457 .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs, 4458 .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads, 4459 .SharedLocalMemorySize = encode_slm_size(GEN_GEN, 4460 prog_data->total_shared), 4461 .BarrierEnable = cs_prog_data->uses_barrier, 4462#if GEN_GEN >= 8 || GEN_IS_HASWELL 4463 .CrossThreadConstantDataReadLength = 4464 cs_prog_data->push.cross_thread.regs, 4465#endif 4466 }; 4467 4468 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd); 4469 4470 brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) { 4471 load.InterfaceDescriptorTotalLength = 4472 GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); 4473 load.InterfaceDescriptorDataStartAddress = offset; 4474 } 4475} 4476 4477static const struct brw_tracked_state genX(cs_state) = { 4478 .dirty = { 4479 .mesa = _NEW_PROGRAM_CONSTANTS, 4480 .brw = BRW_NEW_BATCH | 4481 BRW_NEW_BLORP | 4482 BRW_NEW_CS_PROG_DATA | 4483 BRW_NEW_SAMPLER_STATE_TABLE | 4484 BRW_NEW_SURFACES, 4485 }, 4486 .emit = genX(upload_cs_state) 4487}; 4488 4489#define GPGPU_DISPATCHDIMX 0x2500 4490#define GPGPU_DISPATCHDIMY 0x2504 4491#define GPGPU_DISPATCHDIMZ 0x2508 4492 4493#define MI_PREDICATE_SRC0 0x2400 4494#define MI_PREDICATE_SRC1 0x2408 4495 4496static void 4497prepare_indirect_gpgpu_walker(struct brw_context *brw) 4498{ 4499 GLintptr indirect_offset = brw->compute.num_work_groups_offset; 4500 struct brw_bo *bo = brw->compute.num_work_groups_bo; 4501 4502 emit_lrm(brw, GPGPU_DISPATCHDIMX, ro_bo(bo, indirect_offset + 0)); 4503 emit_lrm(brw, GPGPU_DISPATCHDIMY, ro_bo(bo, indirect_offset + 4)); 4504 emit_lrm(brw, GPGPU_DISPATCHDIMZ, ro_bo(bo, indirect_offset + 8)); 4505 4506#if GEN_GEN <= 7 4507 /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */ 4508 emit_lri(brw, MI_PREDICATE_SRC0 + 4, 0); 4509 emit_lri(brw, MI_PREDICATE_SRC1 , 0); 4510 emit_lri(brw, MI_PREDICATE_SRC1 + 4, 0); 4511 4512 /* Load compute_dispatch_indirect_x_size into SRC0 */ 4513 emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 0)); 4514 4515 /* predicate = (compute_dispatch_indirect_x_size == 0); */ 4516 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) { 4517 mip.LoadOperation = LOAD_LOAD; 4518 mip.CombineOperation = COMBINE_SET; 4519 mip.CompareOperation = COMPARE_SRCS_EQUAL; 4520 } 4521 4522 /* Load compute_dispatch_indirect_y_size into SRC0 */ 4523 emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 4)); 4524 4525 /* predicate |= (compute_dispatch_indirect_y_size == 0); */ 4526 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) { 4527 mip.LoadOperation = LOAD_LOAD; 4528 mip.CombineOperation = 
COMBINE_OR; 4529 mip.CompareOperation = COMPARE_SRCS_EQUAL; 4530 } 4531 4532 /* Load compute_dispatch_indirect_z_size into SRC0 */ 4533 emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 8)); 4534 4535 /* predicate |= (compute_dispatch_indirect_z_size == 0); */ 4536 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) { 4537 mip.LoadOperation = LOAD_LOAD; 4538 mip.CombineOperation = COMBINE_OR; 4539 mip.CompareOperation = COMPARE_SRCS_EQUAL; 4540 } 4541 4542 /* predicate = !predicate; */ 4543#define COMPARE_FALSE 1 4544 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) { 4545 mip.LoadOperation = LOAD_LOADINV; 4546 mip.CombineOperation = COMBINE_OR; 4547 mip.CompareOperation = COMPARE_FALSE; 4548 } 4549#endif 4550} 4551 4552static void 4553genX(emit_gpgpu_walker)(struct brw_context *brw) 4554{ 4555 const struct brw_cs_prog_data *prog_data = 4556 brw_cs_prog_data(brw->cs.base.prog_data); 4557 4558 const GLuint *num_groups = brw->compute.num_work_groups; 4559 4560 bool indirect = brw->compute.num_work_groups_bo != NULL; 4561 if (indirect) 4562 prepare_indirect_gpgpu_walker(brw); 4563 4564 const unsigned simd_size = prog_data->simd_size; 4565 unsigned group_size = prog_data->local_size[0] * 4566 prog_data->local_size[1] * prog_data->local_size[2]; 4567 4568 uint32_t right_mask = 0xffffffffu >> (32 - simd_size); 4569 const unsigned right_non_aligned = group_size & (simd_size - 1); 4570 if (right_non_aligned != 0) 4571 right_mask >>= (simd_size - right_non_aligned); 4572 4573 brw_batch_emit(brw, GENX(GPGPU_WALKER), ggw) { 4574 ggw.IndirectParameterEnable = indirect; 4575 ggw.PredicateEnable = GEN_GEN <= 7 && indirect; 4576 ggw.SIMDSize = prog_data->simd_size / 16; 4577 ggw.ThreadDepthCounterMaximum = 0; 4578 ggw.ThreadHeightCounterMaximum = 0; 4579 ggw.ThreadWidthCounterMaximum = prog_data->threads - 1; 4580 ggw.ThreadGroupIDXDimension = num_groups[0]; 4581 ggw.ThreadGroupIDYDimension = num_groups[1]; 4582 ggw.ThreadGroupIDZDimension = num_groups[2]; 4583 ggw.RightExecutionMask = right_mask; 4584 ggw.BottomExecutionMask = 0xffffffff; 4585 } 4586 4587 brw_batch_emit(brw, GENX(MEDIA_STATE_FLUSH), msf); 4588} 4589 4590#endif 4591 4592/* ---------------------------------------------------------------------- */ 4593 4594#if GEN_GEN >= 8 4595static void 4596genX(upload_raster)(struct brw_context *brw) 4597{ 4598 const struct gl_context *ctx = &brw->ctx; 4599 4600 /* _NEW_BUFFERS */ 4601 const bool flip_y = ctx->DrawBuffer->FlipY; 4602 4603 /* _NEW_POLYGON */ 4604 const struct gl_polygon_attrib *polygon = &ctx->Polygon; 4605 4606 /* _NEW_POINT */ 4607 const struct gl_point_attrib *point = &ctx->Point; 4608 4609 brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) { 4610 if (brw->polygon_front_bit != flip_y) 4611 raster.FrontWinding = CounterClockwise; 4612 4613 if (polygon->CullFlag) { 4614 switch (polygon->CullFaceMode) { 4615 case GL_FRONT: 4616 raster.CullMode = CULLMODE_FRONT; 4617 break; 4618 case GL_BACK: 4619 raster.CullMode = CULLMODE_BACK; 4620 break; 4621 case GL_FRONT_AND_BACK: 4622 raster.CullMode = CULLMODE_BOTH; 4623 break; 4624 default: 4625 unreachable("not reached"); 4626 } 4627 } else { 4628 raster.CullMode = CULLMODE_NONE; 4629 } 4630 4631 raster.SmoothPointEnable = point->SmoothFlag; 4632 4633 raster.DXMultisampleRasterizationEnable = 4634 _mesa_is_multisample_enabled(ctx); 4635 4636 raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill; 4637 raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine; 4638 raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint; 4639 4640 switch 
(polygon->FrontMode) { 4641 case GL_FILL: 4642 raster.FrontFaceFillMode = FILL_MODE_SOLID; 4643 break; 4644 case GL_LINE: 4645 raster.FrontFaceFillMode = FILL_MODE_WIREFRAME; 4646 break; 4647 case GL_POINT: 4648 raster.FrontFaceFillMode = FILL_MODE_POINT; 4649 break; 4650 default: 4651 unreachable("not reached"); 4652 } 4653 4654 switch (polygon->BackMode) { 4655 case GL_FILL: 4656 raster.BackFaceFillMode = FILL_MODE_SOLID; 4657 break; 4658 case GL_LINE: 4659 raster.BackFaceFillMode = FILL_MODE_WIREFRAME; 4660 break; 4661 case GL_POINT: 4662 raster.BackFaceFillMode = FILL_MODE_POINT; 4663 break; 4664 default: 4665 unreachable("not reached"); 4666 } 4667 4668 /* _NEW_LINE */ 4669 raster.AntialiasingEnable = ctx->Line.SmoothFlag; 4670 4671#if GEN_GEN == 10 4672 /* _NEW_BUFFERS 4673 * Antialiasing Enable bit MUST not be set when NUM_MULTISAMPLES > 1. 4674 */ 4675 const bool multisampled_fbo = 4676 _mesa_geometric_samples(ctx->DrawBuffer) > 1; 4677 if (multisampled_fbo) 4678 raster.AntialiasingEnable = false; 4679#endif 4680 4681 /* _NEW_SCISSOR */ 4682 raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags; 4683 4684 /* _NEW_TRANSFORM */ 4685#if GEN_GEN < 9 4686 if (!(ctx->Transform.DepthClampNear && 4687 ctx->Transform.DepthClampFar)) 4688 raster.ViewportZClipTestEnable = true; 4689#endif 4690 4691#if GEN_GEN >= 9 4692 if (!ctx->Transform.DepthClampNear) 4693 raster.ViewportZNearClipTestEnable = true; 4694 4695 if (!ctx->Transform.DepthClampFar) 4696 raster.ViewportZFarClipTestEnable = true; 4697#endif 4698 4699 /* BRW_NEW_CONSERVATIVE_RASTERIZATION */ 4700#if GEN_GEN >= 9 4701 raster.ConservativeRasterizationEnable = 4702 ctx->IntelConservativeRasterization; 4703#endif 4704 4705 raster.GlobalDepthOffsetClamp = polygon->OffsetClamp; 4706 raster.GlobalDepthOffsetScale = polygon->OffsetFactor; 4707 4708 raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2; 4709 } 4710} 4711 4712static const struct brw_tracked_state genX(raster_state) = { 4713 .dirty = { 4714 .mesa = _NEW_BUFFERS | 4715 _NEW_LINE | 4716 _NEW_MULTISAMPLE | 4717 _NEW_POINT | 4718 _NEW_POLYGON | 4719 _NEW_SCISSOR | 4720 _NEW_TRANSFORM, 4721 .brw = BRW_NEW_BLORP | 4722 BRW_NEW_CONTEXT | 4723 BRW_NEW_CONSERVATIVE_RASTERIZATION, 4724 }, 4725 .emit = genX(upload_raster), 4726}; 4727#endif 4728 4729/* ---------------------------------------------------------------------- */ 4730 4731#if GEN_GEN >= 8 4732static void 4733genX(upload_ps_extra)(struct brw_context *brw) 4734{ 4735 UNUSED struct gl_context *ctx = &brw->ctx; 4736 4737 const struct brw_wm_prog_data *prog_data = 4738 brw_wm_prog_data(brw->wm.base.prog_data); 4739 4740 brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) { 4741 psx.PixelShaderValid = true; 4742 psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode; 4743 psx.PixelShaderKillsPixel = prog_data->uses_kill; 4744 psx.AttributeEnable = prog_data->num_varying_inputs != 0; 4745 psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth; 4746 psx.PixelShaderUsesSourceW = prog_data->uses_src_w; 4747 psx.PixelShaderIsPerSample = prog_data->persample_dispatch; 4748 4749 /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */ 4750 if (prog_data->uses_sample_mask) { 4751#if GEN_GEN >= 9 4752 if (prog_data->post_depth_coverage) 4753 psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE; 4754 else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization) 4755 psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE; 4756 else 4757 psx.InputCoverageMaskState = ICMS_NORMAL; 4758#else 4759 
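      /* Gen8 only has a single coverage-mask enable bit; the ICMS_* input
       * coverage modes above are Gen9+.
       */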
      psx.PixelShaderUsesInputCoverageMask = true;
#endif
      }

      psx.oMaskPresenttoRenderTarget = prog_data->uses_omask;
#if GEN_GEN >= 9
      psx.PixelShaderPullsBary = prog_data->pulls_bary;
      psx.PixelShaderComputesStencil = prog_data->computed_stencil;
#endif

      /* The stricter cross-primitive coherency guarantees that the
       * hardware gives us with the "Accesses UAV" bit set for at least one
       * shader stage and the "UAV coherency required" bit set on the
       * 3DPRIMITIVE command are redundant within the current image, atomic
       * counter and SSBO GL APIs, which all have very loose ordering and
       * coherency requirements and generally rely on the application to
       * insert explicit barriers when a shader invocation is expected to
       * see the memory writes performed by the invocations of some
       * previous primitive. Regardless of the value of "UAV coherency
       * required", the "Accesses UAV" bits will implicitly cause a DC
       * flush (useless in most cases) when the lowermost stage with the
       * bit set finishes execution.
       *
       * It would be nice to disable it, but in some cases we can't,
       * because on Gen8+ it also has an influence on rasterization via the
       * PS UAV-only signal (which could be set independently from the
       * coherency mechanism in the 3DSTATE_WM command on Gen7), and
       * because in some cases it will determine whether the hardware skips
       * execution of the fragment shader or not via the
       * ThreadDispatchEnable signal. However, if we know that
       * GEN8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and
       * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set, it shouldn't make
       * any difference, so we may just disable it here.
       *
       * Gen8 hardware tries to compute ThreadDispatchEnable for us, but it
       * doesn't take into account KillPixels when no depth or stencil
       * writes are enabled. In order for occlusion queries to work
       * correctly with no attachments, we need to force-enable here.
4795 * 4796 * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | 4797 * _NEW_COLOR 4798 */ 4799 if ((prog_data->has_side_effects || prog_data->uses_kill) && 4800 !brw_color_buffer_write_enabled(brw)) 4801 psx.PixelShaderHasUAV = true; 4802 } 4803} 4804 4805const struct brw_tracked_state genX(ps_extra) = { 4806 .dirty = { 4807 .mesa = _NEW_BUFFERS | _NEW_COLOR, 4808 .brw = BRW_NEW_BLORP | 4809 BRW_NEW_CONTEXT | 4810 BRW_NEW_FRAGMENT_PROGRAM | 4811 BRW_NEW_FS_PROG_DATA | 4812 BRW_NEW_CONSERVATIVE_RASTERIZATION, 4813 }, 4814 .emit = genX(upload_ps_extra), 4815}; 4816#endif 4817 4818/* ---------------------------------------------------------------------- */ 4819 4820#if GEN_GEN >= 8 4821static void 4822genX(upload_ps_blend)(struct brw_context *brw) 4823{ 4824 struct gl_context *ctx = &brw->ctx; 4825 4826 /* _NEW_BUFFERS */ 4827 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0]; 4828 const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1; 4829 4830 /* _NEW_COLOR */ 4831 struct gl_colorbuffer_attrib *color = &ctx->Color; 4832 4833 brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) { 4834 /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */ 4835 pb.HasWriteableRT = brw_color_buffer_write_enabled(brw); 4836 4837 bool alpha_to_one = false; 4838 4839 if (!buffer0_is_integer) { 4840 /* _NEW_MULTISAMPLE */ 4841 4842 if (_mesa_is_multisample_enabled(ctx)) { 4843 pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage; 4844 alpha_to_one = ctx->Multisample.SampleAlphaToOne; 4845 } 4846 4847 pb.AlphaTestEnable = color->AlphaEnabled; 4848 } 4849 4850 /* Used for implementing the following bit of GL_EXT_texture_integer: 4851 * "Per-fragment operations that require floating-point color 4852 * components, including multisample alpha operations, alpha test, 4853 * blending, and dithering, have no effect when the corresponding 4854 * colors are written to an integer color buffer." 4855 * 4856 * The OpenGL specification 3.3 (page 196), section 4.1.3 says: 4857 * "If drawbuffer zero is not NONE and the buffer it references has an 4858 * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE 4859 * operations are skipped." 4860 */ 4861 if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) { 4862 GLenum eqRGB = color->Blend[0].EquationRGB; 4863 GLenum eqA = color->Blend[0].EquationA; 4864 GLenum srcRGB = color->Blend[0].SrcRGB; 4865 GLenum dstRGB = color->Blend[0].DstRGB; 4866 GLenum srcA = color->Blend[0].SrcA; 4867 GLenum dstA = color->Blend[0].DstA; 4868 4869 if (eqRGB == GL_MIN || eqRGB == GL_MAX) 4870 srcRGB = dstRGB = GL_ONE; 4871 4872 if (eqA == GL_MIN || eqA == GL_MAX) 4873 srcA = dstA = GL_ONE; 4874 4875 /* Due to hardware limitations, the destination may have information 4876 * in an alpha channel even when the format specifies no alpha 4877 * channel. In order to avoid getting any incorrect blending due to 4878 * that alpha channel, coerce the blend factors to values that will 4879 * not read the alpha channel, but will instead use the correct 4880 * implicit value for alpha. 4881 */ 4882 if (!_mesa_base_format_has_channel(rb->_BaseFormat, 4883 GL_TEXTURE_ALPHA_TYPE)) { 4884 srcRGB = brw_fix_xRGB_alpha(srcRGB); 4885 srcA = brw_fix_xRGB_alpha(srcA); 4886 dstRGB = brw_fix_xRGB_alpha(dstRGB); 4887 dstA = brw_fix_xRGB_alpha(dstA); 4888 } 4889 4890 /* Alpha to One doesn't work with Dual Color Blending. Override 4891 * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO. 
4892 */ 4893 if (alpha_to_one && color->Blend[0]._UsesDualSrc) { 4894 srcRGB = fix_dual_blend_alpha_to_one(srcRGB); 4895 srcA = fix_dual_blend_alpha_to_one(srcA); 4896 dstRGB = fix_dual_blend_alpha_to_one(dstRGB); 4897 dstA = fix_dual_blend_alpha_to_one(dstA); 4898 } 4899 4900 /* BRW_NEW_FS_PROG_DATA */ 4901 const struct brw_wm_prog_data *wm_prog_data = 4902 brw_wm_prog_data(brw->wm.base.prog_data); 4903 4904 /* The Dual Source Blending documentation says: 4905 * 4906 * "If SRC1 is included in a src/dst blend factor and 4907 * a DualSource RT Write message is not used, results 4908 * are UNDEFINED. (This reflects the same restriction in DX APIs, 4909 * where undefined results are produced if “o1” is not written 4910 * by a PS – there are no default values defined). 4911 * If SRC1 is not included in a src/dst blend factor, 4912 * dual source blending must be disabled." 4913 * 4914 * There is no way to gracefully fix this undefined situation 4915 * so we just disable the blending to prevent possible issues. 4916 */ 4917 pb.ColorBufferBlendEnable = 4918 !color->Blend[0]._UsesDualSrc || wm_prog_data->dual_src_blend; 4919 pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA); 4920 pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA); 4921 pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB); 4922 pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB); 4923 4924 pb.IndependentAlphaBlendEnable = 4925 srcA != srcRGB || dstA != dstRGB || eqA != eqRGB; 4926 } 4927 } 4928} 4929 4930static const struct brw_tracked_state genX(ps_blend) = { 4931 .dirty = { 4932 .mesa = _NEW_BUFFERS | 4933 _NEW_COLOR | 4934 _NEW_MULTISAMPLE, 4935 .brw = BRW_NEW_BLORP | 4936 BRW_NEW_CONTEXT | 4937 BRW_NEW_FRAGMENT_PROGRAM | 4938 BRW_NEW_FS_PROG_DATA, 4939 }, 4940 .emit = genX(upload_ps_blend) 4941}; 4942#endif 4943 4944/* ---------------------------------------------------------------------- */ 4945 4946#if GEN_GEN >= 8 4947static void 4948genX(emit_vf_topology)(struct brw_context *brw) 4949{ 4950 brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) { 4951 vftopo.PrimitiveTopologyType = brw->primitive; 4952 } 4953} 4954 4955static const struct brw_tracked_state genX(vf_topology) = { 4956 .dirty = { 4957 .mesa = 0, 4958 .brw = BRW_NEW_BLORP | 4959 BRW_NEW_PRIMITIVE, 4960 }, 4961 .emit = genX(emit_vf_topology), 4962}; 4963#endif 4964 4965/* ---------------------------------------------------------------------- */ 4966 4967#if GEN_GEN >= 7 4968static void 4969genX(emit_mi_report_perf_count)(struct brw_context *brw, 4970 struct brw_bo *bo, 4971 uint32_t offset_in_bytes, 4972 uint32_t report_id) 4973{ 4974 brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) { 4975 mi_rpc.MemoryAddress = ggtt_bo(bo, offset_in_bytes); 4976 mi_rpc.ReportID = report_id; 4977 } 4978} 4979#endif 4980 4981/* ---------------------------------------------------------------------- */ 4982 4983/** 4984 * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet. 4985 */ 4986static void 4987genX(emit_sampler_state_pointers_xs)(MAYBE_UNUSED struct brw_context *brw, 4988 MAYBE_UNUSED struct brw_stage_state *stage_state) 4989{ 4990#if GEN_GEN >= 7 4991 static const uint16_t packet_headers[] = { 4992 [MESA_SHADER_VERTEX] = 43, 4993 [MESA_SHADER_TESS_CTRL] = 44, 4994 [MESA_SHADER_TESS_EVAL] = 45, 4995 [MESA_SHADER_GEOMETRY] = 46, 4996 [MESA_SHADER_FRAGMENT] = 47, 4997 }; 4998 4999 /* Ivybridge requires a workaround flush before VS packets. 
*/ 5000 if (GEN_GEN == 7 && !GEN_IS_HASWELL && 5001 stage_state->stage == MESA_SHADER_VERTEX) { 5002 gen7_emit_vs_workaround_flush(brw); 5003 } 5004 5005 brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) { 5006 ptr._3DCommandSubOpcode = packet_headers[stage_state->stage]; 5007 ptr.PointertoVSSamplerState = stage_state->sampler_offset; 5008 } 5009#endif 5010} 5011 5012UNUSED static bool 5013has_component(mesa_format format, int i) 5014{ 5015 if (_mesa_is_format_color_format(format)) 5016 return _mesa_format_has_color_component(format, i); 5017 5018 /* depth and stencil have only one component */ 5019 return i == 0; 5020} 5021 5022/** 5023 * Upload SAMPLER_BORDER_COLOR_STATE. 5024 */ 5025static void 5026genX(upload_default_color)(struct brw_context *brw, 5027 const struct gl_sampler_object *sampler, 5028 MAYBE_UNUSED mesa_format format, GLenum base_format, 5029 bool is_integer_format, bool is_stencil_sampling, 5030 uint32_t *sdc_offset) 5031{ 5032 union gl_color_union color; 5033 5034 switch (base_format) { 5035 case GL_DEPTH_COMPONENT: 5036 /* GL specs that border color for depth textures is taken from the 5037 * R channel, while the hardware uses A. Spam R into all the 5038 * channels for safety. 5039 */ 5040 color.ui[0] = sampler->BorderColor.ui[0]; 5041 color.ui[1] = sampler->BorderColor.ui[0]; 5042 color.ui[2] = sampler->BorderColor.ui[0]; 5043 color.ui[3] = sampler->BorderColor.ui[0]; 5044 break; 5045 case GL_ALPHA: 5046 color.ui[0] = 0u; 5047 color.ui[1] = 0u; 5048 color.ui[2] = 0u; 5049 color.ui[3] = sampler->BorderColor.ui[3]; 5050 break; 5051 case GL_INTENSITY: 5052 color.ui[0] = sampler->BorderColor.ui[0]; 5053 color.ui[1] = sampler->BorderColor.ui[0]; 5054 color.ui[2] = sampler->BorderColor.ui[0]; 5055 color.ui[3] = sampler->BorderColor.ui[0]; 5056 break; 5057 case GL_LUMINANCE: 5058 color.ui[0] = sampler->BorderColor.ui[0]; 5059 color.ui[1] = sampler->BorderColor.ui[0]; 5060 color.ui[2] = sampler->BorderColor.ui[0]; 5061 color.ui[3] = float_as_int(1.0); 5062 break; 5063 case GL_LUMINANCE_ALPHA: 5064 color.ui[0] = sampler->BorderColor.ui[0]; 5065 color.ui[1] = sampler->BorderColor.ui[0]; 5066 color.ui[2] = sampler->BorderColor.ui[0]; 5067 color.ui[3] = sampler->BorderColor.ui[3]; 5068 break; 5069 default: 5070 color.ui[0] = sampler->BorderColor.ui[0]; 5071 color.ui[1] = sampler->BorderColor.ui[1]; 5072 color.ui[2] = sampler->BorderColor.ui[2]; 5073 color.ui[3] = sampler->BorderColor.ui[3]; 5074 break; 5075 } 5076 5077 /* In some cases we use an RGBA surface format for GL RGB textures, 5078 * where we've initialized the A channel to 1.0. We also have to set 5079 * the border color alpha to 1.0 in that case. 
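    * (float_as_int(1.0) is the IEEE-754 single-precision bit pattern
    * 0x3f800000, the same value used for the GL_LUMINANCE alpha above.)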
5080 */ 5081 if (base_format == GL_RGB) 5082 color.ui[3] = float_as_int(1.0); 5083 5084 int alignment = 32; 5085 if (GEN_GEN >= 8) { 5086 alignment = 64; 5087 } else if (GEN_IS_HASWELL && (is_integer_format || is_stencil_sampling)) { 5088 alignment = 512; 5089 } 5090 5091 uint32_t *sdc = brw_state_batch( 5092 brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t), 5093 alignment, sdc_offset); 5094 5095 struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 }; 5096 5097#define ASSIGN(dst, src) \ 5098 do { \ 5099 dst = src; \ 5100 } while (0) 5101 5102#define ASSIGNu16(dst, src) \ 5103 do { \ 5104 dst = (uint16_t)src; \ 5105 } while (0) 5106 5107#define ASSIGNu8(dst, src) \ 5108 do { \ 5109 dst = (uint8_t)src; \ 5110 } while (0) 5111 5112#define BORDER_COLOR_ATTR(macro, _color_type, src) \ 5113 macro(state.BorderColor ## _color_type ## Red, src[0]); \ 5114 macro(state.BorderColor ## _color_type ## Green, src[1]); \ 5115 macro(state.BorderColor ## _color_type ## Blue, src[2]); \ 5116 macro(state.BorderColor ## _color_type ## Alpha, src[3]); 5117 5118#if GEN_GEN >= 8 5119 /* On Broadwell, the border color is represented as four 32-bit floats, 5120 * integers, or unsigned values, interpreted according to the surface 5121 * format. This matches the sampler->BorderColor union exactly; just 5122 * memcpy the values. 5123 */ 5124 BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui); 5125#elif GEN_IS_HASWELL 5126 if (is_integer_format || is_stencil_sampling) { 5127 bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling; 5128 const int bits_per_channel = 5129 _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS); 5130 5131 /* From the Haswell PRM, "Command Reference: Structures", Page 36: 5132 * "If any color channel is missing from the surface format, 5133 * corresponding border color should be programmed as zero and if 5134 * alpha channel is missing, corresponding Alpha border color should 5135 * be programmed as 1." 5136 */ 5137 unsigned c[4] = { 0, 0, 0, 1 }; 5138 for (int i = 0; i < 4; i++) { 5139 if (has_component(format, i)) 5140 c[i] = color.ui[i]; 5141 } 5142 5143 switch (bits_per_channel) { 5144 case 8: 5145 /* Copy RGBA in order. */ 5146 BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c); 5147 break; 5148 case 10: 5149 /* R10G10B10A2_UINT is treated like a 16-bit format. */ 5150 case 16: 5151 BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c); 5152 break; 5153 case 32: 5154 if (base_format == GL_RG) { 5155 /* Careful inspection of the tables reveals that for RG32 formats, 5156 * the green channel needs to go where blue normally belongs. 5157 */ 5158 state.BorderColor32bitRed = c[0]; 5159 state.BorderColor32bitBlue = c[1]; 5160 state.BorderColor32bitAlpha = 1; 5161 } else { 5162 /* Copy RGBA in order. 
*/ 5163 BORDER_COLOR_ATTR(ASSIGN, 32bit, c); 5164 } 5165 break; 5166 default: 5167 assert(!"Invalid number of bits per channel in integer format."); 5168 break; 5169 } 5170 } else { 5171 BORDER_COLOR_ATTR(ASSIGN, Float, color.f); 5172 } 5173#elif GEN_GEN == 5 || GEN_GEN == 6 5174 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f); 5175 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f); 5176 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f); 5177 5178#define MESA_FLOAT_TO_HALF(dst, src) \ 5179 dst = _mesa_float_to_half(src); 5180 5181 BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f); 5182 5183#undef MESA_FLOAT_TO_HALF 5184 5185 state.BorderColorSnorm8Red = state.BorderColorSnorm16Red >> 8; 5186 state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8; 5187 state.BorderColorSnorm8Blue = state.BorderColorSnorm16Blue >> 8; 5188 state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8; 5189 5190 BORDER_COLOR_ATTR(ASSIGN, Float, color.f); 5191#elif GEN_GEN == 4 5192 BORDER_COLOR_ATTR(ASSIGN, , color.f); 5193#else 5194 BORDER_COLOR_ATTR(ASSIGN, Float, color.f); 5195#endif 5196 5197#undef ASSIGN 5198#undef BORDER_COLOR_ATTR 5199 5200 GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state); 5201} 5202 5203static uint32_t 5204translate_wrap_mode(GLenum wrap, MAYBE_UNUSED bool using_nearest) 5205{ 5206 switch (wrap) { 5207 case GL_REPEAT: 5208 return TCM_WRAP; 5209 case GL_CLAMP: 5210#if GEN_GEN >= 8 5211 /* GL_CLAMP is the weird mode where coordinates are clamped to 5212 * [0.0, 1.0], so linear filtering of coordinates outside of 5213 * [0.0, 1.0] give you half edge texel value and half border 5214 * color. 5215 * 5216 * Gen8+ supports this natively. 5217 */ 5218 return TCM_HALF_BORDER; 5219#else 5220 /* On Gen4-7.5, we clamp the coordinates in the fragment shader 5221 * and set clamp_border here, which gets the result desired. 5222 * We just use clamp(_to_edge) for nearest, because for nearest 5223 * clamping to 1.0 gives border color instead of the desired 5224 * edge texels. 5225 */ 5226 if (using_nearest) 5227 return TCM_CLAMP; 5228 else 5229 return TCM_CLAMP_BORDER; 5230#endif 5231 case GL_CLAMP_TO_EDGE: 5232 return TCM_CLAMP; 5233 case GL_CLAMP_TO_BORDER: 5234 return TCM_CLAMP_BORDER; 5235 case GL_MIRRORED_REPEAT: 5236 return TCM_MIRROR; 5237 case GL_MIRROR_CLAMP_TO_EDGE: 5238 return TCM_MIRROR_ONCE; 5239 default: 5240 return TCM_WRAP; 5241 } 5242} 5243 5244/** 5245 * Return true if the given wrap mode requires the border color to exist. 5246 */ 5247static bool 5248wrap_mode_needs_border_color(unsigned wrap_mode) 5249{ 5250#if GEN_GEN >= 8 5251 return wrap_mode == TCM_CLAMP_BORDER || 5252 wrap_mode == TCM_HALF_BORDER; 5253#else 5254 return wrap_mode == TCM_CLAMP_BORDER; 5255#endif 5256} 5257 5258/** 5259 * Sets the sampler state for a single unit based off of the sampler key 5260 * entry. 5261 */ 5262static void 5263genX(update_sampler_state)(struct brw_context *brw, 5264 GLenum target, bool tex_cube_map_seamless, 5265 GLfloat tex_unit_lod_bias, 5266 mesa_format format, GLenum base_format, 5267 const struct gl_texture_object *texObj, 5268 const struct gl_sampler_object *sampler, 5269 uint32_t *sampler_state) 5270{ 5271 struct GENX(SAMPLER_STATE) samp_st = { 0 }; 5272 5273 /* Select min and mip filters. 
*/ 5274 switch (sampler->MinFilter) { 5275 case GL_NEAREST: 5276 samp_st.MinModeFilter = MAPFILTER_NEAREST; 5277 samp_st.MipModeFilter = MIPFILTER_NONE; 5278 break; 5279 case GL_LINEAR: 5280 samp_st.MinModeFilter = MAPFILTER_LINEAR; 5281 samp_st.MipModeFilter = MIPFILTER_NONE; 5282 break; 5283 case GL_NEAREST_MIPMAP_NEAREST: 5284 samp_st.MinModeFilter = MAPFILTER_NEAREST; 5285 samp_st.MipModeFilter = MIPFILTER_NEAREST; 5286 break; 5287 case GL_LINEAR_MIPMAP_NEAREST: 5288 samp_st.MinModeFilter = MAPFILTER_LINEAR; 5289 samp_st.MipModeFilter = MIPFILTER_NEAREST; 5290 break; 5291 case GL_NEAREST_MIPMAP_LINEAR: 5292 samp_st.MinModeFilter = MAPFILTER_NEAREST; 5293 samp_st.MipModeFilter = MIPFILTER_LINEAR; 5294 break; 5295 case GL_LINEAR_MIPMAP_LINEAR: 5296 samp_st.MinModeFilter = MAPFILTER_LINEAR; 5297 samp_st.MipModeFilter = MIPFILTER_LINEAR; 5298 break; 5299 default: 5300 unreachable("not reached"); 5301 } 5302 5303 /* Select mag filter. */ 5304 samp_st.MagModeFilter = sampler->MagFilter == GL_LINEAR ? 5305 MAPFILTER_LINEAR : MAPFILTER_NEAREST; 5306 5307 /* Enable anisotropic filtering if desired. */ 5308 samp_st.MaximumAnisotropy = RATIO21; 5309 5310 if (sampler->MaxAnisotropy > 1.0f) { 5311 if (samp_st.MinModeFilter == MAPFILTER_LINEAR) 5312 samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC; 5313 if (samp_st.MagModeFilter == MAPFILTER_LINEAR) 5314 samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC; 5315 5316 if (sampler->MaxAnisotropy > 2.0f) { 5317 samp_st.MaximumAnisotropy = 5318 MIN2((sampler->MaxAnisotropy - 2) / 2, RATIO161); 5319 } 5320 } 5321 5322 /* Set address rounding bits if not using nearest filtering. */ 5323 if (samp_st.MinModeFilter != MAPFILTER_NEAREST) { 5324 samp_st.UAddressMinFilterRoundingEnable = true; 5325 samp_st.VAddressMinFilterRoundingEnable = true; 5326 samp_st.RAddressMinFilterRoundingEnable = true; 5327 } 5328 5329 if (samp_st.MagModeFilter != MAPFILTER_NEAREST) { 5330 samp_st.UAddressMagFilterRoundingEnable = true; 5331 samp_st.VAddressMagFilterRoundingEnable = true; 5332 samp_st.RAddressMagFilterRoundingEnable = true; 5333 } 5334 5335 bool either_nearest = 5336 sampler->MinFilter == GL_NEAREST || sampler->MagFilter == GL_NEAREST; 5337 unsigned wrap_s = translate_wrap_mode(sampler->WrapS, either_nearest); 5338 unsigned wrap_t = translate_wrap_mode(sampler->WrapT, either_nearest); 5339 unsigned wrap_r = translate_wrap_mode(sampler->WrapR, either_nearest); 5340 5341 if (target == GL_TEXTURE_CUBE_MAP || 5342 target == GL_TEXTURE_CUBE_MAP_ARRAY) { 5343 /* Cube maps must use the same wrap mode for all three coordinate 5344 * dimensions. Prior to Haswell, only CUBE and CLAMP are valid. 5345 * 5346 * Ivybridge and Baytrail seem to have problems with CUBE mode and 5347 * integer formats. Fall back to CLAMP for now. 5348 */ 5349 if ((tex_cube_map_seamless || sampler->CubeMapSeamless) && 5350 !(GEN_GEN == 7 && !GEN_IS_HASWELL && texObj->_IsIntegerFormat)) { 5351 wrap_s = TCM_CUBE; 5352 wrap_t = TCM_CUBE; 5353 wrap_r = TCM_CUBE; 5354 } else { 5355 wrap_s = TCM_CLAMP; 5356 wrap_t = TCM_CLAMP; 5357 wrap_r = TCM_CLAMP; 5358 } 5359 } else if (target == GL_TEXTURE_1D) { 5360 /* There's a bug in 1D texture sampling - it actually pays 5361 * attention to the wrap_t value, though it should not. 5362 * Override the wrap_t value here to GL_REPEAT to keep 5363 * any nonexistent border pixels from floating in. 
5364 */ 5365 wrap_t = TCM_WRAP; 5366 } 5367 5368 samp_st.TCXAddressControlMode = wrap_s; 5369 samp_st.TCYAddressControlMode = wrap_t; 5370 samp_st.TCZAddressControlMode = wrap_r; 5371 5372 samp_st.ShadowFunction = 5373 sampler->CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ? 5374 intel_translate_shadow_compare_func(sampler->CompareFunc) : 0; 5375 5376#if GEN_GEN >= 7 5377 /* Set shadow function. */ 5378 samp_st.AnisotropicAlgorithm = 5379 samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ? 5380 EWAApproximation : LEGACY; 5381#endif 5382 5383#if GEN_GEN >= 6 5384 samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE; 5385#endif 5386 5387 const float hw_max_lod = GEN_GEN >= 7 ? 14 : 13; 5388 samp_st.MinLOD = CLAMP(sampler->MinLod, 0, hw_max_lod); 5389 samp_st.MaxLOD = CLAMP(sampler->MaxLod, 0, hw_max_lod); 5390 samp_st.TextureLODBias = 5391 CLAMP(tex_unit_lod_bias + sampler->LodBias, -16, 15); 5392 5393#if GEN_GEN == 6 5394 samp_st.BaseMipLevel = 5395 CLAMP(texObj->MinLevel + texObj->BaseLevel, 0, hw_max_lod); 5396 samp_st.MinandMagStateNotEqual = 5397 samp_st.MinModeFilter != samp_st.MagModeFilter; 5398#endif 5399 5400 /* Upload the border color if necessary. If not, just point it at 5401 * offset 0 (the start of the batch) - the color should be ignored, 5402 * but that address won't fault in case something reads it anyway. 5403 */ 5404 uint32_t border_color_offset = 0; 5405 if (wrap_mode_needs_border_color(wrap_s) || 5406 wrap_mode_needs_border_color(wrap_t) || 5407 wrap_mode_needs_border_color(wrap_r)) { 5408 genX(upload_default_color)(brw, sampler, format, base_format, 5409 texObj->_IsIntegerFormat, 5410 texObj->StencilSampling, 5411 &border_color_offset); 5412 } 5413#if GEN_GEN < 6 5414 samp_st.BorderColorPointer = 5415 ro_bo(brw->batch.state.bo, border_color_offset); 5416#else 5417 samp_st.BorderColorPointer = border_color_offset; 5418#endif 5419 5420#if GEN_GEN >= 8 5421 samp_st.LODPreClampMode = CLAMP_MODE_OGL; 5422#else 5423 samp_st.LODPreClampEnable = true; 5424#endif 5425 5426 GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st); 5427} 5428 5429static void 5430update_sampler_state(struct brw_context *brw, 5431 int unit, 5432 uint32_t *sampler_state) 5433{ 5434 struct gl_context *ctx = &brw->ctx; 5435 const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit]; 5436 const struct gl_texture_object *texObj = texUnit->_Current; 5437 const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit); 5438 5439 /* These don't use samplers at all. */ 5440 if (texObj->Target == GL_TEXTURE_BUFFER) 5441 return; 5442 5443 struct gl_texture_image *firstImage = texObj->Image[0][texObj->BaseLevel]; 5444 genX(update_sampler_state)(brw, texObj->Target, 5445 ctx->Texture.CubeMapSeamless, 5446 texUnit->LodBias, 5447 firstImage->TexFormat, firstImage->_BaseFormat, 5448 texObj, sampler, 5449 sampler_state); 5450} 5451 5452static void 5453genX(upload_sampler_state_table)(struct brw_context *brw, 5454 struct gl_program *prog, 5455 struct brw_stage_state *stage_state) 5456{ 5457 struct gl_context *ctx = &brw->ctx; 5458 uint32_t sampler_count = stage_state->sampler_count; 5459 5460 GLbitfield SamplersUsed = prog->SamplersUsed; 5461 5462 if (sampler_count == 0) 5463 return; 5464 5465 /* SAMPLER_STATE is 4 DWords on all platforms. 
*/ 5466 const int dwords = GENX(SAMPLER_STATE_length); 5467 const int size_in_bytes = dwords * sizeof(uint32_t); 5468 5469 uint32_t *sampler_state = brw_state_batch(brw, 5470 sampler_count * size_in_bytes, 5471 32, &stage_state->sampler_offset); 5472 /* memset(sampler_state, 0, sampler_count * size_in_bytes); */ 5473 5474 for (unsigned s = 0; s < sampler_count; s++) { 5475 if (SamplersUsed & (1 << s)) { 5476 const unsigned unit = prog->SamplerUnits[s]; 5477 if (ctx->Texture.Unit[unit]._Current) { 5478 update_sampler_state(brw, unit, sampler_state); 5479 } 5480 } 5481 5482 sampler_state += dwords; 5483 } 5484 5485 if (GEN_GEN >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) { 5486 /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */ 5487 genX(emit_sampler_state_pointers_xs)(brw, stage_state); 5488 } else { 5489 /* Flag that the sampler state table pointer has changed; later atoms 5490 * will handle it. 5491 */ 5492 brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE; 5493 } 5494} 5495 5496static void 5497genX(upload_fs_samplers)(struct brw_context *brw) 5498{ 5499 /* BRW_NEW_FRAGMENT_PROGRAM */ 5500 struct gl_program *fs = brw->programs[MESA_SHADER_FRAGMENT]; 5501 genX(upload_sampler_state_table)(brw, fs, &brw->wm.base); 5502} 5503 5504static const struct brw_tracked_state genX(fs_samplers) = { 5505 .dirty = { 5506 .mesa = _NEW_TEXTURE, 5507 .brw = BRW_NEW_BATCH | 5508 BRW_NEW_BLORP | 5509 BRW_NEW_FRAGMENT_PROGRAM, 5510 }, 5511 .emit = genX(upload_fs_samplers), 5512}; 5513 5514static void 5515genX(upload_vs_samplers)(struct brw_context *brw) 5516{ 5517 /* BRW_NEW_VERTEX_PROGRAM */ 5518 struct gl_program *vs = brw->programs[MESA_SHADER_VERTEX]; 5519 genX(upload_sampler_state_table)(brw, vs, &brw->vs.base); 5520} 5521 5522static const struct brw_tracked_state genX(vs_samplers) = { 5523 .dirty = { 5524 .mesa = _NEW_TEXTURE, 5525 .brw = BRW_NEW_BATCH | 5526 BRW_NEW_BLORP | 5527 BRW_NEW_VERTEX_PROGRAM, 5528 }, 5529 .emit = genX(upload_vs_samplers), 5530}; 5531 5532#if GEN_GEN >= 6 5533static void 5534genX(upload_gs_samplers)(struct brw_context *brw) 5535{ 5536 /* BRW_NEW_GEOMETRY_PROGRAM */ 5537 struct gl_program *gs = brw->programs[MESA_SHADER_GEOMETRY]; 5538 if (!gs) 5539 return; 5540 5541 genX(upload_sampler_state_table)(brw, gs, &brw->gs.base); 5542} 5543 5544 5545static const struct brw_tracked_state genX(gs_samplers) = { 5546 .dirty = { 5547 .mesa = _NEW_TEXTURE, 5548 .brw = BRW_NEW_BATCH | 5549 BRW_NEW_BLORP | 5550 BRW_NEW_GEOMETRY_PROGRAM, 5551 }, 5552 .emit = genX(upload_gs_samplers), 5553}; 5554#endif 5555 5556#if GEN_GEN >= 7 5557static void 5558genX(upload_tcs_samplers)(struct brw_context *brw) 5559{ 5560 /* BRW_NEW_TESS_PROGRAMS */ 5561 struct gl_program *tcs = brw->programs[MESA_SHADER_TESS_CTRL]; 5562 if (!tcs) 5563 return; 5564 5565 genX(upload_sampler_state_table)(brw, tcs, &brw->tcs.base); 5566} 5567 5568static const struct brw_tracked_state genX(tcs_samplers) = { 5569 .dirty = { 5570 .mesa = _NEW_TEXTURE, 5571 .brw = BRW_NEW_BATCH | 5572 BRW_NEW_BLORP | 5573 BRW_NEW_TESS_PROGRAMS, 5574 }, 5575 .emit = genX(upload_tcs_samplers), 5576}; 5577#endif 5578 5579#if GEN_GEN >= 7 5580static void 5581genX(upload_tes_samplers)(struct brw_context *brw) 5582{ 5583 /* BRW_NEW_TESS_PROGRAMS */ 5584 struct gl_program *tes = brw->programs[MESA_SHADER_TESS_EVAL]; 5585 if (!tes) 5586 return; 5587 5588 genX(upload_sampler_state_table)(brw, tes, &brw->tes.base); 5589} 5590 5591static const struct brw_tracked_state genX(tes_samplers) = { 5592 .dirty = { 5593 .mesa = _NEW_TEXTURE, 5594 .brw = 
BRW_NEW_BATCH | 5595 BRW_NEW_BLORP | 5596 BRW_NEW_TESS_PROGRAMS, 5597 }, 5598 .emit = genX(upload_tes_samplers), 5599}; 5600#endif 5601 5602#if GEN_GEN >= 7 5603static void 5604genX(upload_cs_samplers)(struct brw_context *brw) 5605{ 5606 /* BRW_NEW_COMPUTE_PROGRAM */ 5607 struct gl_program *cs = brw->programs[MESA_SHADER_COMPUTE]; 5608 if (!cs) 5609 return; 5610 5611 genX(upload_sampler_state_table)(brw, cs, &brw->cs.base); 5612} 5613 5614const struct brw_tracked_state genX(cs_samplers) = { 5615 .dirty = { 5616 .mesa = _NEW_TEXTURE, 5617 .brw = BRW_NEW_BATCH | 5618 BRW_NEW_BLORP | 5619 BRW_NEW_COMPUTE_PROGRAM, 5620 }, 5621 .emit = genX(upload_cs_samplers), 5622}; 5623#endif 5624 5625/* ---------------------------------------------------------------------- */ 5626 5627#if GEN_GEN <= 5 5628 5629static void genX(upload_blend_constant_color)(struct brw_context *brw) 5630{ 5631 struct gl_context *ctx = &brw->ctx; 5632 5633 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) { 5634 blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0]; 5635 blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1]; 5636 blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2]; 5637 blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3]; 5638 } 5639} 5640 5641static const struct brw_tracked_state genX(blend_constant_color) = { 5642 .dirty = { 5643 .mesa = _NEW_COLOR, 5644 .brw = BRW_NEW_CONTEXT | 5645 BRW_NEW_BLORP, 5646 }, 5647 .emit = genX(upload_blend_constant_color) 5648}; 5649#endif 5650 5651/* ---------------------------------------------------------------------- */ 5652 5653void 5654genX(init_atoms)(struct brw_context *brw) 5655{ 5656#if GEN_GEN < 6 5657 static const struct brw_tracked_state *render_atoms[] = 5658 { 5659 &genX(vf_statistics), 5660 5661 /* Once all the programs are done, we know how large urb entry 5662 * sizes need to be and can decide if we need to change the urb 5663 * layout. 5664 */ 5665 &brw_curbe_offsets, 5666 &brw_recalculate_urb_fence, 5667 5668 &genX(cc_vp), 5669 &genX(color_calc_state), 5670 5671 /* Surface state setup. Must come before the VS/WM unit. The binding 5672 * table upload must be last. 
5673 */ 5674 &brw_vs_pull_constants, 5675 &brw_wm_pull_constants, 5676 &brw_renderbuffer_surfaces, 5677 &brw_renderbuffer_read_surfaces, 5678 &brw_texture_surfaces, 5679 &brw_vs_binding_table, 5680 &brw_wm_binding_table, 5681 5682 &genX(fs_samplers), 5683 &genX(vs_samplers), 5684 5685 /* These set up state for brw_psp_urb_cbs */ 5686 &genX(wm_state), 5687 &genX(sf_clip_viewport), 5688 &genX(sf_state), 5689 &genX(vs_state), /* always required, enabled or not */ 5690 &genX(clip_state), 5691 &genX(gs_state), 5692 5693 /* Command packets: 5694 */ 5695 &brw_binding_table_pointers, 5696 &genX(blend_constant_color), 5697 5698 &brw_depthbuffer, 5699 5700 &genX(polygon_stipple), 5701 &genX(polygon_stipple_offset), 5702 5703 &genX(line_stipple), 5704 5705 &brw_psp_urb_cbs, 5706 5707 &genX(drawing_rect), 5708 &brw_indices, /* must come before brw_vertices */ 5709 &genX(index_buffer), 5710 &genX(vertices), 5711 5712 &brw_constant_buffer 5713 }; 5714#elif GEN_GEN == 6 5715 static const struct brw_tracked_state *render_atoms[] = 5716 { 5717 &genX(vf_statistics), 5718 5719 &genX(sf_clip_viewport), 5720 5721 /* Command packets: */ 5722 5723 &genX(cc_vp), 5724 5725 &gen6_urb, 5726 &genX(blend_state), /* must do before cc unit */ 5727 &genX(color_calc_state), /* must do before cc unit */ 5728 &genX(depth_stencil_state), /* must do before cc unit */ 5729 5730 &genX(vs_push_constants), /* Before vs_state */ 5731 &genX(gs_push_constants), /* Before gs_state */ 5732 &genX(wm_push_constants), /* Before wm_state */ 5733 5734 /* Surface state setup. Must come before the VS/WM unit. The binding 5735 * table upload must be last. 5736 */ 5737 &brw_vs_pull_constants, 5738 &brw_vs_ubo_surfaces, 5739 &brw_gs_pull_constants, 5740 &brw_gs_ubo_surfaces, 5741 &brw_wm_pull_constants, 5742 &brw_wm_ubo_surfaces, 5743 &gen6_renderbuffer_surfaces, 5744 &brw_renderbuffer_read_surfaces, 5745 &brw_texture_surfaces, 5746 &gen6_sol_surface, 5747 &brw_vs_binding_table, 5748 &gen6_gs_binding_table, 5749 &brw_wm_binding_table, 5750 5751 &genX(fs_samplers), 5752 &genX(vs_samplers), 5753 &genX(gs_samplers), 5754 &gen6_sampler_state, 5755 &genX(multisample_state), 5756 5757 &genX(vs_state), 5758 &genX(gs_state), 5759 &genX(clip_state), 5760 &genX(sf_state), 5761 &genX(wm_state), 5762 5763 &genX(scissor_state), 5764 5765 &gen6_binding_table_pointers, 5766 5767 &brw_depthbuffer, 5768 5769 &genX(polygon_stipple), 5770 &genX(polygon_stipple_offset), 5771 5772 &genX(line_stipple), 5773 5774 &genX(drawing_rect), 5775 5776 &brw_indices, /* must come before brw_vertices */ 5777 &genX(index_buffer), 5778 &genX(vertices), 5779 }; 5780#elif GEN_GEN == 7 5781 static const struct brw_tracked_state *render_atoms[] = 5782 { 5783 &genX(vf_statistics), 5784 5785 /* Command packets: */ 5786 5787 &genX(cc_vp), 5788 &genX(sf_clip_viewport), 5789 5790 &gen7_l3_state, 5791 &gen7_push_constant_space, 5792 &gen7_urb, 5793 &genX(blend_state), /* must do before cc unit */ 5794 &genX(color_calc_state), /* must do before cc unit */ 5795 &genX(depth_stencil_state), /* must do before cc unit */ 5796 5797 &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */ 5798 &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */ 5799 &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */ 5800 &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */ 5801 &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */ 5802 5803 &genX(vs_push_constants), /* Before vs_state */ 
5804 &genX(tcs_push_constants), 5805 &genX(tes_push_constants), 5806 &genX(gs_push_constants), /* Before gs_state */ 5807 &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */ 5808 5809 /* Surface state setup. Must come before the VS/WM unit. The binding 5810 * table upload must be last. 5811 */ 5812 &brw_vs_pull_constants, 5813 &brw_vs_ubo_surfaces, 5814 &brw_tcs_pull_constants, 5815 &brw_tcs_ubo_surfaces, 5816 &brw_tes_pull_constants, 5817 &brw_tes_ubo_surfaces, 5818 &brw_gs_pull_constants, 5819 &brw_gs_ubo_surfaces, 5820 &brw_wm_pull_constants, 5821 &brw_wm_ubo_surfaces, 5822 &gen6_renderbuffer_surfaces, 5823 &brw_renderbuffer_read_surfaces, 5824 &brw_texture_surfaces, 5825 5826 &genX(push_constant_packets), 5827 5828 &brw_vs_binding_table, 5829 &brw_tcs_binding_table, 5830 &brw_tes_binding_table, 5831 &brw_gs_binding_table, 5832 &brw_wm_binding_table, 5833 5834 &genX(fs_samplers), 5835 &genX(vs_samplers), 5836 &genX(tcs_samplers), 5837 &genX(tes_samplers), 5838 &genX(gs_samplers), 5839 &genX(multisample_state), 5840 5841 &genX(vs_state), 5842 &genX(hs_state), 5843 &genX(te_state), 5844 &genX(ds_state), 5845 &genX(gs_state), 5846 &genX(sol_state), 5847 &genX(clip_state), 5848 &genX(sbe_state), 5849 &genX(sf_state), 5850 &genX(wm_state), 5851 &genX(ps_state), 5852 5853 &genX(scissor_state), 5854 5855 &brw_depthbuffer, 5856 5857 &genX(polygon_stipple), 5858 &genX(polygon_stipple_offset), 5859 5860 &genX(line_stipple), 5861 5862 &genX(drawing_rect), 5863 5864 &brw_indices, /* must come before brw_vertices */ 5865 &genX(index_buffer), 5866 &genX(vertices), 5867 5868#if GEN_IS_HASWELL 5869 &genX(cut_index), 5870#endif 5871 }; 5872#elif GEN_GEN >= 8 5873 static const struct brw_tracked_state *render_atoms[] = 5874 { 5875 &genX(vf_statistics), 5876 5877 &genX(cc_vp), 5878 &genX(sf_clip_viewport), 5879 5880 &gen7_l3_state, 5881 &gen7_push_constant_space, 5882 &gen7_urb, 5883 &genX(blend_state), 5884 &genX(color_calc_state), 5885 5886 &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */ 5887 &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */ 5888 &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */ 5889 &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */ 5890 &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */ 5891 5892 &genX(vs_push_constants), /* Before vs_state */ 5893 &genX(tcs_push_constants), 5894 &genX(tes_push_constants), 5895 &genX(gs_push_constants), /* Before gs_state */ 5896 &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */ 5897 5898 /* Surface state setup. Must come before the VS/WM unit. The binding 5899 * table upload must be last. 
5900 */ 5901 &brw_vs_pull_constants, 5902 &brw_vs_ubo_surfaces, 5903 &brw_tcs_pull_constants, 5904 &brw_tcs_ubo_surfaces, 5905 &brw_tes_pull_constants, 5906 &brw_tes_ubo_surfaces, 5907 &brw_gs_pull_constants, 5908 &brw_gs_ubo_surfaces, 5909 &brw_wm_pull_constants, 5910 &brw_wm_ubo_surfaces, 5911 &gen6_renderbuffer_surfaces, 5912 &brw_renderbuffer_read_surfaces, 5913 &brw_texture_surfaces, 5914 5915 &genX(push_constant_packets), 5916 5917 &brw_vs_binding_table, 5918 &brw_tcs_binding_table, 5919 &brw_tes_binding_table, 5920 &brw_gs_binding_table, 5921 &brw_wm_binding_table, 5922 5923 &genX(fs_samplers), 5924 &genX(vs_samplers), 5925 &genX(tcs_samplers), 5926 &genX(tes_samplers), 5927 &genX(gs_samplers), 5928 &genX(multisample_state), 5929 5930 &genX(vs_state), 5931 &genX(hs_state), 5932 &genX(te_state), 5933 &genX(ds_state), 5934 &genX(gs_state), 5935 &genX(sol_state), 5936 &genX(clip_state), 5937 &genX(raster_state), 5938 &genX(sbe_state), 5939 &genX(sf_state), 5940 &genX(ps_blend), 5941 &genX(ps_extra), 5942 &genX(ps_state), 5943 &genX(depth_stencil_state), 5944 &genX(wm_state), 5945 5946 &genX(scissor_state), 5947 5948 &brw_depthbuffer, 5949 5950 &genX(polygon_stipple), 5951 &genX(polygon_stipple_offset), 5952 5953 &genX(line_stipple), 5954 5955 &genX(drawing_rect), 5956 5957 &genX(vf_topology), 5958 5959 &brw_indices, 5960 &genX(index_buffer), 5961 &genX(vertices), 5962 5963 &genX(cut_index), 5964 &gen8_pma_fix, 5965 }; 5966#endif 5967 5968 STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms)); 5969 brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE, 5970 render_atoms, ARRAY_SIZE(render_atoms)); 5971 5972#if GEN_GEN >= 7 5973 static const struct brw_tracked_state *compute_atoms[] = 5974 { 5975 &gen7_l3_state, 5976 &brw_cs_image_surfaces, 5977 &genX(cs_push_constants), 5978 &genX(cs_pull_constants), 5979 &brw_cs_ubo_surfaces, 5980 &brw_cs_texture_surfaces, 5981 &brw_cs_work_groups_surface, 5982 &genX(cs_samplers), 5983 &genX(cs_state), 5984 }; 5985 5986 STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms)); 5987 brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE, 5988 compute_atoms, ARRAY_SIZE(compute_atoms)); 5989 5990 brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count); 5991 brw->vtbl.emit_compute_walker = genX(emit_gpgpu_walker); 5992#endif 5993} 5994