1/* 2 * Copyright © 2017 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include <assert.h> 25 26#include "main/samplerobj.h" 27 28#include "dev/intel_device_info.h" 29#include "common/intel_sample_positions.h" 30#include "genxml/gen_macros.h" 31#include "common/intel_guardband.h" 32 33#include "main/bufferobj.h" 34#include "main/context.h" 35#include "main/enums.h" 36#include "main/macros.h" 37#include "main/state.h" 38 39#include "genX_boilerplate.h" 40 41#include "brw_context.h" 42#include "brw_cs.h" 43#include "brw_draw.h" 44#include "brw_multisample_state.h" 45#include "brw_state.h" 46#include "brw_wm.h" 47#include "brw_util.h" 48 49#include "brw_batch.h" 50#include "brw_buffer_objects.h" 51#include "brw_fbo.h" 52 53#include "main/enums.h" 54#include "main/fbobject.h" 55#include "main/framebuffer.h" 56#include "main/glformats.h" 57#include "main/shaderapi.h" 58#include "main/stencil.h" 59#include "main/transformfeedback.h" 60#include "main/varray.h" 61#include "main/viewport.h" 62#include "util/half_float.h" 63 64#if GFX_VER == 4 65static struct brw_address 66KSP(struct brw_context *brw, uint32_t offset) 67{ 68 return ro_bo(brw->cache.bo, offset); 69} 70#else 71static uint32_t 72KSP(UNUSED struct brw_context *brw, uint32_t offset) 73{ 74 return offset; 75} 76#endif 77 78#if GFX_VER >= 7 79static void 80emit_lrm(struct brw_context *brw, uint32_t reg, struct brw_address addr) 81{ 82 brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_MEM), lrm) { 83 lrm.RegisterAddress = reg; 84 lrm.MemoryAddress = addr; 85 } 86} 87#endif 88 89#if GFX_VER == 7 90static void 91emit_lri(struct brw_context *brw, uint32_t reg, uint32_t imm) 92{ 93 brw_batch_emit(brw, GENX(MI_LOAD_REGISTER_IMM), lri) { 94 lri.RegisterOffset = reg; 95 lri.DataDWord = imm; 96 } 97} 98#endif 99 100/** 101 * Polygon stipple packet 102 */ 103static void 104genX(upload_polygon_stipple)(struct brw_context *brw) 105{ 106 struct gl_context *ctx = &brw->ctx; 107 108 /* _NEW_POLYGON */ 109 if (!ctx->Polygon.StippleFlag) 110 return; 111 112 brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_PATTERN), poly) { 113 /* Polygon stipple is provided in OpenGL order, i.e. bottom 114 * row first. If we're rendering to a window (i.e. the 115 * default frame buffer object, 0), then we need to invert 116 * it to match our pixel layout. But if we're rendering 117 * to a FBO (i.e. any named frame buffer object), we *don't* 118 * need to invert - we already match the layout. 119 */ 120 if (ctx->DrawBuffer->FlipY) { 121 for (unsigned i = 0; i < 32; i++) 122 poly.PatternRow[i] = ctx->PolygonStipple[31 - i]; /* invert */ 123 } else { 124 for (unsigned i = 0; i < 32; i++) 125 poly.PatternRow[i] = ctx->PolygonStipple[i]; 126 } 127 } 128} 129 130static const struct brw_tracked_state genX(polygon_stipple) = { 131 .dirty = { 132 .mesa = _NEW_POLYGON | 133 _NEW_POLYGONSTIPPLE, 134 .brw = BRW_NEW_CONTEXT, 135 }, 136 .emit = genX(upload_polygon_stipple), 137}; 138 139/** 140 * Polygon stipple offset packet 141 */ 142static void 143genX(upload_polygon_stipple_offset)(struct brw_context *brw) 144{ 145 struct gl_context *ctx = &brw->ctx; 146 147 /* _NEW_POLYGON */ 148 if (!ctx->Polygon.StippleFlag) 149 return; 150 151 brw_batch_emit(brw, GENX(3DSTATE_POLY_STIPPLE_OFFSET), poly) { 152 /* _NEW_BUFFERS 153 * 154 * If we're drawing to a system window we have to invert the Y axis 155 * in order to match the OpenGL pixel coordinate system, and our 156 * offset must be matched to the window position. If we're drawing 157 * to a user-created FBO then our native pixel coordinate system 158 * works just fine, and there's no window system to worry about. 159 */ 160 if (ctx->DrawBuffer->FlipY) { 161 poly.PolygonStippleYOffset = 162 (32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31; 163 } 164 } 165} 166 167static const struct brw_tracked_state genX(polygon_stipple_offset) = { 168 .dirty = { 169 .mesa = _NEW_BUFFERS | 170 _NEW_POLYGON, 171 .brw = BRW_NEW_CONTEXT, 172 }, 173 .emit = genX(upload_polygon_stipple_offset), 174}; 175 176/** 177 * Line stipple packet 178 */ 179static void 180genX(upload_line_stipple)(struct brw_context *brw) 181{ 182 struct gl_context *ctx = &brw->ctx; 183 184 if (!ctx->Line.StippleFlag) 185 return; 186 187 brw_batch_emit(brw, GENX(3DSTATE_LINE_STIPPLE), line) { 188 line.LineStipplePattern = ctx->Line.StipplePattern; 189 190 line.LineStippleInverseRepeatCount = 1.0f / ctx->Line.StippleFactor; 191 line.LineStippleRepeatCount = ctx->Line.StippleFactor; 192 } 193} 194 195static const struct brw_tracked_state genX(line_stipple) = { 196 .dirty = { 197 .mesa = _NEW_LINE, 198 .brw = BRW_NEW_CONTEXT, 199 }, 200 .emit = genX(upload_line_stipple), 201}; 202 203/* Constant single cliprect for framebuffer object or DRI2 drawing */ 204static void 205genX(upload_drawing_rect)(struct brw_context *brw) 206{ 207 struct gl_context *ctx = &brw->ctx; 208 const struct gl_framebuffer *fb = ctx->DrawBuffer; 209 const unsigned int fb_width = _mesa_geometric_width(fb); 210 const unsigned int fb_height = _mesa_geometric_height(fb); 211 212 brw_batch_emit(brw, GENX(3DSTATE_DRAWING_RECTANGLE), rect) { 213 rect.ClippedDrawingRectangleXMax = fb_width - 1; 214 rect.ClippedDrawingRectangleYMax = fb_height - 1; 215 } 216} 217 218static const struct brw_tracked_state genX(drawing_rect) = { 219 .dirty = { 220 .mesa = _NEW_BUFFERS, 221 .brw = BRW_NEW_BLORP | 222 BRW_NEW_CONTEXT, 223 }, 224 .emit = genX(upload_drawing_rect), 225}; 226 227static uint32_t * 228genX(emit_vertex_buffer_state)(struct brw_context *brw, 229 uint32_t *dw, 230 unsigned buffer_nr, 231 struct brw_bo *bo, 232 unsigned start_offset, 233 UNUSED unsigned end_offset, 234 unsigned stride, 235 UNUSED unsigned step_rate) 236{ 237 struct GENX(VERTEX_BUFFER_STATE) buf_state = { 238 .VertexBufferIndex = buffer_nr, 239 .BufferPitch = stride, 240 241 /* The VF cache designers apparently cut corners, and made the cache 242 * only consider the bottom 32 bits of memory addresses. If you happen 243 * to have two vertex buffers which get placed exactly 4 GiB apart and 244 * use them in back-to-back draw calls, you can get collisions. To work 245 * around this problem, we restrict vertex buffers to the low 32 bits of 246 * the address space. 247 */ 248 .BufferStartingAddress = ro_32_bo(bo, start_offset), 249#if GFX_VER >= 8 250 .BufferSize = end_offset - start_offset, 251#endif 252 253#if GFX_VER >= 7 254 .AddressModifyEnable = true, 255#endif 256 257#if GFX_VER < 8 258 .BufferAccessType = step_rate ? INSTANCEDATA : VERTEXDATA, 259 .InstanceDataStepRate = step_rate, 260#if GFX_VER >= 5 261 .EndAddress = ro_bo(bo, end_offset - 1), 262#endif 263#endif 264 265#if GFX_VER == 11 266 .MOCS = ICL_MOCS_WB, 267#elif GFX_VER == 10 268 .MOCS = CNL_MOCS_WB, 269#elif GFX_VER == 9 270 .MOCS = SKL_MOCS_WB, 271#elif GFX_VER == 8 272 .MOCS = BDW_MOCS_WB, 273#elif GFX_VER == 7 274 .MOCS = GFX7_MOCS_L3, 275#endif 276 }; 277 278 GENX(VERTEX_BUFFER_STATE_pack)(brw, dw, &buf_state); 279 return dw + GENX(VERTEX_BUFFER_STATE_length); 280} 281 282UNUSED static bool 283is_passthru_format(uint32_t format) 284{ 285 switch (format) { 286 case ISL_FORMAT_R64_PASSTHRU: 287 case ISL_FORMAT_R64G64_PASSTHRU: 288 case ISL_FORMAT_R64G64B64_PASSTHRU: 289 case ISL_FORMAT_R64G64B64A64_PASSTHRU: 290 return true; 291 default: 292 return false; 293 } 294} 295 296UNUSED static int 297uploads_needed(uint32_t format, 298 bool is_dual_slot) 299{ 300 if (!is_passthru_format(format)) 301 return 1; 302 303 if (is_dual_slot) 304 return 2; 305 306 switch (format) { 307 case ISL_FORMAT_R64_PASSTHRU: 308 case ISL_FORMAT_R64G64_PASSTHRU: 309 return 1; 310 case ISL_FORMAT_R64G64B64_PASSTHRU: 311 case ISL_FORMAT_R64G64B64A64_PASSTHRU: 312 return 2; 313 default: 314 unreachable("not reached"); 315 } 316} 317 318/* 319 * Returns the format that we are finally going to use when upload a vertex 320 * element. It will only change if we are using *64*PASSTHRU formats, as for 321 * gen < 8 they need to be splitted on two *32*FLOAT formats. 322 * 323 * @upload points in which upload we are. Valid values are [0,1] 324 */ 325static uint32_t 326downsize_format_if_needed(uint32_t format, 327 int upload) 328{ 329 assert(upload == 0 || upload == 1); 330 331 if (!is_passthru_format(format)) 332 return format; 333 334 /* ISL_FORMAT_R64_PASSTHRU and ISL_FORMAT_R64G64_PASSTHRU with an upload == 335 * 1 means that we have been forced to do 2 uploads for a size <= 2. This 336 * happens with gen < 8 and dvec3 or dvec4 vertex shader input 337 * variables. In those cases, we return ISL_FORMAT_R32_FLOAT as a way of 338 * flagging that we want to fill with zeroes this second forced upload. 339 */ 340 switch (format) { 341 case ISL_FORMAT_R64_PASSTHRU: 342 return upload == 0 ? ISL_FORMAT_R32G32_FLOAT 343 : ISL_FORMAT_R32_FLOAT; 344 case ISL_FORMAT_R64G64_PASSTHRU: 345 return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT 346 : ISL_FORMAT_R32_FLOAT; 347 case ISL_FORMAT_R64G64B64_PASSTHRU: 348 return upload == 0 ? ISL_FORMAT_R32G32B32A32_FLOAT 349 : ISL_FORMAT_R32G32_FLOAT; 350 case ISL_FORMAT_R64G64B64A64_PASSTHRU: 351 return ISL_FORMAT_R32G32B32A32_FLOAT; 352 default: 353 unreachable("not reached"); 354 } 355} 356 357/* 358 * Returns the number of componentes associated with a format that is used on 359 * a 64 to 32 format split. See downsize_format() 360 */ 361static int 362upload_format_size(uint32_t upload_format) 363{ 364 switch (upload_format) { 365 case ISL_FORMAT_R32_FLOAT: 366 367 /* downsized_format has returned this one in order to flag that we are 368 * performing a second upload which we want to have filled with 369 * zeroes. This happens with gen < 8, a size <= 2, and dvec3 or dvec4 370 * vertex shader input variables. 371 */ 372 373 return 0; 374 case ISL_FORMAT_R32G32_FLOAT: 375 return 2; 376 case ISL_FORMAT_R32G32B32A32_FLOAT: 377 return 4; 378 default: 379 unreachable("not reached"); 380 } 381} 382 383static UNUSED uint16_t 384pinned_bo_high_bits(struct brw_bo *bo) 385{ 386 return (bo->kflags & EXEC_OBJECT_PINNED) ? bo->gtt_offset >> 32ull : 0; 387} 388 389/* The VF cache designers apparently cut corners, and made the cache key's 390 * <VertexBufferIndex, Memory Address> tuple only consider the bottom 32 bits 391 * of the address. If you happen to have two vertex buffers which get placed 392 * exactly 4 GiB apart and use them in back-to-back draw calls, you can get 393 * collisions. (These collisions can happen within a single batch.) 394 * 395 * In the soft-pin world, we'd like to assign addresses up front, and never 396 * move buffers. So, we need to do a VF cache invalidate if the buffer for 397 * a particular VB slot has different [48:32] address bits than the last one. 398 * 399 * In the relocation world, we have no idea what the addresses will be, so 400 * we can't apply this workaround. Instead, we tell the kernel to move it 401 * to the low 4GB regardless. 402 * 403 * This HW issue is gone on Gfx11+. 404 */ 405static void 406vf_invalidate_for_vb_48bit_transitions(UNUSED struct brw_context *brw) 407{ 408#if GFX_VER >= 8 && GFX_VER < 11 409 bool need_invalidate = false; 410 411 for (unsigned i = 0; i < brw->vb.nr_buffers; i++) { 412 uint16_t high_bits = pinned_bo_high_bits(brw->vb.buffers[i].bo); 413 414 if (high_bits != brw->vb.last_bo_high_bits[i]) { 415 need_invalidate = true; 416 brw->vb.last_bo_high_bits[i] = high_bits; 417 } 418 } 419 420 if (brw->draw.draw_params_bo) { 421 uint16_t high_bits = pinned_bo_high_bits(brw->draw.draw_params_bo); 422 423 if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers] != high_bits) { 424 need_invalidate = true; 425 brw->vb.last_bo_high_bits[brw->vb.nr_buffers] = high_bits; 426 } 427 } 428 429 if (brw->draw.derived_draw_params_bo) { 430 uint16_t high_bits = pinned_bo_high_bits(brw->draw.derived_draw_params_bo); 431 432 if (brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] != high_bits) { 433 need_invalidate = true; 434 brw->vb.last_bo_high_bits[brw->vb.nr_buffers + 1] = high_bits; 435 } 436 } 437 438 if (need_invalidate) { 439 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_CS_STALL); 440 } 441#endif 442} 443 444static void 445vf_invalidate_for_ib_48bit_transition(UNUSED struct brw_context *brw) 446{ 447#if GFX_VER >= 8 448 uint16_t high_bits = pinned_bo_high_bits(brw->ib.bo); 449 450 if (high_bits != brw->ib.last_bo_high_bits) { 451 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_VF_CACHE_INVALIDATE); 452 brw->ib.last_bo_high_bits = high_bits; 453 } 454#endif 455} 456 457static void 458genX(emit_vertices)(struct brw_context *brw) 459{ 460 const struct intel_device_info *devinfo = &brw->screen->devinfo; 461 uint32_t *dw; 462 463 brw_prepare_vertices(brw); 464 brw_prepare_shader_draw_parameters(brw); 465 466#if GFX_VER < 6 467 brw_emit_query_begin(brw); 468#endif 469 470 const struct brw_vs_prog_data *vs_prog_data = 471 brw_vs_prog_data(brw->vs.base.prog_data); 472 473#if GFX_VER >= 8 474 struct gl_context *ctx = &brw->ctx; 475 const bool uses_edge_flag = (ctx->Polygon.FrontMode != GL_FILL || 476 ctx->Polygon.BackMode != GL_FILL); 477 478 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) { 479 unsigned vue = brw->vb.nr_enabled; 480 481 /* The element for the edge flags must always be last, so we have to 482 * insert the SGVS before it in that case. 483 */ 484 if (uses_edge_flag) { 485 assert(vue > 0); 486 vue--; 487 } 488 489 WARN_ONCE(vue >= 33, 490 "Trying to insert VID/IID past 33rd vertex element, " 491 "need to reorder the vertex attrbutes."); 492 493 brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs) { 494 if (vs_prog_data->uses_vertexid) { 495 vfs.VertexIDEnable = true; 496 vfs.VertexIDComponentNumber = 2; 497 vfs.VertexIDElementOffset = vue; 498 } 499 500 if (vs_prog_data->uses_instanceid) { 501 vfs.InstanceIDEnable = true; 502 vfs.InstanceIDComponentNumber = 3; 503 vfs.InstanceIDElementOffset = vue; 504 } 505 } 506 507 brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) { 508 vfi.InstancingEnable = true; 509 vfi.VertexElementIndex = vue; 510 } 511 } else { 512 brw_batch_emit(brw, GENX(3DSTATE_VF_SGVS), vfs); 513 } 514#endif 515 516 const bool uses_draw_params = 517 vs_prog_data->uses_firstvertex || 518 vs_prog_data->uses_baseinstance; 519 520 const bool uses_derived_draw_params = 521 vs_prog_data->uses_drawid || 522 vs_prog_data->uses_is_indexed_draw; 523 524 const bool needs_sgvs_element = (uses_draw_params || 525 vs_prog_data->uses_instanceid || 526 vs_prog_data->uses_vertexid); 527 528 unsigned nr_elements = 529 brw->vb.nr_enabled + needs_sgvs_element + uses_derived_draw_params; 530 531#if GFX_VER < 8 532 /* If any of the formats of vb.enabled needs more that one upload, we need 533 * to add it to nr_elements 534 */ 535 for (unsigned i = 0; i < brw->vb.nr_enabled; i++) { 536 struct brw_vertex_element *input = brw->vb.enabled[i]; 537 uint32_t format = brw_get_vertex_surface_type(brw, input->glformat); 538 539 if (uploads_needed(format, input->is_dual_slot) > 1) 540 nr_elements++; 541 } 542#endif 543 544 /* If the VS doesn't read any inputs (calculating vertex position from 545 * a state variable for some reason, for example), emit a single pad 546 * VERTEX_ELEMENT struct and bail. 547 * 548 * The stale VB state stays in place, but they don't do anything unless 549 * a VE loads from them. 550 */ 551 if (nr_elements == 0) { 552 dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS), 553 1 + GENX(VERTEX_ELEMENT_STATE_length)); 554 struct GENX(VERTEX_ELEMENT_STATE) elem = { 555 .Valid = true, 556 .SourceElementFormat = ISL_FORMAT_R32G32B32A32_FLOAT, 557 .Component0Control = VFCOMP_STORE_0, 558 .Component1Control = VFCOMP_STORE_0, 559 .Component2Control = VFCOMP_STORE_0, 560 .Component3Control = VFCOMP_STORE_1_FP, 561 }; 562 GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem); 563 return; 564 } 565 566 /* Now emit 3DSTATE_VERTEX_BUFFERS and 3DSTATE_VERTEX_ELEMENTS packets. */ 567 const unsigned nr_buffers = brw->vb.nr_buffers + 568 uses_draw_params + uses_derived_draw_params; 569 570 vf_invalidate_for_vb_48bit_transitions(brw); 571 572 if (nr_buffers) { 573 assert(nr_buffers <= (GFX_VER >= 6 ? 33 : 17)); 574 575 dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_BUFFERS), 576 1 + GENX(VERTEX_BUFFER_STATE_length) * nr_buffers); 577 578 for (unsigned i = 0; i < brw->vb.nr_buffers; i++) { 579 const struct brw_vertex_buffer *buffer = &brw->vb.buffers[i]; 580 /* Prior to Haswell and Bay Trail we have to use 4-component formats 581 * to fake 3-component ones. In particular, we do this for 582 * half-float and 8 and 16-bit integer formats. This means that the 583 * vertex element may poke over the end of the buffer by 2 bytes. 584 */ 585 const unsigned padding = 586 (GFX_VERx10 < 75 && !devinfo->is_baytrail) * 2; 587 const unsigned end = buffer->offset + buffer->size + padding; 588 dw = genX(emit_vertex_buffer_state)(brw, dw, i, buffer->bo, 589 buffer->offset, 590 end, 591 buffer->stride, 592 buffer->step_rate); 593 } 594 595 if (uses_draw_params) { 596 dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers, 597 brw->draw.draw_params_bo, 598 brw->draw.draw_params_offset, 599 brw->draw.draw_params_bo->size, 600 0 /* stride */, 601 0 /* step rate */); 602 } 603 604 if (uses_derived_draw_params) { 605 dw = genX(emit_vertex_buffer_state)(brw, dw, brw->vb.nr_buffers + 1, 606 brw->draw.derived_draw_params_bo, 607 brw->draw.derived_draw_params_offset, 608 brw->draw.derived_draw_params_bo->size, 609 0 /* stride */, 610 0 /* step rate */); 611 } 612 } 613 614 /* The hardware allows one more VERTEX_ELEMENTS than VERTEX_BUFFERS, 615 * presumably for VertexID/InstanceID. 616 */ 617#if GFX_VER >= 6 618 assert(nr_elements <= 34); 619 const struct brw_vertex_element *gfx6_edgeflag_input = NULL; 620#else 621 assert(nr_elements <= 18); 622#endif 623 624 dw = brw_batch_emitn(brw, GENX(3DSTATE_VERTEX_ELEMENTS), 625 1 + GENX(VERTEX_ELEMENT_STATE_length) * nr_elements); 626 unsigned i; 627 for (i = 0; i < brw->vb.nr_enabled; i++) { 628 const struct brw_vertex_element *input = brw->vb.enabled[i]; 629 const struct gl_vertex_format *glformat = input->glformat; 630 uint32_t format = brw_get_vertex_surface_type(brw, glformat); 631 uint32_t comp0 = VFCOMP_STORE_SRC; 632 uint32_t comp1 = VFCOMP_STORE_SRC; 633 uint32_t comp2 = VFCOMP_STORE_SRC; 634 uint32_t comp3 = VFCOMP_STORE_SRC; 635 const unsigned num_uploads = GFX_VER < 8 ? 636 uploads_needed(format, input->is_dual_slot) : 1; 637 638#if GFX_VER >= 8 639 /* From the BDW PRM, Volume 2d, page 588 (VERTEX_ELEMENT_STATE): 640 * "Any SourceElementFormat of *64*_PASSTHRU cannot be used with an 641 * element which has edge flag enabled." 642 */ 643 assert(!(is_passthru_format(format) && uses_edge_flag)); 644#endif 645 646 /* The gfx4 driver expects edgeflag to come in as a float, and passes 647 * that float on to the tests in the clipper. Mesa's current vertex 648 * attribute value for EdgeFlag is stored as a float, which works out. 649 * glEdgeFlagPointer, on the other hand, gives us an unnormalized 650 * integer ubyte. Just rewrite that to convert to a float. 651 * 652 * Gfx6+ passes edgeflag as sideband along with the vertex, instead 653 * of in the VUE. We have to upload it sideband as the last vertex 654 * element according to the B-Spec. 655 */ 656#if GFX_VER >= 6 657 if (input == &brw->vb.inputs[VERT_ATTRIB_EDGEFLAG]) { 658 gfx6_edgeflag_input = input; 659 continue; 660 } 661#endif 662 663 for (unsigned c = 0; c < num_uploads; c++) { 664 const uint32_t upload_format = GFX_VER >= 8 ? format : 665 downsize_format_if_needed(format, c); 666 /* If we need more that one upload, the offset stride would be 128 667 * bits (16 bytes), as for previous uploads we are using the full 668 * entry. */ 669 const unsigned offset = input->offset + c * 16; 670 671 const int size = (GFX_VER < 8 && is_passthru_format(format)) ? 672 upload_format_size(upload_format) : glformat->Size; 673 674 switch (size) { 675 case 0: comp0 = VFCOMP_STORE_0; FALLTHROUGH; 676 case 1: comp1 = VFCOMP_STORE_0; FALLTHROUGH; 677 case 2: comp2 = VFCOMP_STORE_0; FALLTHROUGH; 678 case 3: 679 if (GFX_VER >= 8 && glformat->Doubles) { 680 comp3 = VFCOMP_STORE_0; 681 } else if (glformat->Integer) { 682 comp3 = VFCOMP_STORE_1_INT; 683 } else { 684 comp3 = VFCOMP_STORE_1_FP; 685 } 686 687 break; 688 } 689 690#if GFX_VER >= 8 691 /* From the BDW PRM, Volume 2d, page 586 (VERTEX_ELEMENT_STATE): 692 * 693 * "When SourceElementFormat is set to one of the *64*_PASSTHRU 694 * formats, 64-bit components are stored in the URB without any 695 * conversion. In this case, vertex elements must be written as 128 696 * or 256 bits, with VFCOMP_STORE_0 being used to pad the output as 697 * required. E.g., if R64_PASSTHRU is used to copy a 64-bit Red 698 * component into the URB, Component 1 must be specified as 699 * VFCOMP_STORE_0 (with Components 2,3 set to VFCOMP_NOSTORE) in 700 * order to output a 128-bit vertex element, or Components 1-3 must 701 * be specified as VFCOMP_STORE_0 in order to output a 256-bit vertex 702 * element. Likewise, use of R64G64B64_PASSTHRU requires Component 3 703 * to be specified as VFCOMP_STORE_0 in order to output a 256-bit 704 * vertex element." 705 */ 706 if (glformat->Doubles && !input->is_dual_slot) { 707 /* Store vertex elements which correspond to double and dvec2 vertex 708 * shader inputs as 128-bit vertex elements, instead of 256-bits. 709 */ 710 comp2 = VFCOMP_NOSTORE; 711 comp3 = VFCOMP_NOSTORE; 712 } 713#endif 714 715 struct GENX(VERTEX_ELEMENT_STATE) elem_state = { 716 .VertexBufferIndex = input->buffer, 717 .Valid = true, 718 .SourceElementFormat = upload_format, 719 .SourceElementOffset = offset, 720 .Component0Control = comp0, 721 .Component1Control = comp1, 722 .Component2Control = comp2, 723 .Component3Control = comp3, 724#if GFX_VER < 5 725 .DestinationElementOffset = i * 4, 726#endif 727 }; 728 729 GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state); 730 dw += GENX(VERTEX_ELEMENT_STATE_length); 731 } 732 } 733 734 if (needs_sgvs_element) { 735 struct GENX(VERTEX_ELEMENT_STATE) elem_state = { 736 .Valid = true, 737 .Component0Control = VFCOMP_STORE_0, 738 .Component1Control = VFCOMP_STORE_0, 739 .Component2Control = VFCOMP_STORE_0, 740 .Component3Control = VFCOMP_STORE_0, 741#if GFX_VER < 5 742 .DestinationElementOffset = i * 4, 743#endif 744 }; 745 746#if GFX_VER >= 8 747 if (uses_draw_params) { 748 elem_state.VertexBufferIndex = brw->vb.nr_buffers; 749 elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT; 750 elem_state.Component0Control = VFCOMP_STORE_SRC; 751 elem_state.Component1Control = VFCOMP_STORE_SRC; 752 } 753#else 754 elem_state.VertexBufferIndex = brw->vb.nr_buffers; 755 elem_state.SourceElementFormat = ISL_FORMAT_R32G32_UINT; 756 if (uses_draw_params) { 757 elem_state.Component0Control = VFCOMP_STORE_SRC; 758 elem_state.Component1Control = VFCOMP_STORE_SRC; 759 } 760 761 if (vs_prog_data->uses_vertexid) 762 elem_state.Component2Control = VFCOMP_STORE_VID; 763 764 if (vs_prog_data->uses_instanceid) 765 elem_state.Component3Control = VFCOMP_STORE_IID; 766#endif 767 768 GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state); 769 dw += GENX(VERTEX_ELEMENT_STATE_length); 770 } 771 772 if (uses_derived_draw_params) { 773 struct GENX(VERTEX_ELEMENT_STATE) elem_state = { 774 .Valid = true, 775 .VertexBufferIndex = brw->vb.nr_buffers + 1, 776 .SourceElementFormat = ISL_FORMAT_R32G32_UINT, 777 .Component0Control = VFCOMP_STORE_SRC, 778 .Component1Control = VFCOMP_STORE_SRC, 779 .Component2Control = VFCOMP_STORE_0, 780 .Component3Control = VFCOMP_STORE_0, 781#if GFX_VER < 5 782 .DestinationElementOffset = i * 4, 783#endif 784 }; 785 786 GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state); 787 dw += GENX(VERTEX_ELEMENT_STATE_length); 788 } 789 790#if GFX_VER >= 6 791 if (gfx6_edgeflag_input) { 792 const struct gl_vertex_format *glformat = gfx6_edgeflag_input->glformat; 793 const uint32_t format = brw_get_vertex_surface_type(brw, glformat); 794 795 struct GENX(VERTEX_ELEMENT_STATE) elem_state = { 796 .Valid = true, 797 .VertexBufferIndex = gfx6_edgeflag_input->buffer, 798 .EdgeFlagEnable = true, 799 .SourceElementFormat = format, 800 .SourceElementOffset = gfx6_edgeflag_input->offset, 801 .Component0Control = VFCOMP_STORE_SRC, 802 .Component1Control = VFCOMP_STORE_0, 803 .Component2Control = VFCOMP_STORE_0, 804 .Component3Control = VFCOMP_STORE_0, 805 }; 806 807 GENX(VERTEX_ELEMENT_STATE_pack)(brw, dw, &elem_state); 808 dw += GENX(VERTEX_ELEMENT_STATE_length); 809 } 810#endif 811 812#if GFX_VER >= 8 813 for (unsigned i = 0, j = 0; i < brw->vb.nr_enabled; i++) { 814 const struct brw_vertex_element *input = brw->vb.enabled[i]; 815 const struct brw_vertex_buffer *buffer = &brw->vb.buffers[input->buffer]; 816 unsigned element_index; 817 818 /* The edge flag element is reordered to be the last one in the code 819 * above so we need to compensate for that in the element indices used 820 * below. 821 */ 822 if (input == gfx6_edgeflag_input) 823 element_index = nr_elements - 1; 824 else 825 element_index = j++; 826 827 brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) { 828 vfi.VertexElementIndex = element_index; 829 vfi.InstancingEnable = buffer->step_rate != 0; 830 vfi.InstanceDataStepRate = buffer->step_rate; 831 } 832 } 833 834 if (vs_prog_data->uses_drawid) { 835 const unsigned element = brw->vb.nr_enabled + needs_sgvs_element; 836 837 brw_batch_emit(brw, GENX(3DSTATE_VF_INSTANCING), vfi) { 838 vfi.VertexElementIndex = element; 839 } 840 } 841#endif 842} 843 844static const struct brw_tracked_state genX(vertices) = { 845 .dirty = { 846 .mesa = _NEW_POLYGON, 847 .brw = BRW_NEW_BATCH | 848 BRW_NEW_BLORP | 849 BRW_NEW_VERTEX_PROGRAM | 850 BRW_NEW_VERTICES | 851 BRW_NEW_VS_PROG_DATA, 852 }, 853 .emit = genX(emit_vertices), 854}; 855 856static void 857genX(emit_index_buffer)(struct brw_context *brw) 858{ 859 const struct _mesa_index_buffer *index_buffer = brw->ib.ib; 860 861 if (index_buffer == NULL) 862 return; 863 864 vf_invalidate_for_ib_48bit_transition(brw); 865 866 brw_batch_emit(brw, GENX(3DSTATE_INDEX_BUFFER), ib) { 867#if GFX_VERx10 < 75 868 assert(brw->ib.enable_cut_index == brw->prim_restart.enable_cut_index); 869 ib.CutIndexEnable = brw->ib.enable_cut_index; 870#endif 871 ib.IndexFormat = brw_get_index_type(1 << index_buffer->index_size_shift); 872 873 /* The VF cache designers apparently cut corners, and made the cache 874 * only consider the bottom 32 bits of memory addresses. If you happen 875 * to have two index buffers which get placed exactly 4 GiB apart and 876 * use them in back-to-back draw calls, you can get collisions. To work 877 * around this problem, we restrict index buffers to the low 32 bits of 878 * the address space. 879 */ 880 ib.BufferStartingAddress = ro_32_bo(brw->ib.bo, 0); 881#if GFX_VER >= 8 882 ib.MOCS = GFX_VER >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB; 883 ib.BufferSize = brw->ib.size; 884#else 885 ib.BufferEndingAddress = ro_bo(brw->ib.bo, brw->ib.size - 1); 886#endif 887 } 888} 889 890static const struct brw_tracked_state genX(index_buffer) = { 891 .dirty = { 892 .mesa = 0, 893 .brw = BRW_NEW_BATCH | 894 BRW_NEW_BLORP | 895 BRW_NEW_INDEX_BUFFER, 896 }, 897 .emit = genX(emit_index_buffer), 898}; 899 900#if GFX_VERx10 >= 75 901static void 902genX(upload_cut_index)(struct brw_context *brw) 903{ 904 brw_batch_emit(brw, GENX(3DSTATE_VF), vf) { 905 if (brw->prim_restart.enable_cut_index && brw->ib.ib) { 906 vf.IndexedDrawCutIndexEnable = true; 907 vf.CutIndex = brw->prim_restart.restart_index; 908 } 909 } 910} 911 912const struct brw_tracked_state genX(cut_index) = { 913 .dirty = { 914 .mesa = _NEW_TRANSFORM, 915 .brw = BRW_NEW_INDEX_BUFFER, 916 }, 917 .emit = genX(upload_cut_index), 918}; 919#endif 920 921static void 922genX(upload_vf_statistics)(struct brw_context *brw) 923{ 924 brw_batch_emit(brw, GENX(3DSTATE_VF_STATISTICS), vf) { 925 vf.StatisticsEnable = true; 926 } 927} 928 929const struct brw_tracked_state genX(vf_statistics) = { 930 .dirty = { 931 .mesa = 0, 932 .brw = BRW_NEW_BLORP | BRW_NEW_CONTEXT, 933 }, 934 .emit = genX(upload_vf_statistics), 935}; 936 937#if GFX_VER >= 6 938/** 939 * Determine the appropriate attribute override value to store into the 940 * 3DSTATE_SF structure for a given fragment shader attribute. The attribute 941 * override value contains two pieces of information: the location of the 942 * attribute in the VUE (relative to urb_entry_read_offset, see below), and a 943 * flag indicating whether to "swizzle" the attribute based on the direction 944 * the triangle is facing. 945 * 946 * If an attribute is "swizzled", then the given VUE location is used for 947 * front-facing triangles, and the VUE location that immediately follows is 948 * used for back-facing triangles. We use this to implement the mapping from 949 * gl_FrontColor/gl_BackColor to gl_Color. 950 * 951 * urb_entry_read_offset is the offset into the VUE at which the SF unit is 952 * being instructed to begin reading attribute data. It can be set to a 953 * nonzero value to prevent the SF unit from wasting time reading elements of 954 * the VUE that are not needed by the fragment shader. It is measured in 955 * 256-bit increments. 956 */ 957static void 958genX(get_attr_override)(struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr, 959 const struct brw_vue_map *vue_map, 960 int urb_entry_read_offset, int fs_attr, 961 bool two_side_color, uint32_t *max_source_attr) 962{ 963 /* Find the VUE slot for this attribute. */ 964 int slot = vue_map->varying_to_slot[fs_attr]; 965 966 /* Viewport and Layer are stored in the VUE header. We need to override 967 * them to zero if earlier stages didn't write them, as GL requires that 968 * they read back as zero when not explicitly set. 969 */ 970 if (fs_attr == VARYING_SLOT_VIEWPORT || fs_attr == VARYING_SLOT_LAYER) { 971 attr->ComponentOverrideX = true; 972 attr->ComponentOverrideW = true; 973 attr->ConstantSource = CONST_0000; 974 975 if (!(vue_map->slots_valid & VARYING_BIT_LAYER)) 976 attr->ComponentOverrideY = true; 977 if (!(vue_map->slots_valid & VARYING_BIT_VIEWPORT)) 978 attr->ComponentOverrideZ = true; 979 980 return; 981 } 982 983 /* If there was only a back color written but not front, use back 984 * as the color instead of undefined 985 */ 986 if (slot == -1 && fs_attr == VARYING_SLOT_COL0) 987 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC0]; 988 if (slot == -1 && fs_attr == VARYING_SLOT_COL1) 989 slot = vue_map->varying_to_slot[VARYING_SLOT_BFC1]; 990 991 if (slot == -1) { 992 /* This attribute does not exist in the VUE--that means that the vertex 993 * shader did not write to it. This means that either: 994 * 995 * (a) This attribute is a texture coordinate, and it is going to be 996 * replaced with point coordinates (as a consequence of a call to 997 * glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)), so the 998 * hardware will ignore whatever attribute override we supply. 999 * 1000 * (b) This attribute is read by the fragment shader but not written by 1001 * the vertex shader, so its value is undefined. Therefore the 1002 * attribute override we supply doesn't matter. 1003 * 1004 * (c) This attribute is gl_PrimitiveID, and it wasn't written by the 1005 * previous shader stage. 1006 * 1007 * Note that we don't have to worry about the cases where the attribute 1008 * is gl_PointCoord or is undergoing point sprite coordinate 1009 * replacement, because in those cases, this function isn't called. 1010 * 1011 * In case (c), we need to program the attribute overrides so that the 1012 * primitive ID will be stored in this slot. In every other case, the 1013 * attribute override we supply doesn't matter. So just go ahead and 1014 * program primitive ID in every case. 1015 */ 1016 attr->ComponentOverrideW = true; 1017 attr->ComponentOverrideX = true; 1018 attr->ComponentOverrideY = true; 1019 attr->ComponentOverrideZ = true; 1020 attr->ConstantSource = PRIM_ID; 1021 return; 1022 } 1023 1024 /* Compute the location of the attribute relative to urb_entry_read_offset. 1025 * Each increment of urb_entry_read_offset represents a 256-bit value, so 1026 * it counts for two 128-bit VUE slots. 1027 */ 1028 int source_attr = slot - 2 * urb_entry_read_offset; 1029 assert(source_attr >= 0 && source_attr < 32); 1030 1031 /* If we are doing two-sided color, and the VUE slot following this one 1032 * represents a back-facing color, then we need to instruct the SF unit to 1033 * do back-facing swizzling. 1034 */ 1035 bool swizzling = two_side_color && 1036 ((vue_map->slot_to_varying[slot] == VARYING_SLOT_COL0 && 1037 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC0) || 1038 (vue_map->slot_to_varying[slot] == VARYING_SLOT_COL1 && 1039 vue_map->slot_to_varying[slot+1] == VARYING_SLOT_BFC1)); 1040 1041 /* Update max_source_attr. If swizzling, the SF will read this slot + 1. */ 1042 if (*max_source_attr < source_attr + swizzling) 1043 *max_source_attr = source_attr + swizzling; 1044 1045 attr->SourceAttribute = source_attr; 1046 if (swizzling) 1047 attr->SwizzleSelect = INPUTATTR_FACING; 1048} 1049 1050 1051static void 1052genX(calculate_attr_overrides)(const struct brw_context *brw, 1053 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) *attr_overrides, 1054 uint32_t *point_sprite_enables, 1055 uint32_t *urb_entry_read_length, 1056 uint32_t *urb_entry_read_offset) 1057{ 1058 const struct gl_context *ctx = &brw->ctx; 1059 1060 /* _NEW_POINT */ 1061 const struct gl_point_attrib *point = &ctx->Point; 1062 1063 /* BRW_NEW_FRAGMENT_PROGRAM */ 1064 const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT]; 1065 1066 /* BRW_NEW_FS_PROG_DATA */ 1067 const struct brw_wm_prog_data *wm_prog_data = 1068 brw_wm_prog_data(brw->wm.base.prog_data); 1069 uint32_t max_source_attr = 0; 1070 1071 *point_sprite_enables = 0; 1072 1073 int first_slot = 1074 brw_compute_first_urb_slot_required(fp->info.inputs_read, 1075 &brw->vue_map_geom_out); 1076 1077 /* Each URB offset packs two varying slots */ 1078 assert(first_slot % 2 == 0); 1079 *urb_entry_read_offset = first_slot / 2; 1080 1081 /* From the Ivybridge PRM, Vol 2 Part 1, 3DSTATE_SBE, 1082 * description of dw10 Point Sprite Texture Coordinate Enable: 1083 * 1084 * "This field must be programmed to zero when non-point primitives 1085 * are rendered." 1086 * 1087 * The SandyBridge PRM doesn't explicitly say that point sprite enables 1088 * must be programmed to zero when rendering non-point primitives, but 1089 * the IvyBridge PRM does, and if we don't, we get garbage. 1090 * 1091 * This is not required on Haswell, as the hardware ignores this state 1092 * when drawing non-points -- although we do still need to be careful to 1093 * correctly set the attr overrides. 1094 * 1095 * _NEW_POLYGON 1096 * BRW_NEW_PRIMITIVE | BRW_NEW_GS_PROG_DATA | BRW_NEW_TES_PROG_DATA 1097 */ 1098 bool drawing_points = brw_is_drawing_points(brw); 1099 1100 for (uint8_t idx = 0; idx < wm_prog_data->urb_setup_attribs_count; idx++) { 1101 uint8_t attr = wm_prog_data->urb_setup_attribs[idx]; 1102 int input_index = wm_prog_data->urb_setup[attr]; 1103 1104 assert(0 <= input_index); 1105 1106 /* _NEW_POINT */ 1107 bool point_sprite = false; 1108 if (drawing_points) { 1109 if (point->PointSprite && 1110 (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7) && 1111 (point->CoordReplace & (1u << (attr - VARYING_SLOT_TEX0)))) { 1112 point_sprite = true; 1113 } 1114 1115 if (attr == VARYING_SLOT_PNTC) 1116 point_sprite = true; 1117 1118 if (point_sprite) 1119 *point_sprite_enables |= (1 << input_index); 1120 } 1121 1122 /* BRW_NEW_VUE_MAP_GEOM_OUT | _NEW_LIGHT | _NEW_PROGRAM */ 1123 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attribute = { 0 }; 1124 1125 if (!point_sprite) { 1126 genX(get_attr_override)(&attribute, 1127 &brw->vue_map_geom_out, 1128 *urb_entry_read_offset, attr, 1129 _mesa_vertex_program_two_side_enabled(ctx), 1130 &max_source_attr); 1131 } 1132 1133 /* The hardware can only do the overrides on 16 overrides at a 1134 * time, and the other up to 16 have to be lined up so that the 1135 * input index = the output index. We'll need to do some 1136 * tweaking to make sure that's the case. 1137 */ 1138 if (input_index < 16) 1139 attr_overrides[input_index] = attribute; 1140 else 1141 assert(attribute.SourceAttribute == input_index); 1142 } 1143 1144 /* From the Sandy Bridge PRM, Volume 2, Part 1, documentation for 1145 * 3DSTATE_SF DWord 1 bits 15:11, "Vertex URB Entry Read Length": 1146 * 1147 * "This field should be set to the minimum length required to read the 1148 * maximum source attribute. The maximum source attribute is indicated 1149 * by the maximum value of the enabled Attribute # Source Attribute if 1150 * Attribute Swizzle Enable is set, Number of Output Attributes-1 if 1151 * enable is not set. 1152 * read_length = ceiling((max_source_attr + 1) / 2) 1153 * 1154 * [errata] Corruption/Hang possible if length programmed larger than 1155 * recommended" 1156 * 1157 * Similar text exists for Ivy Bridge. 1158 */ 1159 *urb_entry_read_length = DIV_ROUND_UP(max_source_attr + 1, 2); 1160} 1161#endif 1162 1163/* ---------------------------------------------------------------------- */ 1164 1165#if GFX_VER >= 8 1166typedef struct GENX(3DSTATE_WM_DEPTH_STENCIL) DEPTH_STENCIL_GENXML; 1167#elif GFX_VER >= 6 1168typedef struct GENX(DEPTH_STENCIL_STATE) DEPTH_STENCIL_GENXML; 1169#else 1170typedef struct GENX(COLOR_CALC_STATE) DEPTH_STENCIL_GENXML; 1171#endif 1172 1173static inline void 1174set_depth_stencil_bits(struct brw_context *brw, DEPTH_STENCIL_GENXML *ds) 1175{ 1176 struct gl_context *ctx = &brw->ctx; 1177 1178 /* _NEW_BUFFERS */ 1179 struct brw_renderbuffer *depth_irb = 1180 brw_get_renderbuffer(ctx->DrawBuffer, BUFFER_DEPTH); 1181 1182 /* _NEW_DEPTH */ 1183 struct gl_depthbuffer_attrib *depth = &ctx->Depth; 1184 1185 /* _NEW_STENCIL */ 1186 struct gl_stencil_attrib *stencil = &ctx->Stencil; 1187 const int b = stencil->_BackFace; 1188 1189 if (depth->Test && depth_irb) { 1190 ds->DepthTestEnable = true; 1191 ds->DepthBufferWriteEnable = brw_depth_writes_enabled(brw); 1192 ds->DepthTestFunction = brw_translate_compare_func(depth->Func); 1193 } 1194 1195 if (brw->stencil_enabled) { 1196 ds->StencilTestEnable = true; 1197 ds->StencilWriteMask = stencil->WriteMask[0] & 0xff; 1198 ds->StencilTestMask = stencil->ValueMask[0] & 0xff; 1199 1200 ds->StencilTestFunction = 1201 brw_translate_compare_func(stencil->Function[0]); 1202 ds->StencilFailOp = 1203 brw_translate_stencil_op(stencil->FailFunc[0]); 1204 ds->StencilPassDepthPassOp = 1205 brw_translate_stencil_op(stencil->ZPassFunc[0]); 1206 ds->StencilPassDepthFailOp = 1207 brw_translate_stencil_op(stencil->ZFailFunc[0]); 1208 1209 ds->StencilBufferWriteEnable = brw->stencil_write_enabled; 1210 1211 if (brw->stencil_two_sided) { 1212 ds->DoubleSidedStencilEnable = true; 1213 ds->BackfaceStencilWriteMask = stencil->WriteMask[b] & 0xff; 1214 ds->BackfaceStencilTestMask = stencil->ValueMask[b] & 0xff; 1215 1216 ds->BackfaceStencilTestFunction = 1217 brw_translate_compare_func(stencil->Function[b]); 1218 ds->BackfaceStencilFailOp = 1219 brw_translate_stencil_op(stencil->FailFunc[b]); 1220 ds->BackfaceStencilPassDepthPassOp = 1221 brw_translate_stencil_op(stencil->ZPassFunc[b]); 1222 ds->BackfaceStencilPassDepthFailOp = 1223 brw_translate_stencil_op(stencil->ZFailFunc[b]); 1224 } 1225 1226#if GFX_VER <= 5 || GFX_VER >= 9 1227 ds->StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0); 1228 ds->BackfaceStencilReferenceValue = _mesa_get_stencil_ref(ctx, b); 1229#endif 1230 } 1231} 1232 1233#if GFX_VER >= 6 1234static void 1235genX(upload_depth_stencil_state)(struct brw_context *brw) 1236{ 1237#if GFX_VER >= 8 1238 brw_batch_emit(brw, GENX(3DSTATE_WM_DEPTH_STENCIL), wmds) { 1239 set_depth_stencil_bits(brw, &wmds); 1240 } 1241#else 1242 uint32_t ds_offset; 1243 brw_state_emit(brw, GENX(DEPTH_STENCIL_STATE), 64, &ds_offset, ds) { 1244 set_depth_stencil_bits(brw, &ds); 1245 } 1246 1247 /* Now upload a pointer to the indirect state */ 1248#if GFX_VER == 6 1249 brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { 1250 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset; 1251 ptr.DEPTH_STENCIL_STATEChange = true; 1252 } 1253#else 1254 brw_batch_emit(brw, GENX(3DSTATE_DEPTH_STENCIL_STATE_POINTERS), ptr) { 1255 ptr.PointertoDEPTH_STENCIL_STATE = ds_offset; 1256 } 1257#endif 1258#endif 1259} 1260 1261static const struct brw_tracked_state genX(depth_stencil_state) = { 1262 .dirty = { 1263 .mesa = _NEW_BUFFERS | 1264 _NEW_DEPTH | 1265 _NEW_STENCIL, 1266 .brw = BRW_NEW_BLORP | 1267 (GFX_VER >= 8 ? BRW_NEW_CONTEXT 1268 : BRW_NEW_BATCH | 1269 BRW_NEW_STATE_BASE_ADDRESS), 1270 }, 1271 .emit = genX(upload_depth_stencil_state), 1272}; 1273#endif 1274 1275/* ---------------------------------------------------------------------- */ 1276 1277#if GFX_VER <= 5 1278 1279static void 1280genX(upload_clip_state)(struct brw_context *brw) 1281{ 1282 struct gl_context *ctx = &brw->ctx; 1283 1284 ctx->NewDriverState |= BRW_NEW_GFX4_UNIT_STATE; 1285 brw_state_emit(brw, GENX(CLIP_STATE), 32, &brw->clip.state_offset, clip) { 1286 clip.KernelStartPointer = KSP(brw, brw->clip.prog_offset); 1287 clip.GRFRegisterCount = 1288 DIV_ROUND_UP(brw->clip.prog_data->total_grf, 16) - 1; 1289 clip.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 1290 clip.SingleProgramFlow = true; 1291 clip.VertexURBEntryReadLength = brw->clip.prog_data->urb_read_length; 1292 clip.ConstantURBEntryReadLength = brw->clip.prog_data->curb_read_length; 1293 1294 /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */ 1295 clip.ConstantURBEntryReadOffset = brw->curbe.clip_start * 2; 1296 clip.DispatchGRFStartRegisterForURBData = 1; 1297 clip.VertexURBEntryReadOffset = 0; 1298 1299 /* BRW_NEW_URB_FENCE */ 1300 clip.NumberofURBEntries = brw->urb.nr_clip_entries; 1301 clip.URBEntryAllocationSize = brw->urb.vsize - 1; 1302 1303 if (brw->urb.nr_clip_entries >= 10) { 1304 /* Half of the URB entries go to each thread, and it has to be an 1305 * even number. 1306 */ 1307 assert(brw->urb.nr_clip_entries % 2 == 0); 1308 1309 /* Although up to 16 concurrent Clip threads are allowed on Ironlake, 1310 * only 2 threads can output VUEs at a time. 1311 */ 1312 clip.MaximumNumberofThreads = (GFX_VER == 5 ? 16 : 2) - 1; 1313 } else { 1314 assert(brw->urb.nr_clip_entries >= 5); 1315 clip.MaximumNumberofThreads = 1 - 1; 1316 } 1317 1318 clip.VertexPositionSpace = VPOS_NDCSPACE; 1319 clip.UserClipFlagsMustClipEnable = true; 1320 clip.GuardbandClipTestEnable = true; 1321 1322 clip.ClipperViewportStatePointer = 1323 ro_bo(brw->batch.state.bo, brw->clip.vp_offset); 1324 1325 clip.ScreenSpaceViewportXMin = -1; 1326 clip.ScreenSpaceViewportXMax = 1; 1327 clip.ScreenSpaceViewportYMin = -1; 1328 clip.ScreenSpaceViewportYMax = 1; 1329 1330 clip.ViewportXYClipTestEnable = true; 1331 clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear && 1332 ctx->Transform.DepthClampFar); 1333 1334 /* _NEW_TRANSFORM */ 1335 if (GFX_VER == 5 || GFX_VERx10 == 45) { 1336 clip.UserClipDistanceClipTestEnableBitmask = 1337 ctx->Transform.ClipPlanesEnabled; 1338 } else { 1339 /* Up to 6 actual clip flags, plus the 7th for the negative RHW 1340 * workaround. 1341 */ 1342 clip.UserClipDistanceClipTestEnableBitmask = 1343 (ctx->Transform.ClipPlanesEnabled & 0x3f) | 0x40; 1344 } 1345 1346 if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE) 1347 clip.APIMode = APIMODE_D3D; 1348 else 1349 clip.APIMode = APIMODE_OGL; 1350 1351 clip.GuardbandClipTestEnable = true; 1352 1353 clip.ClipMode = brw->clip.prog_data->clip_mode; 1354 1355#if GFX_VERx10 == 45 1356 clip.NegativeWClipTestEnable = true; 1357#endif 1358 } 1359} 1360 1361const struct brw_tracked_state genX(clip_state) = { 1362 .dirty = { 1363 .mesa = _NEW_TRANSFORM | 1364 _NEW_VIEWPORT, 1365 .brw = BRW_NEW_BATCH | 1366 BRW_NEW_BLORP | 1367 BRW_NEW_CLIP_PROG_DATA | 1368 BRW_NEW_PUSH_CONSTANT_ALLOCATION | 1369 BRW_NEW_PROGRAM_CACHE | 1370 BRW_NEW_URB_FENCE, 1371 }, 1372 .emit = genX(upload_clip_state), 1373}; 1374 1375#else 1376 1377static void 1378genX(upload_clip_state)(struct brw_context *brw) 1379{ 1380 struct gl_context *ctx = &brw->ctx; 1381 1382 /* _NEW_BUFFERS */ 1383 struct gl_framebuffer *fb = ctx->DrawBuffer; 1384 1385 /* BRW_NEW_FS_PROG_DATA */ 1386 struct brw_wm_prog_data *wm_prog_data = 1387 brw_wm_prog_data(brw->wm.base.prog_data); 1388 1389 brw_batch_emit(brw, GENX(3DSTATE_CLIP), clip) { 1390 clip.StatisticsEnable = !brw->meta_in_progress; 1391 1392 if (wm_prog_data->barycentric_interp_modes & 1393 BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) 1394 clip.NonPerspectiveBarycentricEnable = true; 1395 1396#if GFX_VER >= 7 1397 clip.EarlyCullEnable = true; 1398#endif 1399 1400#if GFX_VER == 7 1401 clip.FrontWinding = brw->polygon_front_bit != fb->FlipY; 1402 1403 if (ctx->Polygon.CullFlag) { 1404 switch (ctx->Polygon.CullFaceMode) { 1405 case GL_FRONT: 1406 clip.CullMode = CULLMODE_FRONT; 1407 break; 1408 case GL_BACK: 1409 clip.CullMode = CULLMODE_BACK; 1410 break; 1411 case GL_FRONT_AND_BACK: 1412 clip.CullMode = CULLMODE_BOTH; 1413 break; 1414 default: 1415 unreachable("Should not get here: invalid CullFlag"); 1416 } 1417 } else { 1418 clip.CullMode = CULLMODE_NONE; 1419 } 1420#endif 1421 1422#if GFX_VER < 8 1423 clip.UserClipDistanceCullTestEnableBitmask = 1424 brw_vue_prog_data(brw->vs.base.prog_data)->cull_distance_mask; 1425 1426 clip.ViewportZClipTestEnable = !(ctx->Transform.DepthClampNear && 1427 ctx->Transform.DepthClampFar); 1428#endif 1429 1430 /* _NEW_LIGHT */ 1431 if (ctx->Light.ProvokingVertex == GL_FIRST_VERTEX_CONVENTION) { 1432 clip.TriangleStripListProvokingVertexSelect = 0; 1433 clip.TriangleFanProvokingVertexSelect = 1; 1434 clip.LineStripListProvokingVertexSelect = 0; 1435 } else { 1436 clip.TriangleStripListProvokingVertexSelect = 2; 1437 clip.TriangleFanProvokingVertexSelect = 2; 1438 clip.LineStripListProvokingVertexSelect = 1; 1439 } 1440 1441 /* _NEW_TRANSFORM */ 1442 clip.UserClipDistanceClipTestEnableBitmask = 1443 ctx->Transform.ClipPlanesEnabled; 1444 1445#if GFX_VER >= 8 1446 clip.ForceUserClipDistanceClipTestEnableBitmask = true; 1447#endif 1448 1449 if (ctx->Transform.ClipDepthMode == GL_ZERO_TO_ONE) 1450 clip.APIMode = APIMODE_D3D; 1451 else 1452 clip.APIMode = APIMODE_OGL; 1453 1454 clip.GuardbandClipTestEnable = true; 1455 1456 /* BRW_NEW_VIEWPORT_COUNT */ 1457 const unsigned viewport_count = brw->clip.viewport_count; 1458 1459 if (ctx->RasterDiscard) { 1460 clip.ClipMode = CLIPMODE_REJECT_ALL; 1461#if GFX_VER == 6 1462 perf_debug("Rasterizer discard is currently implemented via the " 1463 "clipper; having the GS not write primitives would " 1464 "likely be faster.\n"); 1465#endif 1466 } else { 1467 clip.ClipMode = CLIPMODE_NORMAL; 1468 } 1469 1470 clip.ClipEnable = true; 1471 1472 /* _NEW_POLYGON, 1473 * BRW_NEW_GEOMETRY_PROGRAM | BRW_NEW_TES_PROG_DATA | BRW_NEW_PRIMITIVE 1474 */ 1475 if (!brw_is_drawing_points(brw) && !brw_is_drawing_lines(brw)) 1476 clip.ViewportXYClipTestEnable = true; 1477 1478 clip.MinimumPointWidth = 0.125; 1479 clip.MaximumPointWidth = 255.875; 1480 clip.MaximumVPIndex = viewport_count - 1; 1481 if (_mesa_geometric_layers(fb) == 0) 1482 clip.ForceZeroRTAIndexEnable = true; 1483 } 1484} 1485 1486static const struct brw_tracked_state genX(clip_state) = { 1487 .dirty = { 1488 .mesa = _NEW_BUFFERS | 1489 _NEW_LIGHT | 1490 _NEW_POLYGON | 1491 _NEW_TRANSFORM, 1492 .brw = BRW_NEW_BLORP | 1493 BRW_NEW_CONTEXT | 1494 BRW_NEW_FS_PROG_DATA | 1495 BRW_NEW_GS_PROG_DATA | 1496 BRW_NEW_VS_PROG_DATA | 1497 BRW_NEW_META_IN_PROGRESS | 1498 BRW_NEW_PRIMITIVE | 1499 BRW_NEW_RASTERIZER_DISCARD | 1500 BRW_NEW_TES_PROG_DATA | 1501 BRW_NEW_VIEWPORT_COUNT, 1502 }, 1503 .emit = genX(upload_clip_state), 1504}; 1505#endif 1506 1507/* ---------------------------------------------------------------------- */ 1508 1509static void 1510genX(upload_sf)(struct brw_context *brw) 1511{ 1512 struct gl_context *ctx = &brw->ctx; 1513 float point_size; 1514 1515#if GFX_VER <= 7 1516 /* _NEW_BUFFERS */ 1517 bool flip_y = ctx->DrawBuffer->FlipY; 1518 UNUSED const bool multisampled_fbo = 1519 _mesa_geometric_samples(ctx->DrawBuffer) > 1; 1520#endif 1521 1522#if GFX_VER < 6 1523 const struct brw_sf_prog_data *sf_prog_data = brw->sf.prog_data; 1524 1525 ctx->NewDriverState |= BRW_NEW_GFX4_UNIT_STATE; 1526 1527 brw_state_emit(brw, GENX(SF_STATE), 64, &brw->sf.state_offset, sf) { 1528 sf.KernelStartPointer = KSP(brw, brw->sf.prog_offset); 1529 sf.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 1530 sf.GRFRegisterCount = DIV_ROUND_UP(sf_prog_data->total_grf, 16) - 1; 1531 sf.DispatchGRFStartRegisterForURBData = 3; 1532 sf.VertexURBEntryReadOffset = BRW_SF_URB_ENTRY_READ_OFFSET; 1533 sf.VertexURBEntryReadLength = sf_prog_data->urb_read_length; 1534 sf.NumberofURBEntries = brw->urb.nr_sf_entries; 1535 sf.URBEntryAllocationSize = brw->urb.sfsize - 1; 1536 1537 /* STATE_PREFETCH command description describes this state as being 1538 * something loaded through the GPE (L2 ISC), so it's INSTRUCTION 1539 * domain. 1540 */ 1541 sf.SetupViewportStateOffset = 1542 ro_bo(brw->batch.state.bo, brw->sf.vp_offset); 1543 1544 sf.PointRasterizationRule = RASTRULE_UPPER_RIGHT; 1545 1546 /* sf.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; */ 1547 /* sf.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; */ 1548 1549 sf.MaximumNumberofThreads = 1550 MIN2(GFX_VER == 5 ? 48 : 24, brw->urb.nr_sf_entries) - 1; 1551 1552 sf.SpritePointEnable = ctx->Point.PointSprite; 1553 1554 sf.DestinationOriginHorizontalBias = 0.5; 1555 sf.DestinationOriginVerticalBias = 0.5; 1556#else 1557 brw_batch_emit(brw, GENX(3DSTATE_SF), sf) { 1558 sf.StatisticsEnable = true; 1559#endif 1560 sf.ViewportTransformEnable = true; 1561 1562#if GFX_VER == 7 1563 /* _NEW_BUFFERS */ 1564 sf.DepthBufferSurfaceFormat = brw_depthbuffer_format(brw); 1565#endif 1566 1567#if GFX_VER <= 7 1568 /* _NEW_POLYGON */ 1569 sf.FrontWinding = brw->polygon_front_bit != flip_y; 1570#if GFX_VER >= 6 1571 sf.GlobalDepthOffsetEnableSolid = ctx->Polygon.OffsetFill; 1572 sf.GlobalDepthOffsetEnableWireframe = ctx->Polygon.OffsetLine; 1573 sf.GlobalDepthOffsetEnablePoint = ctx->Polygon.OffsetPoint; 1574 1575 switch (ctx->Polygon.FrontMode) { 1576 case GL_FILL: 1577 sf.FrontFaceFillMode = FILL_MODE_SOLID; 1578 break; 1579 case GL_LINE: 1580 sf.FrontFaceFillMode = FILL_MODE_WIREFRAME; 1581 break; 1582 case GL_POINT: 1583 sf.FrontFaceFillMode = FILL_MODE_POINT; 1584 break; 1585 default: 1586 unreachable("not reached"); 1587 } 1588 1589 switch (ctx->Polygon.BackMode) { 1590 case GL_FILL: 1591 sf.BackFaceFillMode = FILL_MODE_SOLID; 1592 break; 1593 case GL_LINE: 1594 sf.BackFaceFillMode = FILL_MODE_WIREFRAME; 1595 break; 1596 case GL_POINT: 1597 sf.BackFaceFillMode = FILL_MODE_POINT; 1598 break; 1599 default: 1600 unreachable("not reached"); 1601 } 1602 1603 if (multisampled_fbo && ctx->Multisample.Enabled) 1604 sf.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN; 1605 1606 sf.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2; 1607 sf.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor; 1608 sf.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp; 1609#endif 1610 1611 sf.ScissorRectangleEnable = true; 1612 1613 if (ctx->Polygon.CullFlag) { 1614 switch (ctx->Polygon.CullFaceMode) { 1615 case GL_FRONT: 1616 sf.CullMode = CULLMODE_FRONT; 1617 break; 1618 case GL_BACK: 1619 sf.CullMode = CULLMODE_BACK; 1620 break; 1621 case GL_FRONT_AND_BACK: 1622 sf.CullMode = CULLMODE_BOTH; 1623 break; 1624 default: 1625 unreachable("not reached"); 1626 } 1627 } else { 1628 sf.CullMode = CULLMODE_NONE; 1629 } 1630 1631#if GFX_VERx10 == 75 1632 sf.LineStippleEnable = ctx->Line.StippleFlag; 1633#endif 1634 1635#endif 1636 1637 /* _NEW_LINE */ 1638#if GFX_VER == 8 1639 const struct intel_device_info *devinfo = &brw->screen->devinfo; 1640 1641 if (devinfo->is_cherryview) 1642 sf.CHVLineWidth = brw_get_line_width(brw); 1643 else 1644 sf.LineWidth = brw_get_line_width(brw); 1645#else 1646 sf.LineWidth = brw_get_line_width(brw); 1647#endif 1648 1649 if (ctx->Line.SmoothFlag) { 1650 sf.LineEndCapAntialiasingRegionWidth = _10pixels; 1651#if GFX_VER <= 7 1652 sf.AntialiasingEnable = true; 1653#endif 1654 } 1655 1656 /* _NEW_POINT - Clamp to ARB_point_parameters user limits */ 1657 point_size = CLAMP(ctx->Point.Size, ctx->Point.MinSize, ctx->Point.MaxSize); 1658 /* Clamp to the hardware limits */ 1659 sf.PointWidth = CLAMP(point_size, 0.125f, 255.875f); 1660 1661 /* _NEW_PROGRAM | _NEW_POINT, BRW_NEW_VUE_MAP_GEOM_OUT */ 1662 if (use_state_point_size(brw)) 1663 sf.PointWidthSource = State; 1664 1665#if GFX_VER >= 8 1666 /* _NEW_POINT | _NEW_MULTISAMPLE */ 1667 if ((ctx->Point.SmoothFlag || _mesa_is_multisample_enabled(ctx)) && 1668 !ctx->Point.PointSprite) 1669 sf.SmoothPointEnable = true; 1670#endif 1671 1672#if GFX_VER == 10 1673 /* _NEW_BUFFERS 1674 * Smooth Point Enable bit MUST not be set when NUM_MULTISAMPLES > 1. 1675 */ 1676 const bool multisampled_fbo = 1677 _mesa_geometric_samples(ctx->DrawBuffer) > 1; 1678 if (multisampled_fbo) 1679 sf.SmoothPointEnable = false; 1680#endif 1681 1682#if GFX_VERx10 >= 45 1683 sf.AALineDistanceMode = AALINEDISTANCE_TRUE; 1684#endif 1685 1686 /* _NEW_LIGHT */ 1687 if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) { 1688 sf.TriangleStripListProvokingVertexSelect = 2; 1689 sf.TriangleFanProvokingVertexSelect = 2; 1690 sf.LineStripListProvokingVertexSelect = 1; 1691 } else { 1692 sf.TriangleFanProvokingVertexSelect = 1; 1693 } 1694 1695#if GFX_VER == 6 1696 /* BRW_NEW_FS_PROG_DATA */ 1697 const struct brw_wm_prog_data *wm_prog_data = 1698 brw_wm_prog_data(brw->wm.base.prog_data); 1699 1700 sf.AttributeSwizzleEnable = true; 1701 sf.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; 1702 1703 /* 1704 * Window coordinates in an FBO are inverted, which means point 1705 * sprite origin must be inverted, too. 1706 */ 1707 if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) { 1708 sf.PointSpriteTextureCoordinateOrigin = LOWERLEFT; 1709 } else { 1710 sf.PointSpriteTextureCoordinateOrigin = UPPERLEFT; 1711 } 1712 1713 /* BRW_NEW_VUE_MAP_GEOM_OUT | BRW_NEW_FRAGMENT_PROGRAM | 1714 * _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM | BRW_NEW_FS_PROG_DATA 1715 */ 1716 uint32_t urb_entry_read_length; 1717 uint32_t urb_entry_read_offset; 1718 uint32_t point_sprite_enables; 1719 genX(calculate_attr_overrides)(brw, sf.Attribute, &point_sprite_enables, 1720 &urb_entry_read_length, 1721 &urb_entry_read_offset); 1722 sf.VertexURBEntryReadLength = urb_entry_read_length; 1723 sf.VertexURBEntryReadOffset = urb_entry_read_offset; 1724 sf.PointSpriteTextureCoordinateEnable = point_sprite_enables; 1725 sf.ConstantInterpolationEnable = wm_prog_data->flat_inputs; 1726#endif 1727 } 1728} 1729 1730static const struct brw_tracked_state genX(sf_state) = { 1731 .dirty = { 1732 .mesa = _NEW_LIGHT | 1733 _NEW_LINE | 1734 _NEW_POINT | 1735 _NEW_PROGRAM | 1736 (GFX_VER >= 6 ? _NEW_MULTISAMPLE : 0) | 1737 (GFX_VER <= 7 ? _NEW_BUFFERS | _NEW_POLYGON : 0) | 1738 (GFX_VER == 10 ? _NEW_BUFFERS : 0), 1739 .brw = BRW_NEW_BLORP | 1740 BRW_NEW_VUE_MAP_GEOM_OUT | 1741 (GFX_VER <= 5 ? BRW_NEW_BATCH | 1742 BRW_NEW_PROGRAM_CACHE | 1743 BRW_NEW_SF_PROG_DATA | 1744 BRW_NEW_SF_VP | 1745 BRW_NEW_URB_FENCE 1746 : 0) | 1747 (GFX_VER >= 6 ? BRW_NEW_CONTEXT : 0) | 1748 (GFX_VER >= 6 && GFX_VER <= 7 ? 1749 BRW_NEW_GS_PROG_DATA | 1750 BRW_NEW_PRIMITIVE | 1751 BRW_NEW_TES_PROG_DATA 1752 : 0) | 1753 (GFX_VER == 6 ? BRW_NEW_FS_PROG_DATA | 1754 BRW_NEW_FRAGMENT_PROGRAM 1755 : 0), 1756 }, 1757 .emit = genX(upload_sf), 1758}; 1759 1760/* ---------------------------------------------------------------------- */ 1761 1762static bool 1763brw_color_buffer_write_enabled(struct brw_context *brw) 1764{ 1765 struct gl_context *ctx = &brw->ctx; 1766 /* BRW_NEW_FRAGMENT_PROGRAM */ 1767 const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT]; 1768 unsigned i; 1769 1770 /* _NEW_BUFFERS */ 1771 for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) { 1772 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i]; 1773 uint64_t outputs_written = fp->info.outputs_written; 1774 1775 /* _NEW_COLOR */ 1776 if (rb && (outputs_written & BITFIELD64_BIT(FRAG_RESULT_COLOR) || 1777 outputs_written & BITFIELD64_BIT(FRAG_RESULT_DATA0 + i)) && 1778 GET_COLORMASK(ctx->Color.ColorMask, i)) { 1779 return true; 1780 } 1781 } 1782 1783 return false; 1784} 1785 1786static void 1787genX(upload_wm)(struct brw_context *brw) 1788{ 1789 struct gl_context *ctx = &brw->ctx; 1790 1791 /* BRW_NEW_FS_PROG_DATA */ 1792 const struct brw_wm_prog_data *wm_prog_data = 1793 brw_wm_prog_data(brw->wm.base.prog_data); 1794 1795 UNUSED bool writes_depth = 1796 wm_prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF; 1797 UNUSED struct brw_stage_state *stage_state = &brw->wm.base; 1798 UNUSED const struct intel_device_info *devinfo = &brw->screen->devinfo; 1799 1800#if GFX_VER == 6 1801 /* We can't fold this into gfx6_upload_wm_push_constants(), because 1802 * according to the SNB PRM, vol 2 part 1 section 7.2.2 1803 * (3DSTATE_CONSTANT_PS [DevSNB]): 1804 * 1805 * "[DevSNB]: This packet must be followed by WM_STATE." 1806 */ 1807 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_PS), wmcp) { 1808 if (wm_prog_data->base.nr_params != 0) { 1809 wmcp.Buffer0Valid = true; 1810 /* Pointer to the WM constant buffer. Covered by the set of 1811 * state flags from gfx6_upload_wm_push_constants. 1812 */ 1813 wmcp.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset; 1814 wmcp.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1; 1815 } 1816 } 1817#endif 1818 1819#if GFX_VER >= 6 1820 brw_batch_emit(brw, GENX(3DSTATE_WM), wm) { 1821#else 1822 ctx->NewDriverState |= BRW_NEW_GFX4_UNIT_STATE; 1823 brw_state_emit(brw, GENX(WM_STATE), 64, &stage_state->state_offset, wm) { 1824#endif 1825 1826#if GFX_VER <= 6 1827 wm._8PixelDispatchEnable = wm_prog_data->dispatch_8; 1828 wm._16PixelDispatchEnable = wm_prog_data->dispatch_16; 1829 wm._32PixelDispatchEnable = wm_prog_data->dispatch_32; 1830#endif 1831 1832#if GFX_VER == 4 1833 /* On gfx4, we only have one shader kernel */ 1834 if (brw_wm_state_has_ksp(wm, 0)) { 1835 assert(brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0) == 0); 1836 wm.KernelStartPointer0 = KSP(brw, stage_state->prog_offset); 1837 wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0); 1838 wm.DispatchGRFStartRegisterForConstantSetupData0 = 1839 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0); 1840 } 1841#elif GFX_VER == 5 1842 /* On gfx5, we have multiple shader kernels but only one GRF start 1843 * register for all kernels 1844 */ 1845 wm.KernelStartPointer0 = stage_state->prog_offset + 1846 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0); 1847 wm.KernelStartPointer1 = stage_state->prog_offset + 1848 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1); 1849 wm.KernelStartPointer2 = stage_state->prog_offset + 1850 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2); 1851 1852 wm.GRFRegisterCount0 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 0); 1853 wm.GRFRegisterCount1 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 1); 1854 wm.GRFRegisterCount2 = brw_wm_prog_data_reg_blocks(wm_prog_data, wm, 2); 1855 1856 wm.DispatchGRFStartRegisterForConstantSetupData0 = 1857 wm_prog_data->base.dispatch_grf_start_reg; 1858 1859 /* Dispatch GRF Start should be the same for all shaders on gfx5 */ 1860 if (brw_wm_state_has_ksp(wm, 1)) { 1861 assert(wm_prog_data->base.dispatch_grf_start_reg == 1862 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1)); 1863 } 1864 if (brw_wm_state_has_ksp(wm, 2)) { 1865 assert(wm_prog_data->base.dispatch_grf_start_reg == 1866 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2)); 1867 } 1868#elif GFX_VER == 6 1869 /* On gfx6, we have multiple shader kernels and we no longer specify a 1870 * register count for each one. 1871 */ 1872 wm.KernelStartPointer0 = stage_state->prog_offset + 1873 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 0); 1874 wm.KernelStartPointer1 = stage_state->prog_offset + 1875 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 1); 1876 wm.KernelStartPointer2 = stage_state->prog_offset + 1877 brw_wm_prog_data_prog_offset(wm_prog_data, wm, 2); 1878 1879 wm.DispatchGRFStartRegisterForConstantSetupData0 = 1880 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 0); 1881 wm.DispatchGRFStartRegisterForConstantSetupData1 = 1882 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 1); 1883 wm.DispatchGRFStartRegisterForConstantSetupData2 = 1884 brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, wm, 2); 1885#endif 1886 1887#if GFX_VER <= 5 1888 wm.ConstantURBEntryReadLength = wm_prog_data->base.curb_read_length; 1889 /* BRW_NEW_PUSH_CONSTANT_ALLOCATION */ 1890 wm.ConstantURBEntryReadOffset = brw->curbe.wm_start * 2; 1891 wm.SetupURBEntryReadLength = wm_prog_data->num_varying_inputs * 2; 1892 wm.SetupURBEntryReadOffset = 0; 1893 wm.EarlyDepthTestEnable = true; 1894#endif 1895 1896#if GFX_VER >= 6 1897 wm.LineAntialiasingRegionWidth = _10pixels; 1898 wm.LineEndCapAntialiasingRegionWidth = _05pixels; 1899 1900 wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT; 1901 wm.BarycentricInterpolationMode = wm_prog_data->barycentric_interp_modes; 1902#else 1903 if (stage_state->sampler_count) 1904 wm.SamplerStatePointer = 1905 ro_bo(brw->batch.state.bo, stage_state->sampler_offset); 1906 1907 wm.LineAntialiasingRegionWidth = _05pixels; 1908 wm.LineEndCapAntialiasingRegionWidth = _10pixels; 1909 1910 /* _NEW_POLYGON */ 1911 if (ctx->Polygon.OffsetFill) { 1912 wm.GlobalDepthOffsetEnable = true; 1913 /* Something weird going on with legacy_global_depth_bias, 1914 * offset_constant, scaling and MRD. This value passes glean 1915 * but gives some odd results elsewere (eg. the 1916 * quad-offset-units test). 1917 */ 1918 wm.GlobalDepthOffsetConstant = ctx->Polygon.OffsetUnits * 2; 1919 1920 /* This is the only value that passes glean: 1921 */ 1922 wm.GlobalDepthOffsetScale = ctx->Polygon.OffsetFactor; 1923 } 1924 1925 wm.DepthCoefficientURBReadOffset = 1; 1926#endif 1927 1928 /* BRW_NEW_STATS_WM */ 1929 wm.StatisticsEnable = GFX_VER >= 6 || brw->stats_wm; 1930 1931#if GFX_VER < 7 1932 if (wm_prog_data->base.use_alt_mode) 1933 wm.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 1934 1935 wm.SamplerCount = GFX_VER == 5 ? 1936 0 : DIV_ROUND_UP(stage_state->sampler_count, 4); 1937 1938 wm.BindingTableEntryCount = 1939 wm_prog_data->base.binding_table.size_bytes / 4; 1940 wm.MaximumNumberofThreads = devinfo->max_wm_threads - 1; 1941 1942#if GFX_VER == 6 1943 wm.DualSourceBlendEnable = 1944 wm_prog_data->dual_src_blend && (ctx->Color.BlendEnabled & 1) && 1945 ctx->Color._BlendUsesDualSrc & 0x1; 1946 wm.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; 1947 wm.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; 1948 1949 /* From the SNB PRM, volume 2 part 1, page 281: 1950 * "If the PS kernel does not need the Position XY Offsets 1951 * to compute a Position XY value, then this field should be 1952 * programmed to POSOFFSET_NONE." 1953 * 1954 * "SW Recommendation: If the PS kernel needs the Position Offsets 1955 * to compute a Position XY value, this field should match Position 1956 * ZW Interpolation Mode to ensure a consistent position.xyzw 1957 * computation." 1958 * We only require XY sample offsets. So, this recommendation doesn't 1959 * look useful at the moment. We might need this in future. 1960 */ 1961 if (wm_prog_data->uses_pos_offset) 1962 wm.PositionXYOffsetSelect = POSOFFSET_SAMPLE; 1963 else 1964 wm.PositionXYOffsetSelect = POSOFFSET_NONE; 1965#endif 1966 1967 if (wm_prog_data->base.total_scratch) { 1968 wm.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); 1969 wm.PerThreadScratchSpace = 1970 ffs(stage_state->per_thread_scratch) - 11; 1971 } 1972 1973 wm.PixelShaderComputedDepth = writes_depth; 1974#endif 1975 1976 /* _NEW_LINE */ 1977 wm.LineStippleEnable = ctx->Line.StippleFlag; 1978 1979 /* _NEW_POLYGON */ 1980 wm.PolygonStippleEnable = ctx->Polygon.StippleFlag; 1981 1982#if GFX_VER < 8 1983 1984#if GFX_VER >= 6 1985 wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w; 1986 1987 /* _NEW_BUFFERS */ 1988 const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1; 1989 1990 if (multisampled_fbo) { 1991 /* _NEW_MULTISAMPLE */ 1992 if (ctx->Multisample.Enabled) 1993 wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN; 1994 else 1995 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL; 1996 1997 if (wm_prog_data->persample_dispatch) 1998 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; 1999 else 2000 wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL; 2001 } else { 2002 wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL; 2003 wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE; 2004 } 2005#endif 2006 wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth; 2007 if (wm_prog_data->uses_kill || 2008 _mesa_is_alpha_test_enabled(ctx) || 2009 _mesa_is_alpha_to_coverage_enabled(ctx) || 2010 (GFX_VER >= 6 && wm_prog_data->uses_omask)) { 2011 wm.PixelShaderKillsPixel = true; 2012 } 2013 2014 /* _NEW_BUFFERS | _NEW_COLOR */ 2015 if (brw_color_buffer_write_enabled(brw) || writes_depth || 2016 wm.PixelShaderKillsPixel || 2017 (GFX_VER >= 6 && wm_prog_data->has_side_effects)) { 2018 wm.ThreadDispatchEnable = true; 2019 } 2020 2021#if GFX_VER >= 7 2022 wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode; 2023 wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask; 2024#endif 2025 2026 /* The "UAV access enable" bits are unnecessary on HSW because they only 2027 * seem to have an effect on the HW-assisted coherency mechanism which we 2028 * don't need, and the rasterization-related UAV_ONLY flag and the 2029 * DISPATCH_ENABLE bit can be set independently from it. 2030 * C.f. gfx8_upload_ps_extra(). 2031 * 2032 * BRW_NEW_FRAGMENT_PROGRAM | BRW_NEW_FS_PROG_DATA | _NEW_BUFFERS | 2033 * _NEW_COLOR 2034 */ 2035#if GFX_VERx10 == 75 2036 if (!(brw_color_buffer_write_enabled(brw) || writes_depth) && 2037 wm_prog_data->has_side_effects) 2038 wm.PSUAVonly = ON; 2039#endif 2040#endif 2041 2042#if GFX_VER >= 7 2043 /* BRW_NEW_FS_PROG_DATA */ 2044 if (wm_prog_data->early_fragment_tests) 2045 wm.EarlyDepthStencilControl = EDSC_PREPS; 2046 else if (wm_prog_data->has_side_effects) 2047 wm.EarlyDepthStencilControl = EDSC_PSEXEC; 2048#endif 2049 } 2050 2051#if GFX_VER <= 5 2052 if (brw->wm.offset_clamp != ctx->Polygon.OffsetClamp) { 2053 brw_batch_emit(brw, GENX(3DSTATE_GLOBAL_DEPTH_OFFSET_CLAMP), clamp) { 2054 clamp.GlobalDepthOffsetClamp = ctx->Polygon.OffsetClamp; 2055 } 2056 2057 brw->wm.offset_clamp = ctx->Polygon.OffsetClamp; 2058 } 2059#endif 2060} 2061 2062static const struct brw_tracked_state genX(wm_state) = { 2063 .dirty = { 2064 .mesa = _NEW_LINE | 2065 _NEW_POLYGON | 2066 (GFX_VER < 8 ? _NEW_BUFFERS | 2067 _NEW_COLOR : 2068 0) | 2069 (GFX_VER == 6 ? _NEW_PROGRAM_CONSTANTS : 0) | 2070 (GFX_VER < 6 ? _NEW_POLYGONSTIPPLE : 0) | 2071 (GFX_VER < 8 && GFX_VER >= 6 ? _NEW_MULTISAMPLE : 0), 2072 .brw = BRW_NEW_BLORP | 2073 BRW_NEW_FS_PROG_DATA | 2074 (GFX_VER < 6 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION | 2075 BRW_NEW_FRAGMENT_PROGRAM | 2076 BRW_NEW_PROGRAM_CACHE | 2077 BRW_NEW_SAMPLER_STATE_TABLE | 2078 BRW_NEW_STATS_WM 2079 : 0) | 2080 (GFX_VER < 7 ? BRW_NEW_BATCH : BRW_NEW_CONTEXT), 2081 }, 2082 .emit = genX(upload_wm), 2083}; 2084 2085/* ---------------------------------------------------------------------- */ 2086 2087/* We restrict scratch buffers to the bottom 32 bits of the address space 2088 * by using rw_32_bo(). 2089 * 2090 * General State Base Address is a bit broken. If the address + size as 2091 * seen by STATE_BASE_ADDRESS overflows 48 bits, the GPU appears to treat 2092 * all accesses to the buffer as being out of bounds and returns zero. 2093 */ 2094 2095#define INIT_THREAD_DISPATCH_FIELDS(pkt, prefix) \ 2096 pkt.KernelStartPointer = KSP(brw, stage_state->prog_offset); \ 2097 /* Wa_1606682166 */ \ 2098 pkt.SamplerCount = \ 2099 GFX_VER == 11 ? \ 2100 0 : \ 2101 DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); \ 2102 pkt.BindingTableEntryCount = \ 2103 stage_prog_data->binding_table.size_bytes / 4; \ 2104 pkt.FloatingPointMode = stage_prog_data->use_alt_mode; \ 2105 \ 2106 if (stage_prog_data->total_scratch) { \ 2107 pkt.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); \ 2108 pkt.PerThreadScratchSpace = \ 2109 ffs(stage_state->per_thread_scratch) - 11; \ 2110 } \ 2111 \ 2112 pkt.DispatchGRFStartRegisterForURBData = \ 2113 stage_prog_data->dispatch_grf_start_reg; \ 2114 pkt.prefix##URBEntryReadLength = vue_prog_data->urb_read_length; \ 2115 pkt.prefix##URBEntryReadOffset = 0; \ 2116 \ 2117 pkt.StatisticsEnable = true; \ 2118 pkt.Enable = true; 2119 2120static void 2121genX(upload_vs_state)(struct brw_context *brw) 2122{ 2123 UNUSED struct gl_context *ctx = &brw->ctx; 2124 const struct intel_device_info *devinfo = &brw->screen->devinfo; 2125 struct brw_stage_state *stage_state = &brw->vs.base; 2126 2127 /* BRW_NEW_VS_PROG_DATA */ 2128 const struct brw_vue_prog_data *vue_prog_data = 2129 brw_vue_prog_data(brw->vs.base.prog_data); 2130 const struct brw_stage_prog_data *stage_prog_data = &vue_prog_data->base; 2131 2132 assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 || 2133 vue_prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT); 2134 assert(GFX_VER < 11 || 2135 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8); 2136 2137#if GFX_VER == 6 2138 /* From the BSpec, 3D Pipeline > Geometry > Vertex Shader > State, 2139 * 3DSTATE_VS, Dword 5.0 "VS Function Enable": 2140 * 2141 * [DevSNB] A pipeline flush must be programmed prior to a 3DSTATE_VS 2142 * command that causes the VS Function Enable to toggle. Pipeline 2143 * flush can be executed by sending a PIPE_CONTROL command with CS 2144 * stall bit set and a post sync operation. 2145 * 2146 * We've already done such a flush at the start of state upload, so we 2147 * don't need to do another one here. 2148 */ 2149 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), cvs) { 2150 if (stage_state->push_const_size != 0) { 2151 cvs.Buffer0Valid = true; 2152 cvs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset; 2153 cvs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1; 2154 } 2155 } 2156#endif 2157 2158 if (GFX_VER == 7 && devinfo->is_ivybridge) 2159 gfx7_emit_vs_workaround_flush(brw); 2160 2161#if GFX_VER >= 6 2162 brw_batch_emit(brw, GENX(3DSTATE_VS), vs) { 2163#else 2164 ctx->NewDriverState |= BRW_NEW_GFX4_UNIT_STATE; 2165 brw_state_emit(brw, GENX(VS_STATE), 32, &stage_state->state_offset, vs) { 2166#endif 2167 INIT_THREAD_DISPATCH_FIELDS(vs, Vertex); 2168 2169 vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1; 2170 2171#if GFX_VER < 6 2172 vs.GRFRegisterCount = DIV_ROUND_UP(vue_prog_data->total_grf, 16) - 1; 2173 vs.ConstantURBEntryReadLength = stage_prog_data->curb_read_length; 2174 vs.ConstantURBEntryReadOffset = brw->curbe.vs_start * 2; 2175 2176 vs.NumberofURBEntries = brw->urb.nr_vs_entries >> (GFX_VER == 5 ? 2 : 0); 2177 vs.URBEntryAllocationSize = brw->urb.vsize - 1; 2178 2179 vs.MaximumNumberofThreads = 2180 CLAMP(brw->urb.nr_vs_entries / 2, 1, devinfo->max_vs_threads) - 1; 2181 2182 vs.StatisticsEnable = false; 2183 vs.SamplerStatePointer = 2184 ro_bo(brw->batch.state.bo, stage_state->sampler_offset); 2185#endif 2186 2187#if GFX_VER == 5 2188 /* Force single program flow on Ironlake. We cannot reliably get 2189 * all applications working without it. See: 2190 * https://bugs.freedesktop.org/show_bug.cgi?id=29172 2191 * 2192 * The most notable and reliably failing application is the Humus 2193 * demo "CelShading" 2194 */ 2195 vs.SingleProgramFlow = true; 2196 vs.SamplerCount = 0; /* hardware requirement */ 2197#endif 2198 2199#if GFX_VER >= 8 2200 vs.SIMD8DispatchEnable = 2201 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8; 2202 2203 vs.UserClipDistanceCullTestEnableBitmask = 2204 vue_prog_data->cull_distance_mask; 2205#endif 2206 } 2207 2208#if GFX_VER == 6 2209 /* Based on my reading of the simulator, the VS constants don't get 2210 * pulled into the VS FF unit until an appropriate pipeline flush 2211 * happens, and instead the 3DSTATE_CONSTANT_VS packet just adds 2212 * references to them into a little FIFO. The flushes are common, 2213 * but don't reliably happen between this and a 3DPRIMITIVE, causing 2214 * the primitive to use the wrong constants. Then the FIFO 2215 * containing the constant setup gets added to again on the next 2216 * constants change, and eventually when a flush does happen the 2217 * unit is overwhelmed by constant changes and dies. 2218 * 2219 * To avoid this, send a PIPE_CONTROL down the line that will 2220 * update the unit immediately loading the constants. The flush 2221 * type bits here were those set by the STATE_BASE_ADDRESS whose 2222 * move in a82a43e8d99e1715dd11c9c091b5ab734079b6a6 triggered the 2223 * bug reports that led to this workaround, and may be more than 2224 * what is strictly required to avoid the issue. 2225 */ 2226 brw_emit_pipe_control_flush(brw, 2227 PIPE_CONTROL_DEPTH_STALL | 2228 PIPE_CONTROL_INSTRUCTION_INVALIDATE | 2229 PIPE_CONTROL_STATE_CACHE_INVALIDATE); 2230#endif 2231} 2232 2233static const struct brw_tracked_state genX(vs_state) = { 2234 .dirty = { 2235 .mesa = (GFX_VER == 6 ? (_NEW_PROGRAM_CONSTANTS | _NEW_TRANSFORM) : 0), 2236 .brw = BRW_NEW_BATCH | 2237 BRW_NEW_BLORP | 2238 BRW_NEW_CONTEXT | 2239 BRW_NEW_VS_PROG_DATA | 2240 (GFX_VER == 6 ? BRW_NEW_VERTEX_PROGRAM : 0) | 2241 (GFX_VER <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION | 2242 BRW_NEW_PROGRAM_CACHE | 2243 BRW_NEW_SAMPLER_STATE_TABLE | 2244 BRW_NEW_URB_FENCE 2245 : 0), 2246 }, 2247 .emit = genX(upload_vs_state), 2248}; 2249 2250/* ---------------------------------------------------------------------- */ 2251 2252static void 2253genX(upload_cc_viewport)(struct brw_context *brw) 2254{ 2255 struct gl_context *ctx = &brw->ctx; 2256 2257 /* BRW_NEW_VIEWPORT_COUNT */ 2258 const unsigned viewport_count = brw->clip.viewport_count; 2259 2260 struct GENX(CC_VIEWPORT) ccv; 2261 uint32_t cc_vp_offset; 2262 uint32_t *cc_map = 2263 brw_state_batch(brw, 4 * GENX(CC_VIEWPORT_length) * viewport_count, 2264 32, &cc_vp_offset); 2265 2266 for (unsigned i = 0; i < viewport_count; i++) { 2267 /* _NEW_VIEWPORT | _NEW_TRANSFORM */ 2268 const struct gl_viewport_attrib *vp = &ctx->ViewportArray[i]; 2269 if (ctx->Transform.DepthClampNear && ctx->Transform.DepthClampFar) { 2270 ccv.MinimumDepth = MIN2(vp->Near, vp->Far); 2271 ccv.MaximumDepth = MAX2(vp->Near, vp->Far); 2272 } else if (ctx->Transform.DepthClampNear) { 2273 ccv.MinimumDepth = MIN2(vp->Near, vp->Far); 2274 ccv.MaximumDepth = 0.0; 2275 } else if (ctx->Transform.DepthClampFar) { 2276 ccv.MinimumDepth = 0.0; 2277 ccv.MaximumDepth = MAX2(vp->Near, vp->Far); 2278 } else { 2279 ccv.MinimumDepth = 0.0; 2280 ccv.MaximumDepth = 1.0; 2281 } 2282 GENX(CC_VIEWPORT_pack)(NULL, cc_map, &ccv); 2283 cc_map += GENX(CC_VIEWPORT_length); 2284 } 2285 2286#if GFX_VER >= 7 2287 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_CC), ptr) { 2288 ptr.CCViewportPointer = cc_vp_offset; 2289 } 2290#elif GFX_VER == 6 2291 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) { 2292 vp.CCViewportStateChange = 1; 2293 vp.PointertoCC_VIEWPORT = cc_vp_offset; 2294 } 2295#else 2296 brw->cc.vp_offset = cc_vp_offset; 2297 ctx->NewDriverState |= BRW_NEW_CC_VP; 2298#endif 2299} 2300 2301const struct brw_tracked_state genX(cc_vp) = { 2302 .dirty = { 2303 .mesa = _NEW_TRANSFORM | 2304 _NEW_VIEWPORT, 2305 .brw = BRW_NEW_BATCH | 2306 BRW_NEW_BLORP | 2307 BRW_NEW_VIEWPORT_COUNT, 2308 }, 2309 .emit = genX(upload_cc_viewport) 2310}; 2311 2312/* ---------------------------------------------------------------------- */ 2313 2314static void 2315set_scissor_bits(const struct gl_context *ctx, int i, 2316 bool flip_y, unsigned fb_width, unsigned fb_height, 2317 struct GENX(SCISSOR_RECT) *sc) 2318{ 2319 int bbox[4]; 2320 2321 bbox[0] = MAX2(ctx->ViewportArray[i].X, 0); 2322 bbox[1] = MIN2(bbox[0] + ctx->ViewportArray[i].Width, fb_width); 2323 bbox[2] = CLAMP(ctx->ViewportArray[i].Y, 0, fb_height); 2324 bbox[3] = MIN2(bbox[2] + ctx->ViewportArray[i].Height, fb_height); 2325 _mesa_intersect_scissor_bounding_box(ctx, i, bbox); 2326 2327 if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) { 2328 /* If the scissor was out of bounds and got clamped to 0 width/height 2329 * at the bounds, the subtraction of 1 from maximums could produce a 2330 * negative number and thus not clip anything. Instead, just provide 2331 * a min > max scissor inside the bounds, which produces the expected 2332 * no rendering. 2333 */ 2334 sc->ScissorRectangleXMin = 1; 2335 sc->ScissorRectangleXMax = 0; 2336 sc->ScissorRectangleYMin = 1; 2337 sc->ScissorRectangleYMax = 0; 2338 } else if (!flip_y) { 2339 /* texmemory: Y=0=bottom */ 2340 sc->ScissorRectangleXMin = bbox[0]; 2341 sc->ScissorRectangleXMax = bbox[1] - 1; 2342 sc->ScissorRectangleYMin = bbox[2]; 2343 sc->ScissorRectangleYMax = bbox[3] - 1; 2344 } else { 2345 /* memory: Y=0=top */ 2346 sc->ScissorRectangleXMin = bbox[0]; 2347 sc->ScissorRectangleXMax = bbox[1] - 1; 2348 sc->ScissorRectangleYMin = fb_height - bbox[3]; 2349 sc->ScissorRectangleYMax = fb_height - bbox[2] - 1; 2350 } 2351} 2352 2353#if GFX_VER >= 6 2354static void 2355genX(upload_scissor_state)(struct brw_context *brw) 2356{ 2357 struct gl_context *ctx = &brw->ctx; 2358 const bool flip_y = ctx->DrawBuffer->FlipY; 2359 struct GENX(SCISSOR_RECT) scissor; 2360 uint32_t scissor_state_offset; 2361 const unsigned int fb_width = _mesa_geometric_width(ctx->DrawBuffer); 2362 const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer); 2363 uint32_t *scissor_map; 2364 2365 /* BRW_NEW_VIEWPORT_COUNT */ 2366 const unsigned viewport_count = brw->clip.viewport_count; 2367 /* Wa_1409725701: 2368 * "The viewport-specific state used by the SF unit (SCISSOR_RECT) is 2369 * stored as an array of up to 16 elements. The location of first 2370 * element of the array, as specified by Pointer to SCISSOR_RECT, should 2371 * be aligned to a 64-byte boundary. 2372 */ 2373 const unsigned alignment = 64; 2374 scissor_map = brw_state_batch( 2375 brw, GENX(SCISSOR_RECT_length) * sizeof(uint32_t) * viewport_count, 2376 alignment, &scissor_state_offset); 2377 2378 /* _NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT */ 2379 2380 /* The scissor only needs to handle the intersection of drawable and 2381 * scissor rect. Clipping to the boundaries of static shared buffers 2382 * for front/back/depth is covered by looping over cliprects in brw_draw.c. 2383 * 2384 * Note that the hardware's coordinates are inclusive, while Mesa's min is 2385 * inclusive but max is exclusive. 2386 */ 2387 for (unsigned i = 0; i < viewport_count; i++) { 2388 set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, &scissor); 2389 GENX(SCISSOR_RECT_pack)( 2390 NULL, scissor_map + i * GENX(SCISSOR_RECT_length), &scissor); 2391 } 2392 2393 brw_batch_emit(brw, GENX(3DSTATE_SCISSOR_STATE_POINTERS), ptr) { 2394 ptr.ScissorRectPointer = scissor_state_offset; 2395 } 2396} 2397 2398static const struct brw_tracked_state genX(scissor_state) = { 2399 .dirty = { 2400 .mesa = _NEW_BUFFERS | 2401 _NEW_SCISSOR | 2402 _NEW_VIEWPORT, 2403 .brw = BRW_NEW_BATCH | 2404 BRW_NEW_BLORP | 2405 BRW_NEW_VIEWPORT_COUNT, 2406 }, 2407 .emit = genX(upload_scissor_state), 2408}; 2409#endif 2410 2411/* ---------------------------------------------------------------------- */ 2412 2413static void 2414genX(upload_sf_clip_viewport)(struct brw_context *brw) 2415{ 2416 struct gl_context *ctx = &brw->ctx; 2417 float y_scale, y_bias; 2418 2419 /* BRW_NEW_VIEWPORT_COUNT */ 2420 const unsigned viewport_count = brw->clip.viewport_count; 2421 2422 /* _NEW_BUFFERS */ 2423 const bool flip_y = ctx->DrawBuffer->FlipY; 2424 const uint32_t fb_width = (float)_mesa_geometric_width(ctx->DrawBuffer); 2425 const uint32_t fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer); 2426 2427#if GFX_VER >= 7 2428#define clv sfv 2429 struct GENX(SF_CLIP_VIEWPORT) sfv; 2430 uint32_t sf_clip_vp_offset; 2431 uint32_t *sf_clip_map = 2432 brw_state_batch(brw, GENX(SF_CLIP_VIEWPORT_length) * 4 * viewport_count, 2433 64, &sf_clip_vp_offset); 2434#else 2435 struct GENX(SF_VIEWPORT) sfv; 2436 struct GENX(CLIP_VIEWPORT) clv; 2437 uint32_t sf_vp_offset, clip_vp_offset; 2438 uint32_t *sf_map = 2439 brw_state_batch(brw, GENX(SF_VIEWPORT_length) * 4 * viewport_count, 2440 32, &sf_vp_offset); 2441 uint32_t *clip_map = 2442 brw_state_batch(brw, GENX(CLIP_VIEWPORT_length) * 4 * viewport_count, 2443 32, &clip_vp_offset); 2444#endif 2445 2446 /* _NEW_BUFFERS */ 2447 if (flip_y) { 2448 y_scale = -1.0; 2449 y_bias = (float)fb_height; 2450 } else { 2451 y_scale = 1.0; 2452 y_bias = 0; 2453 } 2454 2455 for (unsigned i = 0; i < brw->clip.viewport_count; i++) { 2456 /* _NEW_VIEWPORT: Guardband Clipping */ 2457 float scale[3], translate[3], gb_xmin, gb_xmax, gb_ymin, gb_ymax; 2458 _mesa_get_viewport_xform(ctx, i, scale, translate); 2459 2460 sfv.ViewportMatrixElementm00 = scale[0]; 2461 sfv.ViewportMatrixElementm11 = scale[1] * y_scale, 2462 sfv.ViewportMatrixElementm22 = scale[2], 2463 sfv.ViewportMatrixElementm30 = translate[0], 2464 sfv.ViewportMatrixElementm31 = translate[1] * y_scale + y_bias, 2465 sfv.ViewportMatrixElementm32 = translate[2], 2466 intel_calculate_guardband_size(fb_width, fb_height, 2467 sfv.ViewportMatrixElementm00, 2468 sfv.ViewportMatrixElementm11, 2469 sfv.ViewportMatrixElementm30, 2470 sfv.ViewportMatrixElementm31, 2471 &gb_xmin, &gb_xmax, &gb_ymin, &gb_ymax); 2472 2473 2474 clv.XMinClipGuardband = gb_xmin; 2475 clv.XMaxClipGuardband = gb_xmax; 2476 clv.YMinClipGuardband = gb_ymin; 2477 clv.YMaxClipGuardband = gb_ymax; 2478 2479#if GFX_VER < 6 2480 set_scissor_bits(ctx, i, flip_y, fb_width, fb_height, 2481 &sfv.ScissorRectangle); 2482#elif GFX_VER >= 8 2483 /* _NEW_VIEWPORT | _NEW_BUFFERS: Screen Space Viewport 2484 * The hardware will take the intersection of the drawing rectangle, 2485 * scissor rectangle, and the viewport extents. However, emitting 2486 * 3DSTATE_DRAWING_RECTANGLE is expensive since it requires a full 2487 * pipeline stall so we're better off just being a little more clever 2488 * with our viewport so we can emit it once at context creation time. 2489 */ 2490 const float viewport_Xmin = MAX2(ctx->ViewportArray[i].X, 0); 2491 const float viewport_Ymin = MAX2(ctx->ViewportArray[i].Y, 0); 2492 const float viewport_Xmax = 2493 MIN2(ctx->ViewportArray[i].X + ctx->ViewportArray[i].Width, fb_width); 2494 const float viewport_Ymax = 2495 MIN2(ctx->ViewportArray[i].Y + ctx->ViewportArray[i].Height, fb_height); 2496 2497 if (flip_y) { 2498 sfv.XMinViewPort = viewport_Xmin; 2499 sfv.XMaxViewPort = viewport_Xmax - 1; 2500 sfv.YMinViewPort = fb_height - viewport_Ymax; 2501 sfv.YMaxViewPort = fb_height - viewport_Ymin - 1; 2502 } else { 2503 sfv.XMinViewPort = viewport_Xmin; 2504 sfv.XMaxViewPort = viewport_Xmax - 1; 2505 sfv.YMinViewPort = viewport_Ymin; 2506 sfv.YMaxViewPort = viewport_Ymax - 1; 2507 } 2508#endif 2509 2510#if GFX_VER >= 7 2511 GENX(SF_CLIP_VIEWPORT_pack)(NULL, sf_clip_map, &sfv); 2512 sf_clip_map += GENX(SF_CLIP_VIEWPORT_length); 2513#else 2514 GENX(SF_VIEWPORT_pack)(NULL, sf_map, &sfv); 2515 GENX(CLIP_VIEWPORT_pack)(NULL, clip_map, &clv); 2516 sf_map += GENX(SF_VIEWPORT_length); 2517 clip_map += GENX(CLIP_VIEWPORT_length); 2518#endif 2519 } 2520 2521#if GFX_VER >= 7 2522 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS_SF_CLIP), ptr) { 2523 ptr.SFClipViewportPointer = sf_clip_vp_offset; 2524 } 2525#elif GFX_VER == 6 2526 brw_batch_emit(brw, GENX(3DSTATE_VIEWPORT_STATE_POINTERS), vp) { 2527 vp.SFViewportStateChange = 1; 2528 vp.CLIPViewportStateChange = 1; 2529 vp.PointertoCLIP_VIEWPORT = clip_vp_offset; 2530 vp.PointertoSF_VIEWPORT = sf_vp_offset; 2531 } 2532#else 2533 brw->sf.vp_offset = sf_vp_offset; 2534 brw->clip.vp_offset = clip_vp_offset; 2535 brw->ctx.NewDriverState |= BRW_NEW_SF_VP | BRW_NEW_CLIP_VP; 2536#endif 2537} 2538 2539static const struct brw_tracked_state genX(sf_clip_viewport) = { 2540 .dirty = { 2541 .mesa = _NEW_BUFFERS | 2542 _NEW_VIEWPORT | 2543 (GFX_VER <= 5 ? _NEW_SCISSOR : 0), 2544 .brw = BRW_NEW_BATCH | 2545 BRW_NEW_BLORP | 2546 BRW_NEW_VIEWPORT_COUNT, 2547 }, 2548 .emit = genX(upload_sf_clip_viewport), 2549}; 2550 2551/* ---------------------------------------------------------------------- */ 2552 2553static void 2554genX(upload_gs_state)(struct brw_context *brw) 2555{ 2556 UNUSED struct gl_context *ctx = &brw->ctx; 2557 UNUSED const struct intel_device_info *devinfo = &brw->screen->devinfo; 2558 const struct brw_stage_state *stage_state = &brw->gs.base; 2559 const struct gl_program *gs_prog = brw->programs[MESA_SHADER_GEOMETRY]; 2560 /* BRW_NEW_GEOMETRY_PROGRAM */ 2561 bool active = GFX_VER >= 6 && gs_prog; 2562 2563 /* BRW_NEW_GS_PROG_DATA */ 2564 struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data; 2565 UNUSED const struct brw_vue_prog_data *vue_prog_data = 2566 brw_vue_prog_data(stage_prog_data); 2567#if GFX_VER >= 7 2568 const struct brw_gs_prog_data *gs_prog_data = 2569 brw_gs_prog_data(stage_prog_data); 2570#endif 2571 2572#if GFX_VER == 6 2573 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_GS), cgs) { 2574 if (active && stage_state->push_const_size != 0) { 2575 cgs.Buffer0Valid = true; 2576 cgs.ConstantBody.PointertoConstantBuffer0 = stage_state->push_const_offset; 2577 cgs.ConstantBody.ConstantBuffer0ReadLength = stage_state->push_const_size - 1; 2578 } 2579 } 2580#endif 2581 2582#if GFX_VERx10 == 70 2583 /** 2584 * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages > 2585 * Geometry > Geometry Shader > State: 2586 * 2587 * "Note: Because of corruption in IVB:GT2, software needs to flush the 2588 * whole fixed function pipeline when the GS enable changes value in 2589 * the 3DSTATE_GS." 2590 * 2591 * The hardware architects have clarified that in this context "flush the 2592 * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS 2593 * Stall" bit set. 2594 */ 2595 if (devinfo->gt == 2 && brw->gs.enabled != active) 2596 gfx7_emit_cs_stall_flush(brw); 2597#endif 2598 2599#if GFX_VER >= 6 2600 brw_batch_emit(brw, GENX(3DSTATE_GS), gs) { 2601#else 2602 ctx->NewDriverState |= BRW_NEW_GFX4_UNIT_STATE; 2603 brw_state_emit(brw, GENX(GS_STATE), 32, &brw->ff_gs.state_offset, gs) { 2604#endif 2605 2606#if GFX_VER >= 6 2607 if (active) { 2608 INIT_THREAD_DISPATCH_FIELDS(gs, Vertex); 2609 2610#if GFX_VER >= 7 2611 gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1; 2612 gs.OutputTopology = gs_prog_data->output_topology; 2613 gs.ControlDataHeaderSize = 2614 gs_prog_data->control_data_header_size_hwords; 2615 2616 gs.InstanceControl = gs_prog_data->invocations - 1; 2617 gs.DispatchMode = vue_prog_data->dispatch_mode; 2618 2619 gs.IncludePrimitiveID = gs_prog_data->include_primitive_id; 2620 2621 gs.ControlDataFormat = gs_prog_data->control_data_format; 2622#endif 2623 2624 /* Note: the meaning of the GFX7_GS_REORDER_TRAILING bit changes between 2625 * Ivy Bridge and Haswell. 2626 * 2627 * On Ivy Bridge, setting this bit causes the vertices of a triangle 2628 * strip to be delivered to the geometry shader in an order that does 2629 * not strictly follow the OpenGL spec, but preserves triangle 2630 * orientation. For example, if the vertices are (1, 2, 3, 4, 5), then 2631 * the geometry shader sees triangles: 2632 * 2633 * (1, 2, 3), (2, 4, 3), (3, 4, 5) 2634 * 2635 * (Clearing the bit is even worse, because it fails to preserve 2636 * orientation). 2637 * 2638 * Triangle strips with adjacency always ordered in a way that preserves 2639 * triangle orientation but does not strictly follow the OpenGL spec, 2640 * regardless of the setting of this bit. 2641 * 2642 * On Haswell, both triangle strips and triangle strips with adjacency 2643 * are always ordered in a way that preserves triangle orientation. 2644 * Setting this bit causes the ordering to strictly follow the OpenGL 2645 * spec. 2646 * 2647 * So in either case we want to set the bit. Unfortunately on Ivy 2648 * Bridge this will get the order close to correct but not perfect. 2649 */ 2650 gs.ReorderMode = TRAILING; 2651 gs.MaximumNumberofThreads = 2652 GFX_VER == 8 ? (devinfo->max_gs_threads / 2 - 1) 2653 : (devinfo->max_gs_threads - 1); 2654 2655#if GFX_VER < 7 2656 gs.SOStatisticsEnable = true; 2657 if (gs_prog->info.has_transform_feedback_varyings) 2658 gs.SVBIPayloadEnable = _mesa_is_xfb_active_and_unpaused(ctx); 2659 2660 /* GFX6_GS_SPF_MODE and GFX6_GS_VECTOR_MASK_ENABLE are enabled as it 2661 * was previously done for gfx6. 2662 * 2663 * TODO: test with both disabled to see if the HW is behaving 2664 * as expected, like in gfx7. 2665 */ 2666 gs.SingleProgramFlow = true; 2667 gs.VectorMaskEnable = true; 2668#endif 2669 2670#if GFX_VER >= 8 2671 gs.ExpectedVertexCount = gs_prog_data->vertices_in; 2672 2673 if (gs_prog_data->static_vertex_count != -1) { 2674 gs.StaticOutput = true; 2675 gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count; 2676 } 2677 gs.IncludeVertexHandles = vue_prog_data->include_vue_handles; 2678 2679 gs.UserClipDistanceCullTestEnableBitmask = 2680 vue_prog_data->cull_distance_mask; 2681 2682 const int urb_entry_write_offset = 1; 2683 const uint32_t urb_entry_output_length = 2684 DIV_ROUND_UP(vue_prog_data->vue_map.num_slots, 2) - 2685 urb_entry_write_offset; 2686 2687 gs.VertexURBEntryOutputReadOffset = urb_entry_write_offset; 2688 gs.VertexURBEntryOutputLength = MAX2(urb_entry_output_length, 1); 2689#endif 2690 } 2691#endif 2692 2693#if GFX_VER <= 6 2694 if (!active && brw->ff_gs.prog_active) { 2695 /* In gfx6, transform feedback for the VS stage is done with an 2696 * ad-hoc GS program. This function provides the needed 3DSTATE_GS 2697 * for this. 2698 */ 2699 gs.KernelStartPointer = KSP(brw, brw->ff_gs.prog_offset); 2700 gs.SingleProgramFlow = true; 2701 gs.DispatchGRFStartRegisterForURBData = GFX_VER == 6 ? 2 : 1; 2702 gs.VertexURBEntryReadLength = brw->ff_gs.prog_data->urb_read_length; 2703 2704#if GFX_VER <= 5 2705 gs.GRFRegisterCount = 2706 DIV_ROUND_UP(brw->ff_gs.prog_data->total_grf, 16) - 1; 2707 /* BRW_NEW_URB_FENCE */ 2708 gs.NumberofURBEntries = brw->urb.nr_gs_entries; 2709 gs.URBEntryAllocationSize = brw->urb.vsize - 1; 2710 gs.MaximumNumberofThreads = brw->urb.nr_gs_entries >= 8 ? 1 : 0; 2711 gs.FloatingPointMode = FLOATING_POINT_MODE_Alternate; 2712#else 2713 gs.Enable = true; 2714 gs.VectorMaskEnable = true; 2715 gs.SVBIPayloadEnable = true; 2716 gs.SVBIPostIncrementEnable = true; 2717 gs.SVBIPostIncrementValue = 2718 brw->ff_gs.prog_data->svbi_postincrement_value; 2719 gs.SOStatisticsEnable = true; 2720 gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1; 2721#endif 2722 } 2723#endif 2724 if (!active && !brw->ff_gs.prog_active) { 2725#if GFX_VER < 8 2726 gs.DispatchGRFStartRegisterForURBData = 1; 2727#if GFX_VER >= 7 2728 gs.IncludeVertexHandles = true; 2729#endif 2730#endif 2731 } 2732 2733#if GFX_VER >= 6 2734 gs.StatisticsEnable = true; 2735#endif 2736#if GFX_VER == 5 || GFX_VER == 6 2737 gs.RenderingEnabled = true; 2738#endif 2739#if GFX_VER <= 5 2740 gs.MaximumVPIndex = brw->clip.viewport_count - 1; 2741#endif 2742 } 2743 2744#if GFX_VER == 6 2745 brw->gs.enabled = active; 2746#endif 2747} 2748 2749static const struct brw_tracked_state genX(gs_state) = { 2750 .dirty = { 2751 .mesa = (GFX_VER == 6 ? _NEW_PROGRAM_CONSTANTS : 0), 2752 .brw = BRW_NEW_BATCH | 2753 BRW_NEW_BLORP | 2754 (GFX_VER <= 5 ? BRW_NEW_PUSH_CONSTANT_ALLOCATION | 2755 BRW_NEW_PROGRAM_CACHE | 2756 BRW_NEW_URB_FENCE | 2757 BRW_NEW_VIEWPORT_COUNT 2758 : 0) | 2759 (GFX_VER >= 6 ? BRW_NEW_CONTEXT | 2760 BRW_NEW_GEOMETRY_PROGRAM | 2761 BRW_NEW_GS_PROG_DATA 2762 : 0) | 2763 (GFX_VER < 7 ? BRW_NEW_FF_GS_PROG_DATA : 0), 2764 }, 2765 .emit = genX(upload_gs_state), 2766}; 2767 2768/* ---------------------------------------------------------------------- */ 2769 2770UNUSED static GLenum 2771fix_dual_blend_alpha_to_one(GLenum function) 2772{ 2773 switch (function) { 2774 case GL_SRC1_ALPHA: 2775 return GL_ONE; 2776 2777 case GL_ONE_MINUS_SRC1_ALPHA: 2778 return GL_ZERO; 2779 } 2780 2781 return function; 2782} 2783 2784#define blend_factor(x) brw_translate_blend_factor(x) 2785#define blend_eqn(x) brw_translate_blend_equation(x) 2786 2787/** 2788 * Modify blend function to force destination alpha to 1.0 2789 * 2790 * If \c function specifies a blend function that uses destination alpha, 2791 * replace it with a function that hard-wires destination alpha to 1.0. This 2792 * is used when rendering to xRGB targets. 2793 */ 2794static GLenum 2795brw_fix_xRGB_alpha(GLenum function) 2796{ 2797 switch (function) { 2798 case GL_DST_ALPHA: 2799 return GL_ONE; 2800 2801 case GL_ONE_MINUS_DST_ALPHA: 2802 case GL_SRC_ALPHA_SATURATE: 2803 return GL_ZERO; 2804 } 2805 2806 return function; 2807} 2808 2809#if GFX_VER >= 6 2810typedef struct GENX(BLEND_STATE_ENTRY) BLEND_ENTRY_GENXML; 2811#else 2812typedef struct GENX(COLOR_CALC_STATE) BLEND_ENTRY_GENXML; 2813#endif 2814 2815UNUSED static bool 2816set_blend_entry_bits(struct brw_context *brw, BLEND_ENTRY_GENXML *entry, int i, 2817 bool alpha_to_one) 2818{ 2819 struct gl_context *ctx = &brw->ctx; 2820 2821 /* _NEW_BUFFERS */ 2822 const struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i]; 2823 2824 bool independent_alpha_blend = false; 2825 2826 /* Used for implementing the following bit of GL_EXT_texture_integer: 2827 * "Per-fragment operations that require floating-point color 2828 * components, including multisample alpha operations, alpha test, 2829 * blending, and dithering, have no effect when the corresponding 2830 * colors are written to an integer color buffer." 2831 */ 2832 const bool integer = ctx->DrawBuffer->_IntegerBuffers & (0x1 << i); 2833 2834 const unsigned blend_enabled = GFX_VER >= 6 ? 2835 ctx->Color.BlendEnabled & (1 << i) : ctx->Color.BlendEnabled; 2836 2837 /* _NEW_COLOR */ 2838 if (ctx->Color.ColorLogicOpEnabled) { 2839 GLenum rb_type = rb ? _mesa_get_format_datatype(rb->Format) 2840 : GL_UNSIGNED_NORMALIZED; 2841 WARN_ONCE(ctx->Color.LogicOp != GL_COPY && 2842 rb_type != GL_UNSIGNED_NORMALIZED && 2843 rb_type != GL_FLOAT, "Ignoring %s logic op on %s " 2844 "renderbuffer\n", 2845 _mesa_enum_to_string(ctx->Color.LogicOp), 2846 _mesa_enum_to_string(rb_type)); 2847 if (GFX_VER >= 8 || rb_type == GL_UNSIGNED_NORMALIZED) { 2848 entry->LogicOpEnable = true; 2849 entry->LogicOpFunction = ctx->Color._LogicOp; 2850 } 2851 } else if (blend_enabled && 2852 ctx->Color._AdvancedBlendMode == BLEND_NONE 2853 && (GFX_VER <= 5 || !integer)) { 2854 GLenum eqRGB = ctx->Color.Blend[i].EquationRGB; 2855 GLenum eqA = ctx->Color.Blend[i].EquationA; 2856 GLenum srcRGB = ctx->Color.Blend[i].SrcRGB; 2857 GLenum dstRGB = ctx->Color.Blend[i].DstRGB; 2858 GLenum srcA = ctx->Color.Blend[i].SrcA; 2859 GLenum dstA = ctx->Color.Blend[i].DstA; 2860 2861 if (eqRGB == GL_MIN || eqRGB == GL_MAX) 2862 srcRGB = dstRGB = GL_ONE; 2863 2864 if (eqA == GL_MIN || eqA == GL_MAX) 2865 srcA = dstA = GL_ONE; 2866 2867 /* Due to hardware limitations, the destination may have information 2868 * in an alpha channel even when the format specifies no alpha 2869 * channel. In order to avoid getting any incorrect blending due to 2870 * that alpha channel, coerce the blend factors to values that will 2871 * not read the alpha channel, but will instead use the correct 2872 * implicit value for alpha. 2873 */ 2874 if (rb && !_mesa_base_format_has_channel(rb->_BaseFormat, 2875 GL_TEXTURE_ALPHA_TYPE)) { 2876 srcRGB = brw_fix_xRGB_alpha(srcRGB); 2877 srcA = brw_fix_xRGB_alpha(srcA); 2878 dstRGB = brw_fix_xRGB_alpha(dstRGB); 2879 dstA = brw_fix_xRGB_alpha(dstA); 2880 } 2881 2882 /* From the BLEND_STATE docs, DWord 0, Bit 29 (AlphaToOne Enable): 2883 * "If Dual Source Blending is enabled, this bit must be disabled." 2884 * 2885 * We override SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO, 2886 * and leave it enabled anyway. 2887 */ 2888 if (GFX_VER >= 6 && ctx->Color._BlendUsesDualSrc & (1 << i) && alpha_to_one) { 2889 srcRGB = fix_dual_blend_alpha_to_one(srcRGB); 2890 srcA = fix_dual_blend_alpha_to_one(srcA); 2891 dstRGB = fix_dual_blend_alpha_to_one(dstRGB); 2892 dstA = fix_dual_blend_alpha_to_one(dstA); 2893 } 2894 2895 /* BRW_NEW_FS_PROG_DATA */ 2896 const struct brw_wm_prog_data *wm_prog_data = 2897 brw_wm_prog_data(brw->wm.base.prog_data); 2898 2899 /* The Dual Source Blending documentation says: 2900 * 2901 * "If SRC1 is included in a src/dst blend factor and 2902 * a DualSource RT Write message is not used, results 2903 * are UNDEFINED. (This reflects the same restriction in DX APIs, 2904 * where undefined results are produced if “o1” is not written 2905 * by a PS – there are no default values defined). 2906 * If SRC1 is not included in a src/dst blend factor, 2907 * dual source blending must be disabled." 2908 * 2909 * There is no way to gracefully fix this undefined situation 2910 * so we just disable the blending to prevent possible issues. 2911 */ 2912 entry->ColorBufferBlendEnable = 2913 !(ctx->Color._BlendUsesDualSrc & 0x1) || wm_prog_data->dual_src_blend; 2914 2915 entry->DestinationBlendFactor = blend_factor(dstRGB); 2916 entry->SourceBlendFactor = blend_factor(srcRGB); 2917 entry->DestinationAlphaBlendFactor = blend_factor(dstA); 2918 entry->SourceAlphaBlendFactor = blend_factor(srcA); 2919 entry->ColorBlendFunction = blend_eqn(eqRGB); 2920 entry->AlphaBlendFunction = blend_eqn(eqA); 2921 2922 if (srcA != srcRGB || dstA != dstRGB || eqA != eqRGB) 2923 independent_alpha_blend = true; 2924 } 2925 2926 return independent_alpha_blend; 2927} 2928 2929#if GFX_VER >= 6 2930static void 2931genX(upload_blend_state)(struct brw_context *brw) 2932{ 2933 struct gl_context *ctx = &brw->ctx; 2934 int size; 2935 2936 /* We need at least one BLEND_STATE written, because we might do 2937 * thread dispatch even if _NumColorDrawBuffers is 0 (for example 2938 * for computed depth or alpha test), which will do an FB write 2939 * with render target 0, which will reference BLEND_STATE[0] for 2940 * alpha test enable. 2941 */ 2942 int nr_draw_buffers = ctx->DrawBuffer->_NumColorDrawBuffers; 2943 if (nr_draw_buffers == 0 && ctx->Color.AlphaEnabled) 2944 nr_draw_buffers = 1; 2945 2946 size = GENX(BLEND_STATE_ENTRY_length) * 4 * nr_draw_buffers; 2947#if GFX_VER >= 8 2948 size += GENX(BLEND_STATE_length) * 4; 2949#endif 2950 2951 uint32_t *blend_map; 2952 blend_map = brw_state_batch(brw, size, 64, &brw->cc.blend_state_offset); 2953 2954#if GFX_VER >= 8 2955 struct GENX(BLEND_STATE) blend = { 0 }; 2956 { 2957#else 2958 for (int i = 0; i < nr_draw_buffers; i++) { 2959 struct GENX(BLEND_STATE_ENTRY) entry = { 0 }; 2960#define blend entry 2961#endif 2962 /* OpenGL specification 3.3 (page 196), section 4.1.3 says: 2963 * "If drawbuffer zero is not NONE and the buffer it references has an 2964 * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE 2965 * operations are skipped." 2966 */ 2967 if (!(ctx->DrawBuffer->_IntegerBuffers & 0x1)) { 2968 /* _NEW_MULTISAMPLE */ 2969 if (_mesa_is_multisample_enabled(ctx)) { 2970 if (ctx->Multisample.SampleAlphaToCoverage) { 2971 blend.AlphaToCoverageEnable = true; 2972 blend.AlphaToCoverageDitherEnable = GFX_VER >= 7; 2973 } 2974 if (ctx->Multisample.SampleAlphaToOne) 2975 blend.AlphaToOneEnable = true; 2976 } 2977 2978 /* _NEW_COLOR */ 2979 if (ctx->Color.AlphaEnabled) { 2980 blend.AlphaTestEnable = true; 2981 blend.AlphaTestFunction = 2982 brw_translate_compare_func(ctx->Color.AlphaFunc); 2983 } 2984 2985 if (ctx->Color.DitherFlag) { 2986 blend.ColorDitherEnable = true; 2987 } 2988 } 2989 2990#if GFX_VER >= 8 2991 for (int i = 0; i < nr_draw_buffers; i++) { 2992 struct GENX(BLEND_STATE_ENTRY) entry = { 0 }; 2993#else 2994 { 2995#endif 2996 blend.IndependentAlphaBlendEnable = 2997 set_blend_entry_bits(brw, &entry, i, blend.AlphaToOneEnable) || 2998 blend.IndependentAlphaBlendEnable; 2999 3000 /* See section 8.1.6 "Pre-Blend Color Clamping" of the 3001 * SandyBridge PRM Volume 2 Part 1 for HW requirements. 3002 * 3003 * We do our ARB_color_buffer_float CLAMP_FRAGMENT_COLOR 3004 * clamping in the fragment shader. For its clamping of 3005 * blending, the spec says: 3006 * 3007 * "RESOLVED: For fixed-point color buffers, the inputs and 3008 * the result of the blending equation are clamped. For 3009 * floating-point color buffers, no clamping occurs." 3010 * 3011 * So, generally, we want clamping to the render target's range. 3012 * And, good news, the hardware tables for both pre- and 3013 * post-blend color clamping are either ignored, or any are 3014 * allowed, or clamping is required but RT range clamping is a 3015 * valid option. 3016 */ 3017 entry.PreBlendColorClampEnable = true; 3018 entry.PostBlendColorClampEnable = true; 3019 entry.ColorClampRange = COLORCLAMP_RTFORMAT; 3020 3021 entry.WriteDisableRed = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 0); 3022 entry.WriteDisableGreen = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 1); 3023 entry.WriteDisableBlue = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 2); 3024 entry.WriteDisableAlpha = !GET_COLORMASK_BIT(ctx->Color.ColorMask, i, 3); 3025 3026#if GFX_VER >= 8 3027 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[1 + i * 2], &entry); 3028#else 3029 GENX(BLEND_STATE_ENTRY_pack)(NULL, &blend_map[i * 2], &entry); 3030#endif 3031 } 3032 } 3033 3034#if GFX_VER >= 8 3035 GENX(BLEND_STATE_pack)(NULL, blend_map, &blend); 3036#endif 3037 3038#if GFX_VER < 7 3039 brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { 3040 ptr.PointertoBLEND_STATE = brw->cc.blend_state_offset; 3041 ptr.BLEND_STATEChange = true; 3042 } 3043#else 3044 brw_batch_emit(brw, GENX(3DSTATE_BLEND_STATE_POINTERS), ptr) { 3045 ptr.BlendStatePointer = brw->cc.blend_state_offset; 3046#if GFX_VER >= 8 3047 ptr.BlendStatePointerValid = true; 3048#endif 3049 } 3050#endif 3051} 3052 3053UNUSED static const struct brw_tracked_state genX(blend_state) = { 3054 .dirty = { 3055 .mesa = _NEW_BUFFERS | 3056 _NEW_COLOR | 3057 _NEW_MULTISAMPLE, 3058 .brw = BRW_NEW_BATCH | 3059 BRW_NEW_BLORP | 3060 BRW_NEW_FS_PROG_DATA | 3061 BRW_NEW_STATE_BASE_ADDRESS, 3062 }, 3063 .emit = genX(upload_blend_state), 3064}; 3065#endif 3066 3067/* ---------------------------------------------------------------------- */ 3068 3069#if GFX_VER >= 7 3070UNUSED static const uint32_t push_constant_opcodes[] = { 3071 [MESA_SHADER_VERTEX] = 21, 3072 [MESA_SHADER_TESS_CTRL] = 25, /* HS */ 3073 [MESA_SHADER_TESS_EVAL] = 26, /* DS */ 3074 [MESA_SHADER_GEOMETRY] = 22, 3075 [MESA_SHADER_FRAGMENT] = 23, 3076 [MESA_SHADER_COMPUTE] = 0, 3077}; 3078 3079static void 3080genX(upload_push_constant_packets)(struct brw_context *brw) 3081{ 3082 const struct intel_device_info *devinfo = &brw->screen->devinfo; 3083 struct gl_context *ctx = &brw->ctx; 3084 3085 UNUSED uint32_t mocs = GFX_VER < 8 ? GFX7_MOCS_L3 : 0; 3086 3087 struct brw_stage_state *stage_states[] = { 3088 &brw->vs.base, 3089 &brw->tcs.base, 3090 &brw->tes.base, 3091 &brw->gs.base, 3092 &brw->wm.base, 3093 }; 3094 3095 if (GFX_VERx10 == 70 && !devinfo->is_baytrail && 3096 stage_states[MESA_SHADER_VERTEX]->push_constants_dirty) 3097 gfx7_emit_vs_workaround_flush(brw); 3098 3099 for (int stage = 0; stage <= MESA_SHADER_FRAGMENT; stage++) { 3100 struct brw_stage_state *stage_state = stage_states[stage]; 3101 UNUSED struct gl_program *prog = ctx->_Shader->CurrentProgram[stage]; 3102 3103 if (!stage_state->push_constants_dirty) 3104 continue; 3105 3106 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_VS), pkt) { 3107 pkt._3DCommandSubOpcode = push_constant_opcodes[stage]; 3108 if (stage_state->prog_data) { 3109#if GFX_VERx10 >= 75 3110 /* The Skylake PRM contains the following restriction: 3111 * 3112 * "The driver must ensure The following case does not occur 3113 * without a flush to the 3D engine: 3DSTATE_CONSTANT_* with 3114 * buffer 3 read length equal to zero committed followed by a 3115 * 3DSTATE_CONSTANT_* with buffer 0 read length not equal to 3116 * zero committed." 3117 * 3118 * To avoid this, we program the buffers in the highest slots. 3119 * This way, slot 0 is only used if slot 3 is also used. 3120 */ 3121 int n = 3; 3122 3123 for (int i = 3; i >= 0; i--) { 3124 const struct brw_ubo_range *range = 3125 &stage_state->prog_data->ubo_ranges[i]; 3126 3127 if (range->length == 0) 3128 continue; 3129 3130 const struct gl_uniform_block *block = 3131 prog->sh.UniformBlocks[range->block]; 3132 const struct gl_buffer_binding *binding = 3133 &ctx->UniformBufferBindings[block->Binding]; 3134 3135 if (!binding->BufferObject) { 3136 static unsigned msg_id = 0; 3137 _mesa_gl_debugf(ctx, &msg_id, MESA_DEBUG_SOURCE_API, 3138 MESA_DEBUG_TYPE_UNDEFINED, 3139 MESA_DEBUG_SEVERITY_HIGH, 3140 "UBO %d unbound, %s shader uniform data " 3141 "will be undefined.", 3142 range->block, 3143 _mesa_shader_stage_to_string(stage)); 3144 continue; 3145 } 3146 3147 assert(binding->Offset % 32 == 0); 3148 3149 struct brw_bo *bo = brw_bufferobj_buffer(brw, 3150 brw_buffer_object(binding->BufferObject), 3151 binding->Offset, range->length * 32, false); 3152 3153 pkt.ConstantBody.ReadLength[n] = range->length; 3154 pkt.ConstantBody.Buffer[n] = 3155 ro_bo(bo, range->start * 32 + binding->Offset); 3156 n--; 3157 } 3158 3159 if (stage_state->push_const_size > 0) { 3160 assert(n >= 0); 3161 pkt.ConstantBody.ReadLength[n] = stage_state->push_const_size; 3162 pkt.ConstantBody.Buffer[n] = 3163 ro_bo(stage_state->push_const_bo, 3164 stage_state->push_const_offset); 3165 } 3166#else 3167 pkt.ConstantBody.ReadLength[0] = stage_state->push_const_size; 3168 pkt.ConstantBody.Buffer[0].offset = 3169 stage_state->push_const_offset | mocs; 3170#endif 3171 } 3172 } 3173 3174 stage_state->push_constants_dirty = false; 3175 brw->ctx.NewDriverState |= GFX_VER >= 9 ? BRW_NEW_SURFACES : 0; 3176 } 3177} 3178 3179const struct brw_tracked_state genX(push_constant_packets) = { 3180 .dirty = { 3181 .mesa = 0, 3182 .brw = BRW_NEW_DRAW_CALL, 3183 }, 3184 .emit = genX(upload_push_constant_packets), 3185}; 3186#endif 3187 3188#if GFX_VER >= 6 3189static void 3190genX(upload_vs_push_constants)(struct brw_context *brw) 3191{ 3192 struct brw_stage_state *stage_state = &brw->vs.base; 3193 3194 /* BRW_NEW_VERTEX_PROGRAM */ 3195 const struct gl_program *vp = brw->programs[MESA_SHADER_VERTEX]; 3196 /* BRW_NEW_VS_PROG_DATA */ 3197 const struct brw_stage_prog_data *prog_data = brw->vs.base.prog_data; 3198 3199 gfx6_upload_push_constants(brw, vp, prog_data, stage_state); 3200} 3201 3202static const struct brw_tracked_state genX(vs_push_constants) = { 3203 .dirty = { 3204 .mesa = _NEW_PROGRAM_CONSTANTS | 3205 _NEW_TRANSFORM, 3206 .brw = BRW_NEW_BATCH | 3207 BRW_NEW_BLORP | 3208 BRW_NEW_VERTEX_PROGRAM | 3209 BRW_NEW_VS_PROG_DATA, 3210 }, 3211 .emit = genX(upload_vs_push_constants), 3212}; 3213 3214static void 3215genX(upload_gs_push_constants)(struct brw_context *brw) 3216{ 3217 struct brw_stage_state *stage_state = &brw->gs.base; 3218 3219 /* BRW_NEW_GEOMETRY_PROGRAM */ 3220 const struct gl_program *gp = brw->programs[MESA_SHADER_GEOMETRY]; 3221 3222 /* BRW_NEW_GS_PROG_DATA */ 3223 struct brw_stage_prog_data *prog_data = brw->gs.base.prog_data; 3224 3225 gfx6_upload_push_constants(brw, gp, prog_data, stage_state); 3226} 3227 3228static const struct brw_tracked_state genX(gs_push_constants) = { 3229 .dirty = { 3230 .mesa = _NEW_PROGRAM_CONSTANTS | 3231 _NEW_TRANSFORM, 3232 .brw = BRW_NEW_BATCH | 3233 BRW_NEW_BLORP | 3234 BRW_NEW_GEOMETRY_PROGRAM | 3235 BRW_NEW_GS_PROG_DATA, 3236 }, 3237 .emit = genX(upload_gs_push_constants), 3238}; 3239 3240static void 3241genX(upload_wm_push_constants)(struct brw_context *brw) 3242{ 3243 struct brw_stage_state *stage_state = &brw->wm.base; 3244 /* BRW_NEW_FRAGMENT_PROGRAM */ 3245 const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT]; 3246 /* BRW_NEW_FS_PROG_DATA */ 3247 const struct brw_stage_prog_data *prog_data = brw->wm.base.prog_data; 3248 3249 gfx6_upload_push_constants(brw, fp, prog_data, stage_state); 3250} 3251 3252static const struct brw_tracked_state genX(wm_push_constants) = { 3253 .dirty = { 3254 .mesa = _NEW_PROGRAM_CONSTANTS, 3255 .brw = BRW_NEW_BATCH | 3256 BRW_NEW_BLORP | 3257 BRW_NEW_FRAGMENT_PROGRAM | 3258 BRW_NEW_FS_PROG_DATA, 3259 }, 3260 .emit = genX(upload_wm_push_constants), 3261}; 3262#endif 3263 3264/* ---------------------------------------------------------------------- */ 3265 3266#if GFX_VER >= 6 3267static unsigned 3268genX(determine_sample_mask)(struct brw_context *brw) 3269{ 3270 struct gl_context *ctx = &brw->ctx; 3271 float coverage = 1.0f; 3272 float coverage_invert = false; 3273 unsigned sample_mask = ~0u; 3274 3275 /* BRW_NEW_NUM_SAMPLES */ 3276 unsigned num_samples = brw->num_samples; 3277 3278 if (_mesa_is_multisample_enabled(ctx)) { 3279 if (ctx->Multisample.SampleCoverage) { 3280 coverage = ctx->Multisample.SampleCoverageValue; 3281 coverage_invert = ctx->Multisample.SampleCoverageInvert; 3282 } 3283 if (ctx->Multisample.SampleMask) { 3284 sample_mask = ctx->Multisample.SampleMaskValue; 3285 } 3286 } 3287 3288 if (num_samples > 1) { 3289 int coverage_int = (int) (num_samples * coverage + 0.5f); 3290 uint32_t coverage_bits = (1 << coverage_int) - 1; 3291 if (coverage_invert) 3292 coverage_bits ^= (1 << num_samples) - 1; 3293 return coverage_bits & sample_mask; 3294 } else { 3295 return 1; 3296 } 3297} 3298 3299static void 3300genX(emit_3dstate_multisample2)(struct brw_context *brw, 3301 unsigned num_samples) 3302{ 3303 unsigned log2_samples = ffs(num_samples) - 1; 3304 3305 brw_batch_emit(brw, GENX(3DSTATE_MULTISAMPLE), multi) { 3306 multi.PixelLocation = CENTER; 3307 multi.NumberofMultisamples = log2_samples; 3308#if GFX_VER == 6 3309 INTEL_SAMPLE_POS_4X(multi.Sample); 3310#elif GFX_VER == 7 3311 switch (num_samples) { 3312 case 1: 3313 INTEL_SAMPLE_POS_1X(multi.Sample); 3314 break; 3315 case 2: 3316 INTEL_SAMPLE_POS_2X(multi.Sample); 3317 break; 3318 case 4: 3319 INTEL_SAMPLE_POS_4X(multi.Sample); 3320 break; 3321 case 8: 3322 INTEL_SAMPLE_POS_8X(multi.Sample); 3323 break; 3324 default: 3325 break; 3326 } 3327#endif 3328 } 3329} 3330 3331static void 3332genX(upload_multisample_state)(struct brw_context *brw) 3333{ 3334 assert(brw->num_samples > 0 && brw->num_samples <= 16); 3335 3336 genX(emit_3dstate_multisample2)(brw, brw->num_samples); 3337 3338 brw_batch_emit(brw, GENX(3DSTATE_SAMPLE_MASK), sm) { 3339 sm.SampleMask = genX(determine_sample_mask)(brw); 3340 } 3341} 3342 3343static const struct brw_tracked_state genX(multisample_state) = { 3344 .dirty = { 3345 .mesa = _NEW_MULTISAMPLE | 3346 (GFX_VER == 10 ? _NEW_BUFFERS : 0), 3347 .brw = BRW_NEW_BLORP | 3348 BRW_NEW_CONTEXT | 3349 BRW_NEW_NUM_SAMPLES, 3350 }, 3351 .emit = genX(upload_multisample_state) 3352}; 3353#endif 3354 3355/* ---------------------------------------------------------------------- */ 3356 3357static void 3358genX(upload_color_calc_state)(struct brw_context *brw) 3359{ 3360 struct gl_context *ctx = &brw->ctx; 3361 3362 brw_state_emit(brw, GENX(COLOR_CALC_STATE), 64, &brw->cc.state_offset, cc) { 3363#if GFX_VER <= 5 3364 cc.IndependentAlphaBlendEnable = 3365 set_blend_entry_bits(brw, &cc, 0, false); 3366 set_depth_stencil_bits(brw, &cc); 3367 3368 if (ctx->Color.AlphaEnabled && 3369 ctx->DrawBuffer->_NumColorDrawBuffers <= 1) { 3370 cc.AlphaTestEnable = true; 3371 cc.AlphaTestFunction = 3372 brw_translate_compare_func(ctx->Color.AlphaFunc); 3373 } 3374 3375 cc.ColorDitherEnable = ctx->Color.DitherFlag; 3376 3377 cc.StatisticsEnable = brw->stats_wm; 3378 3379 cc.CCViewportStatePointer = 3380 ro_bo(brw->batch.state.bo, brw->cc.vp_offset); 3381#else 3382 /* _NEW_COLOR */ 3383 cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0]; 3384 cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1]; 3385 cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2]; 3386 cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3]; 3387 3388#if GFX_VER < 9 3389 /* _NEW_STENCIL */ 3390 cc.StencilReferenceValue = _mesa_get_stencil_ref(ctx, 0); 3391 cc.BackfaceStencilReferenceValue = 3392 _mesa_get_stencil_ref(ctx, ctx->Stencil._BackFace); 3393#endif 3394 3395#endif 3396 3397 /* _NEW_COLOR */ 3398 UNCLAMPED_FLOAT_TO_UBYTE(cc.AlphaReferenceValueAsUNORM8, 3399 ctx->Color.AlphaRef); 3400 } 3401 3402#if GFX_VER >= 6 3403 brw_batch_emit(brw, GENX(3DSTATE_CC_STATE_POINTERS), ptr) { 3404 ptr.ColorCalcStatePointer = brw->cc.state_offset; 3405#if GFX_VER != 7 3406 ptr.ColorCalcStatePointerValid = true; 3407#endif 3408 } 3409#else 3410 brw->ctx.NewDriverState |= BRW_NEW_GFX4_UNIT_STATE; 3411#endif 3412} 3413 3414UNUSED static const struct brw_tracked_state genX(color_calc_state) = { 3415 .dirty = { 3416 .mesa = _NEW_COLOR | 3417 _NEW_STENCIL | 3418 (GFX_VER <= 5 ? _NEW_BUFFERS | 3419 _NEW_DEPTH 3420 : 0), 3421 .brw = BRW_NEW_BATCH | 3422 BRW_NEW_BLORP | 3423 (GFX_VER <= 5 ? BRW_NEW_CC_VP | 3424 BRW_NEW_STATS_WM 3425 : BRW_NEW_CC_STATE | 3426 BRW_NEW_STATE_BASE_ADDRESS), 3427 }, 3428 .emit = genX(upload_color_calc_state), 3429}; 3430 3431 3432/* ---------------------------------------------------------------------- */ 3433 3434#if GFX_VERx10 == 75 3435static void 3436genX(upload_color_calc_and_blend_state)(struct brw_context *brw) 3437{ 3438 genX(upload_blend_state)(brw); 3439 genX(upload_color_calc_state)(brw); 3440} 3441 3442/* On Haswell when BLEND_STATE is emitted CC_STATE should also be re-emitted, 3443 * this workarounds the flickering shadows in several games. 3444 */ 3445static const struct brw_tracked_state genX(cc_and_blend_state) = { 3446 .dirty = { 3447 .mesa = _NEW_BUFFERS | 3448 _NEW_COLOR | 3449 _NEW_STENCIL | 3450 _NEW_MULTISAMPLE, 3451 .brw = BRW_NEW_BATCH | 3452 BRW_NEW_BLORP | 3453 BRW_NEW_CC_STATE | 3454 BRW_NEW_FS_PROG_DATA | 3455 BRW_NEW_STATE_BASE_ADDRESS, 3456 }, 3457 .emit = genX(upload_color_calc_and_blend_state), 3458}; 3459#endif 3460 3461/* ---------------------------------------------------------------------- */ 3462 3463#if GFX_VER >= 7 3464static void 3465genX(upload_sbe)(struct brw_context *brw) 3466{ 3467 struct gl_context *ctx = &brw->ctx; 3468 /* BRW_NEW_FRAGMENT_PROGRAM */ 3469 UNUSED const struct gl_program *fp = brw->programs[MESA_SHADER_FRAGMENT]; 3470 /* BRW_NEW_FS_PROG_DATA */ 3471 const struct brw_wm_prog_data *wm_prog_data = 3472 brw_wm_prog_data(brw->wm.base.prog_data); 3473#if GFX_VER >= 8 3474 struct GENX(SF_OUTPUT_ATTRIBUTE_DETAIL) attr_overrides[16] = { { 0 } }; 3475#else 3476#define attr_overrides sbe.Attribute 3477#endif 3478 uint32_t urb_entry_read_length; 3479 uint32_t urb_entry_read_offset; 3480 uint32_t point_sprite_enables; 3481 3482 brw_batch_emit(brw, GENX(3DSTATE_SBE), sbe) { 3483 sbe.AttributeSwizzleEnable = true; 3484 sbe.NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs; 3485 3486 /* _NEW_BUFFERS */ 3487 bool flip_y = ctx->DrawBuffer->FlipY; 3488 3489 /* _NEW_POINT 3490 * 3491 * Window coordinates in an FBO are inverted, which means point 3492 * sprite origin must be inverted. 3493 */ 3494 if ((ctx->Point.SpriteOrigin == GL_LOWER_LEFT) == flip_y) 3495 sbe.PointSpriteTextureCoordinateOrigin = LOWERLEFT; 3496 else 3497 sbe.PointSpriteTextureCoordinateOrigin = UPPERLEFT; 3498 3499 /* _NEW_POINT | _NEW_LIGHT | _NEW_PROGRAM, 3500 * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | 3501 * BRW_NEW_GS_PROG_DATA | BRW_NEW_PRIMITIVE | BRW_NEW_TES_PROG_DATA | 3502 * BRW_NEW_VUE_MAP_GEOM_OUT 3503 */ 3504 genX(calculate_attr_overrides)(brw, 3505 attr_overrides, 3506 &point_sprite_enables, 3507 &urb_entry_read_length, 3508 &urb_entry_read_offset); 3509 3510 /* Typically, the URB entry read length and offset should be programmed 3511 * in 3DSTATE_VS and 3DSTATE_GS; SBE inherits it from the last active 3512 * stage which produces geometry. However, we don't know the proper 3513 * value until we call calculate_attr_overrides(). 3514 * 3515 * To fit with our existing code, we override the inherited values and 3516 * specify it here directly, as we did on previous generations. 3517 */ 3518 sbe.VertexURBEntryReadLength = urb_entry_read_length; 3519 sbe.VertexURBEntryReadOffset = urb_entry_read_offset; 3520 sbe.PointSpriteTextureCoordinateEnable = point_sprite_enables; 3521 sbe.ConstantInterpolationEnable = wm_prog_data->flat_inputs; 3522 3523#if GFX_VER >= 8 3524 sbe.ForceVertexURBEntryReadLength = true; 3525 sbe.ForceVertexURBEntryReadOffset = true; 3526#endif 3527 3528#if GFX_VER >= 9 3529 /* prepare the active component dwords */ 3530 for (int i = 0; i < 32; i++) 3531 sbe.AttributeActiveComponentFormat[i] = ACTIVE_COMPONENT_XYZW; 3532#endif 3533 } 3534 3535#if GFX_VER >= 8 3536 brw_batch_emit(brw, GENX(3DSTATE_SBE_SWIZ), sbes) { 3537 for (int i = 0; i < 16; i++) 3538 sbes.Attribute[i] = attr_overrides[i]; 3539 } 3540#endif 3541 3542#undef attr_overrides 3543} 3544 3545static const struct brw_tracked_state genX(sbe_state) = { 3546 .dirty = { 3547 .mesa = _NEW_BUFFERS | 3548 _NEW_LIGHT | 3549 _NEW_POINT | 3550 _NEW_POLYGON | 3551 _NEW_PROGRAM, 3552 .brw = BRW_NEW_BLORP | 3553 BRW_NEW_CONTEXT | 3554 BRW_NEW_FRAGMENT_PROGRAM | 3555 BRW_NEW_FS_PROG_DATA | 3556 BRW_NEW_GS_PROG_DATA | 3557 BRW_NEW_TES_PROG_DATA | 3558 BRW_NEW_VUE_MAP_GEOM_OUT | 3559 (GFX_VER == 7 ? BRW_NEW_PRIMITIVE 3560 : 0), 3561 }, 3562 .emit = genX(upload_sbe), 3563}; 3564#endif 3565 3566/* ---------------------------------------------------------------------- */ 3567 3568#if GFX_VER >= 7 3569/** 3570 * Outputs the 3DSTATE_SO_DECL_LIST command. 3571 * 3572 * The data output is a series of 64-bit entries containing a SO_DECL per 3573 * stream. We only have one stream of rendering coming out of the GS unit, so 3574 * we only emit stream 0 (low 16 bits) SO_DECLs. 3575 */ 3576static void 3577genX(upload_3dstate_so_decl_list)(struct brw_context *brw, 3578 const struct brw_vue_map *vue_map) 3579{ 3580 struct gl_context *ctx = &brw->ctx; 3581 /* BRW_NEW_TRANSFORM_FEEDBACK */ 3582 struct gl_transform_feedback_object *xfb_obj = 3583 ctx->TransformFeedback.CurrentObject; 3584 const struct gl_transform_feedback_info *linked_xfb_info = 3585 xfb_obj->program->sh.LinkedTransformFeedback; 3586 struct GENX(SO_DECL) so_decl[MAX_VERTEX_STREAMS][128]; 3587 int buffer_mask[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; 3588 int next_offset[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; 3589 int decls[MAX_VERTEX_STREAMS] = {0, 0, 0, 0}; 3590 int max_decls = 0; 3591 STATIC_ASSERT(ARRAY_SIZE(so_decl[0]) >= MAX_PROGRAM_OUTPUTS); 3592 3593 memset(so_decl, 0, sizeof(so_decl)); 3594 3595 /* Construct the list of SO_DECLs to be emitted. The formatting of the 3596 * command feels strange -- each dword pair contains a SO_DECL per stream. 3597 */ 3598 for (unsigned i = 0; i < linked_xfb_info->NumOutputs; i++) { 3599 const struct gl_transform_feedback_output *output = 3600 &linked_xfb_info->Outputs[i]; 3601 const int buffer = output->OutputBuffer; 3602 const int varying = output->OutputRegister; 3603 const unsigned stream_id = output->StreamId; 3604 assert(stream_id < MAX_VERTEX_STREAMS); 3605 3606 buffer_mask[stream_id] |= 1 << buffer; 3607 3608 assert(vue_map->varying_to_slot[varying] >= 0); 3609 3610 /* Mesa doesn't store entries for gl_SkipComponents in the Outputs[] 3611 * array. Instead, it simply increments DstOffset for the following 3612 * input by the number of components that should be skipped. 3613 * 3614 * Our hardware is unusual in that it requires us to program SO_DECLs 3615 * for fake "hole" components, rather than simply taking the offset 3616 * for each real varying. Each hole can have size 1, 2, 3, or 4; we 3617 * program as many size = 4 holes as we can, then a final hole to 3618 * accommodate the final 1, 2, or 3 remaining. 3619 */ 3620 int skip_components = output->DstOffset - next_offset[buffer]; 3621 3622 while (skip_components > 0) { 3623 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) { 3624 .HoleFlag = 1, 3625 .OutputBufferSlot = output->OutputBuffer, 3626 .ComponentMask = (1 << MIN2(skip_components, 4)) - 1, 3627 }; 3628 skip_components -= 4; 3629 } 3630 3631 next_offset[buffer] = output->DstOffset + output->NumComponents; 3632 3633 so_decl[stream_id][decls[stream_id]++] = (struct GENX(SO_DECL)) { 3634 .OutputBufferSlot = output->OutputBuffer, 3635 .RegisterIndex = vue_map->varying_to_slot[varying], 3636 .ComponentMask = 3637 ((1 << output->NumComponents) - 1) << output->ComponentOffset, 3638 }; 3639 3640 if (decls[stream_id] > max_decls) 3641 max_decls = decls[stream_id]; 3642 } 3643 3644 uint32_t *dw; 3645 dw = brw_batch_emitn(brw, GENX(3DSTATE_SO_DECL_LIST), 3 + 2 * max_decls, 3646 .StreamtoBufferSelects0 = buffer_mask[0], 3647 .StreamtoBufferSelects1 = buffer_mask[1], 3648 .StreamtoBufferSelects2 = buffer_mask[2], 3649 .StreamtoBufferSelects3 = buffer_mask[3], 3650 .NumEntries0 = decls[0], 3651 .NumEntries1 = decls[1], 3652 .NumEntries2 = decls[2], 3653 .NumEntries3 = decls[3]); 3654 3655 for (int i = 0; i < max_decls; i++) { 3656 GENX(SO_DECL_ENTRY_pack)( 3657 brw, dw + 2 + i * 2, 3658 &(struct GENX(SO_DECL_ENTRY)) { 3659 .Stream0Decl = so_decl[0][i], 3660 .Stream1Decl = so_decl[1][i], 3661 .Stream2Decl = so_decl[2][i], 3662 .Stream3Decl = so_decl[3][i], 3663 }); 3664 } 3665} 3666 3667static void 3668genX(upload_3dstate_so_buffers)(struct brw_context *brw) 3669{ 3670 struct gl_context *ctx = &brw->ctx; 3671 /* BRW_NEW_TRANSFORM_FEEDBACK */ 3672 struct gl_transform_feedback_object *xfb_obj = 3673 ctx->TransformFeedback.CurrentObject; 3674#if GFX_VER < 8 3675 const struct gl_transform_feedback_info *linked_xfb_info = 3676 xfb_obj->program->sh.LinkedTransformFeedback; 3677#else 3678 struct brw_transform_feedback_object *brw_obj = 3679 (struct brw_transform_feedback_object *) xfb_obj; 3680 uint32_t mocs_wb = GFX_VER >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB; 3681#endif 3682 3683 /* Set up the up to 4 output buffers. These are the ranges defined in the 3684 * gl_transform_feedback_object. 3685 */ 3686 for (int i = 0; i < 4; i++) { 3687 struct brw_buffer_object *bufferobj = 3688 brw_buffer_object(xfb_obj->Buffers[i]); 3689 uint32_t start = xfb_obj->Offset[i]; 3690 uint32_t end = ALIGN(start + xfb_obj->Size[i], 4); 3691 uint32_t const size = end - start; 3692 3693 if (!bufferobj || !size) { 3694 brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) { 3695 sob.SOBufferIndex = i; 3696 } 3697 continue; 3698 } 3699 3700 assert(start % 4 == 0); 3701 struct brw_bo *bo = 3702 brw_bufferobj_buffer(brw, bufferobj, start, size, true); 3703 assert(end <= bo->size); 3704 3705 brw_batch_emit(brw, GENX(3DSTATE_SO_BUFFER), sob) { 3706 sob.SOBufferIndex = i; 3707 3708 sob.SurfaceBaseAddress = rw_bo(bo, start); 3709#if GFX_VER < 8 3710 sob.SurfacePitch = linked_xfb_info->Buffers[i].Stride * 4; 3711 sob.SurfaceEndAddress = rw_bo(bo, end); 3712#else 3713 sob.SOBufferEnable = true; 3714 sob.StreamOffsetWriteEnable = true; 3715 sob.StreamOutputBufferOffsetAddressEnable = true; 3716 sob.MOCS = mocs_wb; 3717 3718 sob.SurfaceSize = MAX2(xfb_obj->Size[i] / 4, 1) - 1; 3719 sob.StreamOutputBufferOffsetAddress = 3720 rw_bo(brw_obj->offset_bo, i * sizeof(uint32_t)); 3721 3722 if (brw_obj->zero_offsets) { 3723 /* Zero out the offset and write that to offset_bo */ 3724 sob.StreamOffset = 0; 3725 } else { 3726 /* Use offset_bo as the "Stream Offset." */ 3727 sob.StreamOffset = 0xFFFFFFFF; 3728 } 3729#endif 3730 } 3731 } 3732 3733#if GFX_VER >= 8 3734 brw_obj->zero_offsets = false; 3735#endif 3736} 3737 3738static bool 3739query_active(struct gl_query_object *q) 3740{ 3741 return q && q->Active; 3742} 3743 3744static void 3745genX(upload_3dstate_streamout)(struct brw_context *brw, bool active, 3746 const struct brw_vue_map *vue_map) 3747{ 3748 struct gl_context *ctx = &brw->ctx; 3749 /* BRW_NEW_TRANSFORM_FEEDBACK */ 3750 struct gl_transform_feedback_object *xfb_obj = 3751 ctx->TransformFeedback.CurrentObject; 3752 3753 brw_batch_emit(brw, GENX(3DSTATE_STREAMOUT), sos) { 3754 if (active) { 3755 int urb_entry_read_offset = 0; 3756 int urb_entry_read_length = (vue_map->num_slots + 1) / 2 - 3757 urb_entry_read_offset; 3758 3759 sos.SOFunctionEnable = true; 3760 sos.SOStatisticsEnable = true; 3761 3762 /* BRW_NEW_RASTERIZER_DISCARD */ 3763 if (ctx->RasterDiscard) { 3764 if (!query_active(ctx->Query.PrimitivesGenerated[0])) { 3765 sos.RenderingDisable = true; 3766 } else { 3767 perf_debug("Rasterizer discard with a GL_PRIMITIVES_GENERATED " 3768 "query active relies on the clipper.\n"); 3769 } 3770 } 3771 3772 /* _NEW_LIGHT */ 3773 if (ctx->Light.ProvokingVertex != GL_FIRST_VERTEX_CONVENTION) 3774 sos.ReorderMode = TRAILING; 3775 3776#if GFX_VER < 8 3777 sos.SOBufferEnable0 = xfb_obj->Buffers[0] != NULL; 3778 sos.SOBufferEnable1 = xfb_obj->Buffers[1] != NULL; 3779 sos.SOBufferEnable2 = xfb_obj->Buffers[2] != NULL; 3780 sos.SOBufferEnable3 = xfb_obj->Buffers[3] != NULL; 3781#else 3782 const struct gl_transform_feedback_info *linked_xfb_info = 3783 xfb_obj->program->sh.LinkedTransformFeedback; 3784 /* Set buffer pitches; 0 means unbound. */ 3785 if (xfb_obj->Buffers[0]) 3786 sos.Buffer0SurfacePitch = linked_xfb_info->Buffers[0].Stride * 4; 3787 if (xfb_obj->Buffers[1]) 3788 sos.Buffer1SurfacePitch = linked_xfb_info->Buffers[1].Stride * 4; 3789 if (xfb_obj->Buffers[2]) 3790 sos.Buffer2SurfacePitch = linked_xfb_info->Buffers[2].Stride * 4; 3791 if (xfb_obj->Buffers[3]) 3792 sos.Buffer3SurfacePitch = linked_xfb_info->Buffers[3].Stride * 4; 3793#endif 3794 3795 /* We always read the whole vertex. This could be reduced at some 3796 * point by reading less and offsetting the register index in the 3797 * SO_DECLs. 3798 */ 3799 sos.Stream0VertexReadOffset = urb_entry_read_offset; 3800 sos.Stream0VertexReadLength = urb_entry_read_length - 1; 3801 sos.Stream1VertexReadOffset = urb_entry_read_offset; 3802 sos.Stream1VertexReadLength = urb_entry_read_length - 1; 3803 sos.Stream2VertexReadOffset = urb_entry_read_offset; 3804 sos.Stream2VertexReadLength = urb_entry_read_length - 1; 3805 sos.Stream3VertexReadOffset = urb_entry_read_offset; 3806 sos.Stream3VertexReadLength = urb_entry_read_length - 1; 3807 } 3808 } 3809} 3810 3811static void 3812genX(upload_sol)(struct brw_context *brw) 3813{ 3814 struct gl_context *ctx = &brw->ctx; 3815 /* BRW_NEW_TRANSFORM_FEEDBACK */ 3816 bool active = _mesa_is_xfb_active_and_unpaused(ctx); 3817 3818 if (active) { 3819 genX(upload_3dstate_so_buffers)(brw); 3820 3821 /* BRW_NEW_VUE_MAP_GEOM_OUT */ 3822 genX(upload_3dstate_so_decl_list)(brw, &brw->vue_map_geom_out); 3823 } 3824 3825 /* Finally, set up the SOL stage. This command must always follow updates to 3826 * the nonpipelined SOL state (3DSTATE_SO_BUFFER, 3DSTATE_SO_DECL_LIST) or 3827 * MMIO register updates (current performed by the kernel at each batch 3828 * emit). 3829 */ 3830 genX(upload_3dstate_streamout)(brw, active, &brw->vue_map_geom_out); 3831} 3832 3833static const struct brw_tracked_state genX(sol_state) = { 3834 .dirty = { 3835 .mesa = _NEW_LIGHT, 3836 .brw = BRW_NEW_BATCH | 3837 BRW_NEW_BLORP | 3838 BRW_NEW_RASTERIZER_DISCARD | 3839 BRW_NEW_VUE_MAP_GEOM_OUT | 3840 BRW_NEW_TRANSFORM_FEEDBACK, 3841 }, 3842 .emit = genX(upload_sol), 3843}; 3844#endif 3845 3846/* ---------------------------------------------------------------------- */ 3847 3848#if GFX_VER >= 7 3849static void 3850genX(upload_ps)(struct brw_context *brw) 3851{ 3852 UNUSED const struct gl_context *ctx = &brw->ctx; 3853 UNUSED const struct intel_device_info *devinfo = &brw->screen->devinfo; 3854 3855 /* BRW_NEW_FS_PROG_DATA */ 3856 const struct brw_wm_prog_data *prog_data = 3857 brw_wm_prog_data(brw->wm.base.prog_data); 3858 const struct brw_stage_state *stage_state = &brw->wm.base; 3859 3860#if GFX_VER < 8 3861#endif 3862 3863 brw_batch_emit(brw, GENX(3DSTATE_PS), ps) { 3864 /* Initialize the execution mask with VMask. Otherwise, derivatives are 3865 * incorrect for subspans where some of the pixels are unlit. We believe 3866 * the bit just didn't take effect in previous generations. 3867 */ 3868 ps.VectorMaskEnable = GFX_VER >= 8; 3869 3870 /* Wa_1606682166: 3871 * "Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes. 3872 * Disable the Sampler state prefetch functionality in the SARB by 3873 * programming 0xB000[30] to '1'." 3874 */ 3875 ps.SamplerCount = GFX_VER == 11 ? 3876 0 : DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4); 3877 3878 /* BRW_NEW_FS_PROG_DATA */ 3879 ps.BindingTableEntryCount = prog_data->base.binding_table.size_bytes / 4; 3880 3881 if (prog_data->base.use_alt_mode) 3882 ps.FloatingPointMode = Alternate; 3883 3884 /* Haswell requires the sample mask to be set in this packet as well as 3885 * in 3DSTATE_SAMPLE_MASK; the values should match. 3886 */ 3887 3888 /* _NEW_BUFFERS, _NEW_MULTISAMPLE */ 3889#if GFX_VERx10 == 75 3890 ps.SampleMask = genX(determine_sample_mask(brw)); 3891#endif 3892 3893 /* 3DSTATE_PS expects the number of threads per PSD, which is always 64 3894 * for pre Gfx11 and 128 for gfx11+; On gfx11+ If a programmed value is 3895 * k, it implies 2(k+1) threads. It implicitly scales for different GT 3896 * levels (which have some # of PSDs). 3897 * 3898 * In Gfx8 the format is U8-2 whereas in Gfx9+ it is U9-1. 3899 */ 3900#if GFX_VER >= 9 3901 ps.MaximumNumberofThreadsPerPSD = 64 - 1; 3902#elif GFX_VER >= 8 3903 ps.MaximumNumberofThreadsPerPSD = 64 - 2; 3904#else 3905 ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1; 3906#endif 3907 3908 if (prog_data->base.nr_params > 0 || 3909 prog_data->base.ubo_ranges[0].length > 0) 3910 ps.PushConstantEnable = true; 3911 3912#if GFX_VER < 8 3913 /* From the IVB PRM, volume 2 part 1, page 287: 3914 * "This bit is inserted in the PS payload header and made available to 3915 * the DataPort (either via the message header or via header bypass) to 3916 * indicate that oMask data (one or two phases) is included in Render 3917 * Target Write messages. If present, the oMask data is used to mask off 3918 * samples." 3919 */ 3920 ps.oMaskPresenttoRenderTarget = prog_data->uses_omask; 3921 3922 /* The hardware wedges if you have this bit set but don't turn on any 3923 * dual source blend factors. 3924 * 3925 * BRW_NEW_FS_PROG_DATA | _NEW_COLOR 3926 */ 3927 ps.DualSourceBlendEnable = prog_data->dual_src_blend && 3928 (ctx->Color.BlendEnabled & 1) && 3929 ctx->Color._BlendUsesDualSrc & 0x1; 3930 3931 /* BRW_NEW_FS_PROG_DATA */ 3932 ps.AttributeEnable = (prog_data->num_varying_inputs != 0); 3933#endif 3934 3935 /* From the documentation for this packet: 3936 * "If the PS kernel does not need the Position XY Offsets to 3937 * compute a Position Value, then this field should be programmed 3938 * to POSOFFSET_NONE." 3939 * 3940 * "SW Recommendation: If the PS kernel needs the Position Offsets 3941 * to compute a Position XY value, this field should match Position 3942 * ZW Interpolation Mode to ensure a consistent position.xyzw 3943 * computation." 3944 * 3945 * We only require XY sample offsets. So, this recommendation doesn't 3946 * look useful at the moment. We might need this in future. 3947 */ 3948 if (prog_data->uses_pos_offset) 3949 ps.PositionXYOffsetSelect = POSOFFSET_SAMPLE; 3950 else 3951 ps.PositionXYOffsetSelect = POSOFFSET_NONE; 3952 3953 ps._8PixelDispatchEnable = prog_data->dispatch_8; 3954 ps._16PixelDispatchEnable = prog_data->dispatch_16; 3955 ps._32PixelDispatchEnable = prog_data->dispatch_32; 3956 3957 /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable: 3958 * 3959 * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32 3960 * Dispatch must not be enabled for PER_PIXEL dispatch mode." 3961 * 3962 * Since 16x MSAA is first introduced on SKL, we don't need to apply 3963 * the workaround on any older hardware. 3964 * 3965 * BRW_NEW_NUM_SAMPLES 3966 */ 3967 if (GFX_VER >= 9 && !prog_data->persample_dispatch && 3968 brw->num_samples == 16) { 3969 assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable); 3970 ps._32PixelDispatchEnable = false; 3971 } 3972 3973 ps.DispatchGRFStartRegisterForConstantSetupData0 = 3974 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 0); 3975 ps.DispatchGRFStartRegisterForConstantSetupData1 = 3976 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 1); 3977 ps.DispatchGRFStartRegisterForConstantSetupData2 = 3978 brw_wm_prog_data_dispatch_grf_start_reg(prog_data, ps, 2); 3979 3980 ps.KernelStartPointer0 = stage_state->prog_offset + 3981 brw_wm_prog_data_prog_offset(prog_data, ps, 0); 3982 ps.KernelStartPointer1 = stage_state->prog_offset + 3983 brw_wm_prog_data_prog_offset(prog_data, ps, 1); 3984 ps.KernelStartPointer2 = stage_state->prog_offset + 3985 brw_wm_prog_data_prog_offset(prog_data, ps, 2); 3986 3987 if (prog_data->base.total_scratch) { 3988 ps.ScratchSpaceBasePointer = 3989 rw_32_bo(stage_state->scratch_bo, 3990 ffs(stage_state->per_thread_scratch) - 11); 3991 } 3992 } 3993} 3994 3995static const struct brw_tracked_state genX(ps_state) = { 3996 .dirty = { 3997 .mesa = _NEW_MULTISAMPLE | 3998 (GFX_VER < 8 ? _NEW_BUFFERS | 3999 _NEW_COLOR 4000 : 0), 4001 .brw = BRW_NEW_BATCH | 4002 BRW_NEW_BLORP | 4003 BRW_NEW_FS_PROG_DATA | 4004 (GFX_VER >= 9 ? BRW_NEW_NUM_SAMPLES : 0), 4005 }, 4006 .emit = genX(upload_ps), 4007}; 4008#endif 4009 4010/* ---------------------------------------------------------------------- */ 4011 4012#if GFX_VER >= 7 4013static void 4014genX(upload_hs_state)(struct brw_context *brw) 4015{ 4016 const struct intel_device_info *devinfo = &brw->screen->devinfo; 4017 struct brw_stage_state *stage_state = &brw->tcs.base; 4018 struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data; 4019 const struct brw_vue_prog_data *vue_prog_data = 4020 brw_vue_prog_data(stage_prog_data); 4021 4022 /* BRW_NEW_TES_PROG_DATA */ 4023 struct brw_tcs_prog_data *tcs_prog_data = 4024 brw_tcs_prog_data(stage_prog_data); 4025 4026 if (!tcs_prog_data) { 4027 brw_batch_emit(brw, GENX(3DSTATE_HS), hs); 4028 } else { 4029 brw_batch_emit(brw, GENX(3DSTATE_HS), hs) { 4030 INIT_THREAD_DISPATCH_FIELDS(hs, Vertex); 4031 4032 hs.InstanceCount = tcs_prog_data->instances - 1; 4033 hs.IncludeVertexHandles = true; 4034 4035 hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1; 4036 4037#if GFX_VER >= 9 4038 hs.DispatchMode = vue_prog_data->dispatch_mode; 4039 hs.IncludePrimitiveID = tcs_prog_data->include_primitive_id; 4040#endif 4041 } 4042 } 4043} 4044 4045static const struct brw_tracked_state genX(hs_state) = { 4046 .dirty = { 4047 .mesa = 0, 4048 .brw = BRW_NEW_BATCH | 4049 BRW_NEW_BLORP | 4050 BRW_NEW_TCS_PROG_DATA | 4051 BRW_NEW_TESS_PROGRAMS, 4052 }, 4053 .emit = genX(upload_hs_state), 4054}; 4055 4056static void 4057genX(upload_ds_state)(struct brw_context *brw) 4058{ 4059 const struct intel_device_info *devinfo = &brw->screen->devinfo; 4060 const struct brw_stage_state *stage_state = &brw->tes.base; 4061 struct brw_stage_prog_data *stage_prog_data = stage_state->prog_data; 4062 4063 /* BRW_NEW_TES_PROG_DATA */ 4064 const struct brw_tes_prog_data *tes_prog_data = 4065 brw_tes_prog_data(stage_prog_data); 4066 const struct brw_vue_prog_data *vue_prog_data = 4067 brw_vue_prog_data(stage_prog_data); 4068 4069 if (!tes_prog_data) { 4070 brw_batch_emit(brw, GENX(3DSTATE_DS), ds); 4071 } else { 4072 assert(GFX_VER < 11 || 4073 vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8); 4074 4075 brw_batch_emit(brw, GENX(3DSTATE_DS), ds) { 4076 INIT_THREAD_DISPATCH_FIELDS(ds, Patch); 4077 4078 ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1; 4079 ds.ComputeWCoordinateEnable = 4080 tes_prog_data->domain == BRW_TESS_DOMAIN_TRI; 4081 4082#if GFX_VER >= 8 4083 if (vue_prog_data->dispatch_mode == DISPATCH_MODE_SIMD8) 4084 ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH; 4085 ds.UserClipDistanceCullTestEnableBitmask = 4086 vue_prog_data->cull_distance_mask; 4087#endif 4088 } 4089 } 4090} 4091 4092static const struct brw_tracked_state genX(ds_state) = { 4093 .dirty = { 4094 .mesa = 0, 4095 .brw = BRW_NEW_BATCH | 4096 BRW_NEW_BLORP | 4097 BRW_NEW_TESS_PROGRAMS | 4098 BRW_NEW_TES_PROG_DATA, 4099 }, 4100 .emit = genX(upload_ds_state), 4101}; 4102 4103/* ---------------------------------------------------------------------- */ 4104 4105static void 4106upload_te_state(struct brw_context *brw) 4107{ 4108 /* BRW_NEW_TESS_PROGRAMS */ 4109 bool active = brw->programs[MESA_SHADER_TESS_EVAL]; 4110 4111 /* BRW_NEW_TES_PROG_DATA */ 4112 const struct brw_tes_prog_data *tes_prog_data = 4113 brw_tes_prog_data(brw->tes.base.prog_data); 4114 4115 if (active) { 4116 brw_batch_emit(brw, GENX(3DSTATE_TE), te) { 4117 te.Partitioning = tes_prog_data->partitioning; 4118 te.OutputTopology = tes_prog_data->output_topology; 4119 te.TEDomain = tes_prog_data->domain; 4120 te.TEEnable = true; 4121 te.MaximumTessellationFactorOdd = 63.0; 4122 te.MaximumTessellationFactorNotOdd = 64.0; 4123 } 4124 } else { 4125 brw_batch_emit(brw, GENX(3DSTATE_TE), te); 4126 } 4127} 4128 4129static const struct brw_tracked_state genX(te_state) = { 4130 .dirty = { 4131 .mesa = 0, 4132 .brw = BRW_NEW_BLORP | 4133 BRW_NEW_CONTEXT | 4134 BRW_NEW_TES_PROG_DATA | 4135 BRW_NEW_TESS_PROGRAMS, 4136 }, 4137 .emit = upload_te_state, 4138}; 4139 4140/* ---------------------------------------------------------------------- */ 4141 4142static void 4143genX(upload_tes_push_constants)(struct brw_context *brw) 4144{ 4145 struct brw_stage_state *stage_state = &brw->tes.base; 4146 /* BRW_NEW_TESS_PROGRAMS */ 4147 const struct gl_program *tep = brw->programs[MESA_SHADER_TESS_EVAL]; 4148 4149 /* BRW_NEW_TES_PROG_DATA */ 4150 const struct brw_stage_prog_data *prog_data = brw->tes.base.prog_data; 4151 gfx6_upload_push_constants(brw, tep, prog_data, stage_state); 4152} 4153 4154static const struct brw_tracked_state genX(tes_push_constants) = { 4155 .dirty = { 4156 .mesa = _NEW_PROGRAM_CONSTANTS, 4157 .brw = BRW_NEW_BATCH | 4158 BRW_NEW_BLORP | 4159 BRW_NEW_TESS_PROGRAMS | 4160 BRW_NEW_TES_PROG_DATA, 4161 }, 4162 .emit = genX(upload_tes_push_constants), 4163}; 4164 4165static void 4166genX(upload_tcs_push_constants)(struct brw_context *brw) 4167{ 4168 struct brw_stage_state *stage_state = &brw->tcs.base; 4169 /* BRW_NEW_TESS_PROGRAMS */ 4170 const struct gl_program *tcp = brw->programs[MESA_SHADER_TESS_CTRL]; 4171 4172 /* BRW_NEW_TCS_PROG_DATA */ 4173 const struct brw_stage_prog_data *prog_data = brw->tcs.base.prog_data; 4174 4175 gfx6_upload_push_constants(brw, tcp, prog_data, stage_state); 4176} 4177 4178static const struct brw_tracked_state genX(tcs_push_constants) = { 4179 .dirty = { 4180 .mesa = _NEW_PROGRAM_CONSTANTS, 4181 .brw = BRW_NEW_BATCH | 4182 BRW_NEW_BLORP | 4183 BRW_NEW_DEFAULT_TESS_LEVELS | 4184 BRW_NEW_TESS_PROGRAMS | 4185 BRW_NEW_TCS_PROG_DATA, 4186 }, 4187 .emit = genX(upload_tcs_push_constants), 4188}; 4189 4190#endif 4191 4192/* ---------------------------------------------------------------------- */ 4193 4194#if GFX_VER >= 7 4195static void 4196genX(upload_cs_push_constants)(struct brw_context *brw) 4197{ 4198 struct brw_stage_state *stage_state = &brw->cs.base; 4199 4200 /* BRW_NEW_COMPUTE_PROGRAM */ 4201 const struct gl_program *cp = brw->programs[MESA_SHADER_COMPUTE]; 4202 4203 if (cp) { 4204 /* BRW_NEW_CS_PROG_DATA */ 4205 struct brw_cs_prog_data *cs_prog_data = 4206 brw_cs_prog_data(brw->cs.base.prog_data); 4207 4208 _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE); 4209 brw_upload_cs_push_constants(brw, cp, cs_prog_data, stage_state); 4210 } 4211} 4212 4213const struct brw_tracked_state genX(cs_push_constants) = { 4214 .dirty = { 4215 .mesa = _NEW_PROGRAM_CONSTANTS, 4216 .brw = BRW_NEW_BATCH | 4217 BRW_NEW_BLORP | 4218 BRW_NEW_COMPUTE_PROGRAM | 4219 BRW_NEW_CS_PROG_DATA, 4220 }, 4221 .emit = genX(upload_cs_push_constants), 4222}; 4223 4224/** 4225 * Creates a new CS constant buffer reflecting the current CS program's 4226 * constants, if needed by the CS program. 4227 */ 4228static void 4229genX(upload_cs_pull_constants)(struct brw_context *brw) 4230{ 4231 struct brw_stage_state *stage_state = &brw->cs.base; 4232 4233 /* BRW_NEW_COMPUTE_PROGRAM */ 4234 struct brw_program *cp = 4235 (struct brw_program *) brw->programs[MESA_SHADER_COMPUTE]; 4236 4237 /* BRW_NEW_CS_PROG_DATA */ 4238 const struct brw_stage_prog_data *prog_data = brw->cs.base.prog_data; 4239 4240 _mesa_shader_write_subroutine_indices(&brw->ctx, MESA_SHADER_COMPUTE); 4241 /* _NEW_PROGRAM_CONSTANTS */ 4242 brw_upload_pull_constants(brw, BRW_NEW_SURFACES, &cp->program, 4243 stage_state, prog_data); 4244} 4245 4246const struct brw_tracked_state genX(cs_pull_constants) = { 4247 .dirty = { 4248 .mesa = _NEW_PROGRAM_CONSTANTS, 4249 .brw = BRW_NEW_BATCH | 4250 BRW_NEW_BLORP | 4251 BRW_NEW_COMPUTE_PROGRAM | 4252 BRW_NEW_CS_PROG_DATA, 4253 }, 4254 .emit = genX(upload_cs_pull_constants), 4255}; 4256 4257static void 4258genX(upload_cs_state)(struct brw_context *brw) 4259{ 4260 if (!brw->cs.base.prog_data) 4261 return; 4262 4263 uint32_t offset; 4264 uint32_t *desc = (uint32_t*) brw_state_batch( 4265 brw, GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t), 64, 4266 &offset); 4267 4268 struct brw_stage_state *stage_state = &brw->cs.base; 4269 struct brw_stage_prog_data *prog_data = stage_state->prog_data; 4270 struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); 4271 const struct intel_device_info *devinfo = &brw->screen->devinfo; 4272 4273 const struct brw_cs_dispatch_info dispatch = 4274 brw_cs_get_dispatch_info(devinfo, cs_prog_data, brw->compute.group_size); 4275 4276 if (INTEL_DEBUG(DEBUG_SHADER_TIME)) { 4277 brw_emit_buffer_surface_state( 4278 brw, &stage_state->surf_offset[ 4279 prog_data->binding_table.shader_time_start], 4280 brw->shader_time.bo, 0, ISL_FORMAT_RAW, 4281 brw->shader_time.bo->size, 1, 4282 RELOC_WRITE); 4283 } 4284 4285 uint32_t *bind = brw_state_batch(brw, prog_data->binding_table.size_bytes, 4286 32, &stage_state->bind_bo_offset); 4287 4288 /* The MEDIA_VFE_STATE documentation for Gfx8+ says: 4289 * 4290 * "A stalling PIPE_CONTROL is required before MEDIA_VFE_STATE unless 4291 * the only bits that are changed are scoreboard related: Scoreboard 4292 * Enable, Scoreboard Type, Scoreboard Mask, Scoreboard * Delta. For 4293 * these scoreboard related states, a MEDIA_STATE_FLUSH is sufficient." 4294 * 4295 * Earlier generations say "MI_FLUSH" instead of "stalling PIPE_CONTROL", 4296 * but MI_FLUSH isn't really a thing, so we assume they meant PIPE_CONTROL. 4297 */ 4298 brw_emit_pipe_control_flush(brw, PIPE_CONTROL_CS_STALL); 4299 4300 brw_batch_emit(brw, GENX(MEDIA_VFE_STATE), vfe) { 4301 if (prog_data->total_scratch) { 4302 uint32_t per_thread_scratch_value; 4303 4304 if (GFX_VER >= 8) { 4305 /* Broadwell's Per Thread Scratch Space is in the range [0, 11] 4306 * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M. 4307 */ 4308 per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 11; 4309 } else if (GFX_VERx10 == 75) { 4310 /* Haswell's Per Thread Scratch Space is in the range [0, 10] 4311 * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M. 4312 */ 4313 per_thread_scratch_value = ffs(stage_state->per_thread_scratch) - 12; 4314 } else { 4315 /* Earlier platforms use the range [0, 11] to mean [1kB, 12kB] 4316 * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB. 4317 */ 4318 per_thread_scratch_value = stage_state->per_thread_scratch / 1024 - 1; 4319 } 4320 vfe.ScratchSpaceBasePointer = rw_32_bo(stage_state->scratch_bo, 0); 4321 vfe.PerThreadScratchSpace = per_thread_scratch_value; 4322 } 4323 4324 vfe.MaximumNumberofThreads = 4325 devinfo->max_cs_threads * devinfo->subslice_total - 1; 4326 vfe.NumberofURBEntries = GFX_VER >= 8 ? 2 : 0; 4327#if GFX_VER < 11 4328 vfe.ResetGatewayTimer = 4329 Resettingrelativetimerandlatchingtheglobaltimestamp; 4330#endif 4331#if GFX_VER < 9 4332 vfe.BypassGatewayControl = BypassingOpenGatewayCloseGatewayprotocol; 4333#endif 4334#if GFX_VER == 7 4335 vfe.GPGPUMode = true; 4336#endif 4337 4338 /* We are uploading duplicated copies of push constant uniforms for each 4339 * thread. Although the local id data needs to vary per thread, it won't 4340 * change for other uniform data. Unfortunately this duplication is 4341 * required for gfx7. As of Haswell, this duplication can be avoided, 4342 * but this older mechanism with duplicated data continues to work. 4343 * 4344 * FINISHME: As of Haswell, we could make use of the 4345 * INTERFACE_DESCRIPTOR_DATA "Cross-Thread Constant Data Read Length" 4346 * field to only store one copy of uniform data. 4347 * 4348 * FINISHME: Broadwell adds a new alternative "Indirect Payload Storage" 4349 * which is described in the GPGPU_WALKER command and in the Broadwell 4350 * PRM Volume 7: 3D Media GPGPU, under Media GPGPU Pipeline => Mode of 4351 * Operations => GPGPU Mode => Indirect Payload Storage. 4352 * 4353 * Note: The constant data is built in brw_upload_cs_push_constants 4354 * below. 4355 */ 4356 vfe.URBEntryAllocationSize = GFX_VER >= 8 ? 2 : 0; 4357 4358 const uint32_t vfe_curbe_allocation = 4359 ALIGN(cs_prog_data->push.per_thread.regs * dispatch.threads + 4360 cs_prog_data->push.cross_thread.regs, 2); 4361 vfe.CURBEAllocationSize = vfe_curbe_allocation; 4362 } 4363 4364 const unsigned push_const_size = 4365 brw_cs_push_const_total_size(cs_prog_data, dispatch.threads); 4366 if (push_const_size > 0) { 4367 brw_batch_emit(brw, GENX(MEDIA_CURBE_LOAD), curbe) { 4368 curbe.CURBETotalDataLength = ALIGN(push_const_size, 64); 4369 curbe.CURBEDataStartAddress = stage_state->push_const_offset; 4370 } 4371 } 4372 4373 /* BRW_NEW_SURFACES and BRW_NEW_*_CONSTBUF */ 4374 memcpy(bind, stage_state->surf_offset, 4375 prog_data->binding_table.size_bytes); 4376 const uint64_t ksp = brw->cs.base.prog_offset + 4377 brw_cs_prog_data_prog_offset(cs_prog_data, 4378 dispatch.simd_size); 4379 const struct GENX(INTERFACE_DESCRIPTOR_DATA) idd = { 4380 .KernelStartPointer = ksp, 4381 .SamplerStatePointer = stage_state->sampler_offset, 4382 /* Wa_1606682166 */ 4383 .SamplerCount = GFX_VER == 11 ? 0 : 4384 DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), 4385 .BindingTablePointer = stage_state->bind_bo_offset, 4386 .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs, 4387 .NumberofThreadsinGPGPUThreadGroup = dispatch.threads, 4388 .SharedLocalMemorySize = encode_slm_size(GFX_VER, 4389 prog_data->total_shared), 4390 .BarrierEnable = cs_prog_data->uses_barrier, 4391#if GFX_VERx10 >= 75 4392 .CrossThreadConstantDataReadLength = 4393 cs_prog_data->push.cross_thread.regs, 4394#endif 4395 }; 4396 4397 GENX(INTERFACE_DESCRIPTOR_DATA_pack)(brw, desc, &idd); 4398 4399 brw_batch_emit(brw, GENX(MEDIA_INTERFACE_DESCRIPTOR_LOAD), load) { 4400 load.InterfaceDescriptorTotalLength = 4401 GENX(INTERFACE_DESCRIPTOR_DATA_length) * sizeof(uint32_t); 4402 load.InterfaceDescriptorDataStartAddress = offset; 4403 } 4404} 4405 4406static const struct brw_tracked_state genX(cs_state) = { 4407 .dirty = { 4408 .mesa = _NEW_PROGRAM_CONSTANTS, 4409 .brw = BRW_NEW_BATCH | 4410 BRW_NEW_BLORP | 4411 BRW_NEW_CS_PROG_DATA | 4412 BRW_NEW_SAMPLER_STATE_TABLE | 4413 BRW_NEW_SURFACES, 4414 }, 4415 .emit = genX(upload_cs_state) 4416}; 4417 4418#define GPGPU_DISPATCHDIMX 0x2500 4419#define GPGPU_DISPATCHDIMY 0x2504 4420#define GPGPU_DISPATCHDIMZ 0x2508 4421 4422#define MI_PREDICATE_SRC0 0x2400 4423#define MI_PREDICATE_SRC1 0x2408 4424 4425static void 4426prepare_indirect_gpgpu_walker(struct brw_context *brw) 4427{ 4428 GLintptr indirect_offset = brw->compute.num_work_groups_offset; 4429 struct brw_bo *bo = brw->compute.num_work_groups_bo; 4430 4431 emit_lrm(brw, GPGPU_DISPATCHDIMX, ro_bo(bo, indirect_offset + 0)); 4432 emit_lrm(brw, GPGPU_DISPATCHDIMY, ro_bo(bo, indirect_offset + 4)); 4433 emit_lrm(brw, GPGPU_DISPATCHDIMZ, ro_bo(bo, indirect_offset + 8)); 4434 4435#if GFX_VER <= 7 4436 /* Clear upper 32-bits of SRC0 and all 64-bits of SRC1 */ 4437 emit_lri(brw, MI_PREDICATE_SRC0 + 4, 0); 4438 emit_lri(brw, MI_PREDICATE_SRC1 , 0); 4439 emit_lri(brw, MI_PREDICATE_SRC1 + 4, 0); 4440 4441 /* Load compute_dispatch_indirect_x_size into SRC0 */ 4442 emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 0)); 4443 4444 /* predicate = (compute_dispatch_indirect_x_size == 0); */ 4445 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) { 4446 mip.LoadOperation = LOAD_LOAD; 4447 mip.CombineOperation = COMBINE_SET; 4448 mip.CompareOperation = COMPARE_SRCS_EQUAL; 4449 } 4450 4451 /* Load compute_dispatch_indirect_y_size into SRC0 */ 4452 emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 4)); 4453 4454 /* predicate |= (compute_dispatch_indirect_y_size == 0); */ 4455 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) { 4456 mip.LoadOperation = LOAD_LOAD; 4457 mip.CombineOperation = COMBINE_OR; 4458 mip.CompareOperation = COMPARE_SRCS_EQUAL; 4459 } 4460 4461 /* Load compute_dispatch_indirect_z_size into SRC0 */ 4462 emit_lrm(brw, MI_PREDICATE_SRC0, ro_bo(bo, indirect_offset + 8)); 4463 4464 /* predicate |= (compute_dispatch_indirect_z_size == 0); */ 4465 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) { 4466 mip.LoadOperation = LOAD_LOAD; 4467 mip.CombineOperation = COMBINE_OR; 4468 mip.CompareOperation = COMPARE_SRCS_EQUAL; 4469 } 4470 4471 /* predicate = !predicate; */ 4472#define COMPARE_FALSE 1 4473 brw_batch_emit(brw, GENX(MI_PREDICATE), mip) { 4474 mip.LoadOperation = LOAD_LOADINV; 4475 mip.CombineOperation = COMBINE_OR; 4476 mip.CompareOperation = COMPARE_FALSE; 4477 } 4478#endif 4479} 4480 4481static void 4482genX(emit_gpgpu_walker)(struct brw_context *brw) 4483{ 4484 const GLuint *num_groups = brw->compute.num_work_groups; 4485 4486 bool indirect = brw->compute.num_work_groups_bo != NULL; 4487 if (indirect) 4488 prepare_indirect_gpgpu_walker(brw); 4489 4490 const struct brw_cs_dispatch_info dispatch = 4491 brw_cs_get_dispatch_info(&brw->screen->devinfo, 4492 brw_cs_prog_data(brw->cs.base.prog_data), 4493 brw->compute.group_size); 4494 4495 brw_batch_emit(brw, GENX(GPGPU_WALKER), ggw) { 4496 ggw.IndirectParameterEnable = indirect; 4497 ggw.PredicateEnable = GFX_VER <= 7 && indirect; 4498 ggw.SIMDSize = dispatch.simd_size / 16; 4499 ggw.ThreadDepthCounterMaximum = 0; 4500 ggw.ThreadHeightCounterMaximum = 0; 4501 ggw.ThreadWidthCounterMaximum = dispatch.threads - 1; 4502 ggw.ThreadGroupIDXDimension = num_groups[0]; 4503 ggw.ThreadGroupIDYDimension = num_groups[1]; 4504 ggw.ThreadGroupIDZDimension = num_groups[2]; 4505 ggw.RightExecutionMask = dispatch.right_mask; 4506 ggw.BottomExecutionMask = 0xffffffff; 4507 } 4508 4509 brw_batch_emit(brw, GENX(MEDIA_STATE_FLUSH), msf); 4510} 4511 4512#endif 4513 4514/* ---------------------------------------------------------------------- */ 4515 4516#if GFX_VER >= 8 4517static void 4518genX(upload_raster)(struct brw_context *brw) 4519{ 4520 const struct gl_context *ctx = &brw->ctx; 4521 4522 /* _NEW_BUFFERS */ 4523 const bool flip_y = ctx->DrawBuffer->FlipY; 4524 4525 /* _NEW_POLYGON */ 4526 const struct gl_polygon_attrib *polygon = &ctx->Polygon; 4527 4528 /* _NEW_POINT */ 4529 const struct gl_point_attrib *point = &ctx->Point; 4530 4531 brw_batch_emit(brw, GENX(3DSTATE_RASTER), raster) { 4532 if (brw->polygon_front_bit != flip_y) 4533 raster.FrontWinding = CounterClockwise; 4534 4535 if (polygon->CullFlag) { 4536 switch (polygon->CullFaceMode) { 4537 case GL_FRONT: 4538 raster.CullMode = CULLMODE_FRONT; 4539 break; 4540 case GL_BACK: 4541 raster.CullMode = CULLMODE_BACK; 4542 break; 4543 case GL_FRONT_AND_BACK: 4544 raster.CullMode = CULLMODE_BOTH; 4545 break; 4546 default: 4547 unreachable("not reached"); 4548 } 4549 } else { 4550 raster.CullMode = CULLMODE_NONE; 4551 } 4552 4553 raster.SmoothPointEnable = point->SmoothFlag; 4554 4555 raster.DXMultisampleRasterizationEnable = 4556 _mesa_is_multisample_enabled(ctx); 4557 4558 raster.GlobalDepthOffsetEnableSolid = polygon->OffsetFill; 4559 raster.GlobalDepthOffsetEnableWireframe = polygon->OffsetLine; 4560 raster.GlobalDepthOffsetEnablePoint = polygon->OffsetPoint; 4561 4562 switch (polygon->FrontMode) { 4563 case GL_FILL: 4564 raster.FrontFaceFillMode = FILL_MODE_SOLID; 4565 break; 4566 case GL_LINE: 4567 raster.FrontFaceFillMode = FILL_MODE_WIREFRAME; 4568 break; 4569 case GL_POINT: 4570 raster.FrontFaceFillMode = FILL_MODE_POINT; 4571 break; 4572 default: 4573 unreachable("not reached"); 4574 } 4575 4576 switch (polygon->BackMode) { 4577 case GL_FILL: 4578 raster.BackFaceFillMode = FILL_MODE_SOLID; 4579 break; 4580 case GL_LINE: 4581 raster.BackFaceFillMode = FILL_MODE_WIREFRAME; 4582 break; 4583 case GL_POINT: 4584 raster.BackFaceFillMode = FILL_MODE_POINT; 4585 break; 4586 default: 4587 unreachable("not reached"); 4588 } 4589 4590 /* _NEW_LINE */ 4591 raster.AntialiasingEnable = ctx->Line.SmoothFlag; 4592 4593#if GFX_VER == 10 4594 /* _NEW_BUFFERS 4595 * Antialiasing Enable bit MUST not be set when NUM_MULTISAMPLES > 1. 4596 */ 4597 const bool multisampled_fbo = 4598 _mesa_geometric_samples(ctx->DrawBuffer) > 1; 4599 if (multisampled_fbo) 4600 raster.AntialiasingEnable = false; 4601#endif 4602 4603 /* _NEW_SCISSOR */ 4604 raster.ScissorRectangleEnable = ctx->Scissor.EnableFlags; 4605 4606 /* _NEW_TRANSFORM */ 4607#if GFX_VER < 9 4608 if (!(ctx->Transform.DepthClampNear && 4609 ctx->Transform.DepthClampFar)) 4610 raster.ViewportZClipTestEnable = true; 4611#endif 4612 4613#if GFX_VER >= 9 4614 if (!ctx->Transform.DepthClampNear) 4615 raster.ViewportZNearClipTestEnable = true; 4616 4617 if (!ctx->Transform.DepthClampFar) 4618 raster.ViewportZFarClipTestEnable = true; 4619#endif 4620 4621 /* BRW_NEW_CONSERVATIVE_RASTERIZATION */ 4622#if GFX_VER >= 9 4623 raster.ConservativeRasterizationEnable = 4624 ctx->IntelConservativeRasterization; 4625#endif 4626 4627 raster.GlobalDepthOffsetClamp = polygon->OffsetClamp; 4628 raster.GlobalDepthOffsetScale = polygon->OffsetFactor; 4629 4630 raster.GlobalDepthOffsetConstant = polygon->OffsetUnits * 2; 4631 } 4632} 4633 4634static const struct brw_tracked_state genX(raster_state) = { 4635 .dirty = { 4636 .mesa = _NEW_BUFFERS | 4637 _NEW_LINE | 4638 _NEW_MULTISAMPLE | 4639 _NEW_POINT | 4640 _NEW_POLYGON | 4641 _NEW_SCISSOR | 4642 _NEW_TRANSFORM, 4643 .brw = BRW_NEW_BLORP | 4644 BRW_NEW_CONTEXT | 4645 BRW_NEW_CONSERVATIVE_RASTERIZATION, 4646 }, 4647 .emit = genX(upload_raster), 4648}; 4649#endif 4650 4651/* ---------------------------------------------------------------------- */ 4652 4653#if GFX_VER >= 8 4654static void 4655genX(upload_ps_extra)(struct brw_context *brw) 4656{ 4657 UNUSED struct gl_context *ctx = &brw->ctx; 4658 4659 const struct brw_wm_prog_data *prog_data = 4660 brw_wm_prog_data(brw->wm.base.prog_data); 4661 4662 brw_batch_emit(brw, GENX(3DSTATE_PS_EXTRA), psx) { 4663 psx.PixelShaderValid = true; 4664 psx.PixelShaderComputedDepthMode = prog_data->computed_depth_mode; 4665 psx.PixelShaderKillsPixel = prog_data->uses_kill; 4666 psx.AttributeEnable = prog_data->num_varying_inputs != 0; 4667 psx.PixelShaderUsesSourceDepth = prog_data->uses_src_depth; 4668 psx.PixelShaderUsesSourceW = prog_data->uses_src_w; 4669 psx.PixelShaderIsPerSample = prog_data->persample_dispatch; 4670 4671 /* _NEW_MULTISAMPLE | BRW_NEW_CONSERVATIVE_RASTERIZATION */ 4672 if (prog_data->uses_sample_mask) { 4673#if GFX_VER >= 9 4674 if (prog_data->post_depth_coverage) 4675 psx.InputCoverageMaskState = ICMS_DEPTH_COVERAGE; 4676 else if (prog_data->inner_coverage && ctx->IntelConservativeRasterization) 4677 psx.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE; 4678 else 4679 psx.InputCoverageMaskState = ICMS_NORMAL; 4680#else 4681 psx.PixelShaderUsesInputCoverageMask = true; 4682#endif 4683 } 4684 4685 psx.oMaskPresenttoRenderTarget = prog_data->uses_omask; 4686#if GFX_VER >= 9 4687 psx.PixelShaderPullsBary = prog_data->pulls_bary; 4688 psx.PixelShaderComputesStencil = prog_data->computed_stencil; 4689#endif 4690 4691 /* The stricter cross-primitive coherency guarantees that the hardware 4692 * gives us with the "Accesses UAV" bit set for at least one shader stage 4693 * and the "UAV coherency required" bit set on the 3DPRIMITIVE command 4694 * are redundant within the current image, atomic counter and SSBO GL 4695 * APIs, which all have very loose ordering and coherency requirements 4696 * and generally rely on the application to insert explicit barriers when 4697 * a shader invocation is expected to see the memory writes performed by 4698 * the invocations of some previous primitive. Regardless of the value 4699 * of "UAV coherency required", the "Accesses UAV" bits will implicitly 4700 * cause an in most cases useless DC flush when the lowermost stage with 4701 * the bit set finishes execution. 4702 * 4703 * It would be nice to disable it, but in some cases we can't because on 4704 * Gfx8+ it also has an influence on rasterization via the PS UAV-only 4705 * signal (which could be set independently from the coherency mechanism 4706 * in the 3DSTATE_WM command on Gfx7), and because in some cases it will 4707 * determine whether the hardware skips execution of the fragment shader 4708 * or not via the ThreadDispatchEnable signal. However if we know that 4709 * GFX8_PS_BLEND_HAS_WRITEABLE_RT is going to be set and 4710 * GFX8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any 4711 * difference so we may just disable it here. 4712 * 4713 * Gfx8 hardware tries to compute ThreadDispatchEnable for us but doesn't 4714 * take into account KillPixels when no depth or stencil writes are 4715 * enabled. In order for occlusion queries to work correctly with no 4716 * attachments, we need to force-enable here. 4717 * 4718 * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | 4719 * _NEW_COLOR 4720 */ 4721 if ((prog_data->has_side_effects || prog_data->uses_kill) && 4722 !brw_color_buffer_write_enabled(brw)) 4723 psx.PixelShaderHasUAV = true; 4724 } 4725} 4726 4727const struct brw_tracked_state genX(ps_extra) = { 4728 .dirty = { 4729 .mesa = _NEW_BUFFERS | _NEW_COLOR, 4730 .brw = BRW_NEW_BLORP | 4731 BRW_NEW_CONTEXT | 4732 BRW_NEW_FRAGMENT_PROGRAM | 4733 BRW_NEW_FS_PROG_DATA | 4734 BRW_NEW_CONSERVATIVE_RASTERIZATION, 4735 }, 4736 .emit = genX(upload_ps_extra), 4737}; 4738#endif 4739 4740/* ---------------------------------------------------------------------- */ 4741 4742#if GFX_VER >= 8 4743static void 4744genX(upload_ps_blend)(struct brw_context *brw) 4745{ 4746 struct gl_context *ctx = &brw->ctx; 4747 4748 /* _NEW_BUFFERS */ 4749 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0]; 4750 const bool buffer0_is_integer = ctx->DrawBuffer->_IntegerBuffers & 0x1; 4751 4752 /* _NEW_COLOR */ 4753 struct gl_colorbuffer_attrib *color = &ctx->Color; 4754 4755 brw_batch_emit(brw, GENX(3DSTATE_PS_BLEND), pb) { 4756 /* BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR */ 4757 pb.HasWriteableRT = brw_color_buffer_write_enabled(brw); 4758 4759 bool alpha_to_one = false; 4760 4761 if (!buffer0_is_integer) { 4762 /* _NEW_MULTISAMPLE */ 4763 4764 if (_mesa_is_multisample_enabled(ctx)) { 4765 pb.AlphaToCoverageEnable = ctx->Multisample.SampleAlphaToCoverage; 4766 alpha_to_one = ctx->Multisample.SampleAlphaToOne; 4767 } 4768 4769 pb.AlphaTestEnable = color->AlphaEnabled; 4770 } 4771 4772 /* Used for implementing the following bit of GL_EXT_texture_integer: 4773 * "Per-fragment operations that require floating-point color 4774 * components, including multisample alpha operations, alpha test, 4775 * blending, and dithering, have no effect when the corresponding 4776 * colors are written to an integer color buffer." 4777 * 4778 * The OpenGL specification 3.3 (page 196), section 4.1.3 says: 4779 * "If drawbuffer zero is not NONE and the buffer it references has an 4780 * integer format, the SAMPLE_ALPHA_TO_COVERAGE and SAMPLE_ALPHA_TO_ONE 4781 * operations are skipped." 4782 */ 4783 if (rb && !buffer0_is_integer && (color->BlendEnabled & 1)) { 4784 GLenum eqRGB = color->Blend[0].EquationRGB; 4785 GLenum eqA = color->Blend[0].EquationA; 4786 GLenum srcRGB = color->Blend[0].SrcRGB; 4787 GLenum dstRGB = color->Blend[0].DstRGB; 4788 GLenum srcA = color->Blend[0].SrcA; 4789 GLenum dstA = color->Blend[0].DstA; 4790 4791 if (eqRGB == GL_MIN || eqRGB == GL_MAX) 4792 srcRGB = dstRGB = GL_ONE; 4793 4794 if (eqA == GL_MIN || eqA == GL_MAX) 4795 srcA = dstA = GL_ONE; 4796 4797 /* Due to hardware limitations, the destination may have information 4798 * in an alpha channel even when the format specifies no alpha 4799 * channel. In order to avoid getting any incorrect blending due to 4800 * that alpha channel, coerce the blend factors to values that will 4801 * not read the alpha channel, but will instead use the correct 4802 * implicit value for alpha. 4803 */ 4804 if (!_mesa_base_format_has_channel(rb->_BaseFormat, 4805 GL_TEXTURE_ALPHA_TYPE)) { 4806 srcRGB = brw_fix_xRGB_alpha(srcRGB); 4807 srcA = brw_fix_xRGB_alpha(srcA); 4808 dstRGB = brw_fix_xRGB_alpha(dstRGB); 4809 dstA = brw_fix_xRGB_alpha(dstA); 4810 } 4811 4812 /* Alpha to One doesn't work with Dual Color Blending. Override 4813 * SRC1_ALPHA to ONE and ONE_MINUS_SRC1_ALPHA to ZERO. 4814 */ 4815 if (alpha_to_one && color->_BlendUsesDualSrc & 0x1) { 4816 srcRGB = fix_dual_blend_alpha_to_one(srcRGB); 4817 srcA = fix_dual_blend_alpha_to_one(srcA); 4818 dstRGB = fix_dual_blend_alpha_to_one(dstRGB); 4819 dstA = fix_dual_blend_alpha_to_one(dstA); 4820 } 4821 4822 /* BRW_NEW_FS_PROG_DATA */ 4823 const struct brw_wm_prog_data *wm_prog_data = 4824 brw_wm_prog_data(brw->wm.base.prog_data); 4825 4826 /* The Dual Source Blending documentation says: 4827 * 4828 * "If SRC1 is included in a src/dst blend factor and 4829 * a DualSource RT Write message is not used, results 4830 * are UNDEFINED. (This reflects the same restriction in DX APIs, 4831 * where undefined results are produced if “o1” is not written 4832 * by a PS – there are no default values defined). 4833 * If SRC1 is not included in a src/dst blend factor, 4834 * dual source blending must be disabled." 4835 * 4836 * There is no way to gracefully fix this undefined situation 4837 * so we just disable the blending to prevent possible issues. 4838 */ 4839 pb.ColorBufferBlendEnable = 4840 !(color->_BlendUsesDualSrc & 0x1) || wm_prog_data->dual_src_blend; 4841 pb.SourceAlphaBlendFactor = brw_translate_blend_factor(srcA); 4842 pb.DestinationAlphaBlendFactor = brw_translate_blend_factor(dstA); 4843 pb.SourceBlendFactor = brw_translate_blend_factor(srcRGB); 4844 pb.DestinationBlendFactor = brw_translate_blend_factor(dstRGB); 4845 4846 pb.IndependentAlphaBlendEnable = 4847 srcA != srcRGB || dstA != dstRGB || eqA != eqRGB; 4848 } 4849 } 4850} 4851 4852static const struct brw_tracked_state genX(ps_blend) = { 4853 .dirty = { 4854 .mesa = _NEW_BUFFERS | 4855 _NEW_COLOR | 4856 _NEW_MULTISAMPLE, 4857 .brw = BRW_NEW_BLORP | 4858 BRW_NEW_CONTEXT | 4859 BRW_NEW_FRAGMENT_PROGRAM | 4860 BRW_NEW_FS_PROG_DATA, 4861 }, 4862 .emit = genX(upload_ps_blend) 4863}; 4864#endif 4865 4866/* ---------------------------------------------------------------------- */ 4867 4868#if GFX_VER >= 8 4869static void 4870genX(emit_vf_topology)(struct brw_context *brw) 4871{ 4872 brw_batch_emit(brw, GENX(3DSTATE_VF_TOPOLOGY), vftopo) { 4873 vftopo.PrimitiveTopologyType = brw->primitive; 4874 } 4875} 4876 4877static const struct brw_tracked_state genX(vf_topology) = { 4878 .dirty = { 4879 .mesa = 0, 4880 .brw = BRW_NEW_BLORP | 4881 BRW_NEW_PRIMITIVE, 4882 }, 4883 .emit = genX(emit_vf_topology), 4884}; 4885#endif 4886 4887/* ---------------------------------------------------------------------- */ 4888 4889#if GFX_VER >= 7 4890static void 4891genX(emit_mi_report_perf_count)(struct brw_context *brw, 4892 struct brw_bo *bo, 4893 uint32_t offset_in_bytes, 4894 uint32_t report_id) 4895{ 4896 brw_batch_emit(brw, GENX(MI_REPORT_PERF_COUNT), mi_rpc) { 4897 mi_rpc.MemoryAddress = ggtt_bo(bo, offset_in_bytes); 4898 mi_rpc.ReportID = report_id; 4899 } 4900} 4901#endif 4902 4903/* ---------------------------------------------------------------------- */ 4904 4905/** 4906 * Emit a 3DSTATE_SAMPLER_STATE_POINTERS_{VS,HS,GS,DS,PS} packet. 4907 */ 4908static void 4909genX(emit_sampler_state_pointers_xs)(UNUSED struct brw_context *brw, 4910 UNUSED struct brw_stage_state *stage_state) 4911{ 4912#if GFX_VER >= 7 4913 static const uint16_t packet_headers[] = { 4914 [MESA_SHADER_VERTEX] = 43, 4915 [MESA_SHADER_TESS_CTRL] = 44, 4916 [MESA_SHADER_TESS_EVAL] = 45, 4917 [MESA_SHADER_GEOMETRY] = 46, 4918 [MESA_SHADER_FRAGMENT] = 47, 4919 }; 4920 4921 /* Ivybridge requires a workaround flush before VS packets. */ 4922 if (GFX_VERx10 == 70 && 4923 stage_state->stage == MESA_SHADER_VERTEX) { 4924 gfx7_emit_vs_workaround_flush(brw); 4925 } 4926 4927 brw_batch_emit(brw, GENX(3DSTATE_SAMPLER_STATE_POINTERS_VS), ptr) { 4928 ptr._3DCommandSubOpcode = packet_headers[stage_state->stage]; 4929 ptr.PointertoVSSamplerState = stage_state->sampler_offset; 4930 } 4931#endif 4932} 4933 4934UNUSED static bool 4935has_component(mesa_format format, int i) 4936{ 4937 if (_mesa_is_format_color_format(format)) 4938 return _mesa_format_has_color_component(format, i); 4939 4940 /* depth and stencil have only one component */ 4941 return i == 0; 4942} 4943 4944/** 4945 * Upload SAMPLER_BORDER_COLOR_STATE. 4946 */ 4947static void 4948genX(upload_default_color)(struct brw_context *brw, 4949 const struct gl_sampler_object *sampler, 4950 UNUSED mesa_format format, 4951 GLenum base_format, 4952 bool is_integer_format, bool is_stencil_sampling, 4953 uint32_t *sdc_offset) 4954{ 4955 union gl_color_union color; 4956 4957 switch (base_format) { 4958 case GL_DEPTH_COMPONENT: 4959 /* GL specs that border color for depth textures is taken from the 4960 * R channel, while the hardware uses A. Spam R into all the 4961 * channels for safety. 4962 */ 4963 color.ui[0] = sampler->Attrib.state.border_color.ui[0]; 4964 color.ui[1] = sampler->Attrib.state.border_color.ui[0]; 4965 color.ui[2] = sampler->Attrib.state.border_color.ui[0]; 4966 color.ui[3] = sampler->Attrib.state.border_color.ui[0]; 4967 break; 4968 case GL_ALPHA: 4969 color.ui[0] = 0u; 4970 color.ui[1] = 0u; 4971 color.ui[2] = 0u; 4972 color.ui[3] = sampler->Attrib.state.border_color.ui[3]; 4973 break; 4974 case GL_INTENSITY: 4975 color.ui[0] = sampler->Attrib.state.border_color.ui[0]; 4976 color.ui[1] = sampler->Attrib.state.border_color.ui[0]; 4977 color.ui[2] = sampler->Attrib.state.border_color.ui[0]; 4978 color.ui[3] = sampler->Attrib.state.border_color.ui[0]; 4979 break; 4980 case GL_LUMINANCE: 4981 color.ui[0] = sampler->Attrib.state.border_color.ui[0]; 4982 color.ui[1] = sampler->Attrib.state.border_color.ui[0]; 4983 color.ui[2] = sampler->Attrib.state.border_color.ui[0]; 4984 color.ui[3] = float_as_int(1.0); 4985 break; 4986 case GL_LUMINANCE_ALPHA: 4987 color.ui[0] = sampler->Attrib.state.border_color.ui[0]; 4988 color.ui[1] = sampler->Attrib.state.border_color.ui[0]; 4989 color.ui[2] = sampler->Attrib.state.border_color.ui[0]; 4990 color.ui[3] = sampler->Attrib.state.border_color.ui[3]; 4991 break; 4992 default: 4993 color.ui[0] = sampler->Attrib.state.border_color.ui[0]; 4994 color.ui[1] = sampler->Attrib.state.border_color.ui[1]; 4995 color.ui[2] = sampler->Attrib.state.border_color.ui[2]; 4996 color.ui[3] = sampler->Attrib.state.border_color.ui[3]; 4997 break; 4998 } 4999 5000 /* In some cases we use an RGBA surface format for GL RGB textures, 5001 * where we've initialized the A channel to 1.0. We also have to set 5002 * the border color alpha to 1.0 in that case. 5003 */ 5004 if (base_format == GL_RGB) 5005 color.ui[3] = float_as_int(1.0); 5006 5007 int alignment = 32; 5008 if (GFX_VER >= 8) { 5009 alignment = 64; 5010 } else if (GFX_VERx10 == 75 && (is_integer_format || is_stencil_sampling)) { 5011 alignment = 512; 5012 } 5013 5014 uint32_t *sdc = brw_state_batch( 5015 brw, GENX(SAMPLER_BORDER_COLOR_STATE_length) * sizeof(uint32_t), 5016 alignment, sdc_offset); 5017 5018 struct GENX(SAMPLER_BORDER_COLOR_STATE) state = { 0 }; 5019 5020#define ASSIGN(dst, src) \ 5021 do { \ 5022 dst = src; \ 5023 } while (0) 5024 5025#define ASSIGNu16(dst, src) \ 5026 do { \ 5027 dst = (uint16_t)src; \ 5028 } while (0) 5029 5030#define ASSIGNu8(dst, src) \ 5031 do { \ 5032 dst = (uint8_t)src; \ 5033 } while (0) 5034 5035#define BORDER_COLOR_ATTR(macro, _color_type, src) \ 5036 macro(state.BorderColor ## _color_type ## Red, src[0]); \ 5037 macro(state.BorderColor ## _color_type ## Green, src[1]); \ 5038 macro(state.BorderColor ## _color_type ## Blue, src[2]); \ 5039 macro(state.BorderColor ## _color_type ## Alpha, src[3]); 5040 5041#if GFX_VER >= 8 5042 /* On Broadwell, the border color is represented as four 32-bit floats, 5043 * integers, or unsigned values, interpreted according to the surface 5044 * format. This matches the sampler->BorderColor union exactly; just 5045 * memcpy the values. 5046 */ 5047 BORDER_COLOR_ATTR(ASSIGN, 32bit, color.ui); 5048#elif GFX_VERx10 == 75 5049 if (is_integer_format || is_stencil_sampling) { 5050 bool stencil = format == MESA_FORMAT_S_UINT8 || is_stencil_sampling; 5051 const int bits_per_channel = 5052 _mesa_get_format_bits(format, stencil ? GL_STENCIL_BITS : GL_RED_BITS); 5053 5054 /* From the Haswell PRM, "Command Reference: Structures", Page 36: 5055 * "If any color channel is missing from the surface format, 5056 * corresponding border color should be programmed as zero and if 5057 * alpha channel is missing, corresponding Alpha border color should 5058 * be programmed as 1." 5059 */ 5060 unsigned c[4] = { 0, 0, 0, 1 }; 5061 for (int i = 0; i < 4; i++) { 5062 if (has_component(format, i)) 5063 c[i] = color.ui[i]; 5064 } 5065 5066 switch (bits_per_channel) { 5067 case 8: 5068 /* Copy RGBA in order. */ 5069 BORDER_COLOR_ATTR(ASSIGNu8, 8bit, c); 5070 break; 5071 case 10: 5072 /* R10G10B10A2_UINT is treated like a 16-bit format. */ 5073 case 16: 5074 BORDER_COLOR_ATTR(ASSIGNu16, 16bit, c); 5075 break; 5076 case 32: 5077 if (base_format == GL_RG) { 5078 /* Careful inspection of the tables reveals that for RG32 formats, 5079 * the green channel needs to go where blue normally belongs. 5080 */ 5081 state.BorderColor32bitRed = c[0]; 5082 state.BorderColor32bitBlue = c[1]; 5083 state.BorderColor32bitAlpha = 1; 5084 } else { 5085 /* Copy RGBA in order. */ 5086 BORDER_COLOR_ATTR(ASSIGN, 32bit, c); 5087 } 5088 break; 5089 default: 5090 assert(!"Invalid number of bits per channel in integer format."); 5091 break; 5092 } 5093 } else { 5094 BORDER_COLOR_ATTR(ASSIGN, Float, color.f); 5095 } 5096#elif GFX_VER == 5 || GFX_VER == 6 5097 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_UBYTE, Unorm, color.f); 5098 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_USHORT, Unorm16, color.f); 5099 BORDER_COLOR_ATTR(UNCLAMPED_FLOAT_TO_SHORT, Snorm16, color.f); 5100 5101#define MESA_FLOAT_TO_HALF(dst, src) \ 5102 dst = _mesa_float_to_half(src); 5103 5104 BORDER_COLOR_ATTR(MESA_FLOAT_TO_HALF, Float16, color.f); 5105 5106#undef MESA_FLOAT_TO_HALF 5107 5108 state.BorderColorSnorm8Red = state.BorderColorSnorm16Red >> 8; 5109 state.BorderColorSnorm8Green = state.BorderColorSnorm16Green >> 8; 5110 state.BorderColorSnorm8Blue = state.BorderColorSnorm16Blue >> 8; 5111 state.BorderColorSnorm8Alpha = state.BorderColorSnorm16Alpha >> 8; 5112 5113 BORDER_COLOR_ATTR(ASSIGN, Float, color.f); 5114#elif GFX_VER == 4 5115 BORDER_COLOR_ATTR(ASSIGN, , color.f); 5116#else 5117 BORDER_COLOR_ATTR(ASSIGN, Float, color.f); 5118#endif 5119 5120#undef ASSIGN 5121#undef BORDER_COLOR_ATTR 5122 5123 GENX(SAMPLER_BORDER_COLOR_STATE_pack)(brw, sdc, &state); 5124} 5125 5126static uint32_t 5127translate_wrap_mode(GLenum wrap, UNUSED bool using_nearest) 5128{ 5129 switch (wrap) { 5130 case GL_REPEAT: 5131 return TCM_WRAP; 5132 case GL_CLAMP: 5133#if GFX_VER >= 8 5134 /* GL_CLAMP is the weird mode where coordinates are clamped to 5135 * [0.0, 1.0], so linear filtering of coordinates outside of 5136 * [0.0, 1.0] give you half edge texel value and half border 5137 * color. 5138 * 5139 * Gfx8+ supports this natively. 5140 */ 5141 return TCM_HALF_BORDER; 5142#else 5143 /* On Gfx4-7.5, we clamp the coordinates in the fragment shader 5144 * and set clamp_border here, which gets the result desired. 5145 * We just use clamp(_to_edge) for nearest, because for nearest 5146 * clamping to 1.0 gives border color instead of the desired 5147 * edge texels. 5148 */ 5149 if (using_nearest) 5150 return TCM_CLAMP; 5151 else 5152 return TCM_CLAMP_BORDER; 5153#endif 5154 case GL_CLAMP_TO_EDGE: 5155 return TCM_CLAMP; 5156 case GL_CLAMP_TO_BORDER: 5157 return TCM_CLAMP_BORDER; 5158 case GL_MIRRORED_REPEAT: 5159 return TCM_MIRROR; 5160 case GL_MIRROR_CLAMP_TO_EDGE: 5161 return TCM_MIRROR_ONCE; 5162 default: 5163 return TCM_WRAP; 5164 } 5165} 5166 5167/** 5168 * Return true if the given wrap mode requires the border color to exist. 5169 */ 5170static bool 5171wrap_mode_needs_border_color(unsigned wrap_mode) 5172{ 5173#if GFX_VER >= 8 5174 return wrap_mode == TCM_CLAMP_BORDER || 5175 wrap_mode == TCM_HALF_BORDER; 5176#else 5177 return wrap_mode == TCM_CLAMP_BORDER; 5178#endif 5179} 5180 5181/** 5182 * Sets the sampler state for a single unit based off of the sampler key 5183 * entry. 5184 */ 5185static void 5186genX(update_sampler_state)(struct brw_context *brw, 5187 GLenum target, bool tex_cube_map_seamless, 5188 GLfloat tex_unit_lod_bias, 5189 mesa_format format, GLenum base_format, 5190 const struct gl_texture_object *texObj, 5191 const struct gl_sampler_object *sampler, 5192 uint32_t *sampler_state) 5193{ 5194 struct GENX(SAMPLER_STATE) samp_st = { 0 }; 5195 5196 /* Select min and mip filters. */ 5197 switch (sampler->Attrib.MinFilter) { 5198 case GL_NEAREST: 5199 samp_st.MinModeFilter = MAPFILTER_NEAREST; 5200 samp_st.MipModeFilter = MIPFILTER_NONE; 5201 break; 5202 case GL_LINEAR: 5203 samp_st.MinModeFilter = MAPFILTER_LINEAR; 5204 samp_st.MipModeFilter = MIPFILTER_NONE; 5205 break; 5206 case GL_NEAREST_MIPMAP_NEAREST: 5207 samp_st.MinModeFilter = MAPFILTER_NEAREST; 5208 samp_st.MipModeFilter = MIPFILTER_NEAREST; 5209 break; 5210 case GL_LINEAR_MIPMAP_NEAREST: 5211 samp_st.MinModeFilter = MAPFILTER_LINEAR; 5212 samp_st.MipModeFilter = MIPFILTER_NEAREST; 5213 break; 5214 case GL_NEAREST_MIPMAP_LINEAR: 5215 samp_st.MinModeFilter = MAPFILTER_NEAREST; 5216 samp_st.MipModeFilter = MIPFILTER_LINEAR; 5217 break; 5218 case GL_LINEAR_MIPMAP_LINEAR: 5219 samp_st.MinModeFilter = MAPFILTER_LINEAR; 5220 samp_st.MipModeFilter = MIPFILTER_LINEAR; 5221 break; 5222 default: 5223 unreachable("not reached"); 5224 } 5225 5226 /* Select mag filter. */ 5227 samp_st.MagModeFilter = sampler->Attrib.MagFilter == GL_LINEAR ? 5228 MAPFILTER_LINEAR : MAPFILTER_NEAREST; 5229 5230 /* Enable anisotropic filtering if desired. */ 5231 samp_st.MaximumAnisotropy = RATIO21; 5232 5233 if (sampler->Attrib.MaxAnisotropy > 1.0f) { 5234 if (samp_st.MinModeFilter == MAPFILTER_LINEAR) 5235 samp_st.MinModeFilter = MAPFILTER_ANISOTROPIC; 5236 if (samp_st.MagModeFilter == MAPFILTER_LINEAR) 5237 samp_st.MagModeFilter = MAPFILTER_ANISOTROPIC; 5238 5239 if (sampler->Attrib.MaxAnisotropy > 2.0f) { 5240 samp_st.MaximumAnisotropy = 5241 MIN2((sampler->Attrib.MaxAnisotropy - 2) / 2, RATIO161); 5242 } 5243 } 5244 5245 /* Set address rounding bits if not using nearest filtering. */ 5246 if (samp_st.MinModeFilter != MAPFILTER_NEAREST) { 5247 samp_st.UAddressMinFilterRoundingEnable = true; 5248 samp_st.VAddressMinFilterRoundingEnable = true; 5249 samp_st.RAddressMinFilterRoundingEnable = true; 5250 } 5251 5252 if (samp_st.MagModeFilter != MAPFILTER_NEAREST) { 5253 samp_st.UAddressMagFilterRoundingEnable = true; 5254 samp_st.VAddressMagFilterRoundingEnable = true; 5255 samp_st.RAddressMagFilterRoundingEnable = true; 5256 } 5257 5258 bool either_nearest = 5259 sampler->Attrib.MinFilter == GL_NEAREST || sampler->Attrib.MagFilter == GL_NEAREST; 5260 unsigned wrap_s = translate_wrap_mode(sampler->Attrib.WrapS, either_nearest); 5261 unsigned wrap_t = translate_wrap_mode(sampler->Attrib.WrapT, either_nearest); 5262 unsigned wrap_r = translate_wrap_mode(sampler->Attrib.WrapR, either_nearest); 5263 5264 if (target == GL_TEXTURE_CUBE_MAP || 5265 target == GL_TEXTURE_CUBE_MAP_ARRAY) { 5266 /* Cube maps must use the same wrap mode for all three coordinate 5267 * dimensions. Prior to Haswell, only CUBE and CLAMP are valid. 5268 * 5269 * Ivybridge and Baytrail seem to have problems with CUBE mode and 5270 * integer formats. Fall back to CLAMP for now. 5271 */ 5272 if ((tex_cube_map_seamless || sampler->Attrib.CubeMapSeamless) && 5273 !(GFX_VERx10 == 70 && texObj->_IsIntegerFormat)) { 5274 wrap_s = TCM_CUBE; 5275 wrap_t = TCM_CUBE; 5276 wrap_r = TCM_CUBE; 5277 } else { 5278 wrap_s = TCM_CLAMP; 5279 wrap_t = TCM_CLAMP; 5280 wrap_r = TCM_CLAMP; 5281 } 5282 } else if (target == GL_TEXTURE_1D) { 5283 /* There's a bug in 1D texture sampling - it actually pays 5284 * attention to the wrap_t value, though it should not. 5285 * Override the wrap_t value here to GL_REPEAT to keep 5286 * any nonexistent border pixels from floating in. 5287 */ 5288 wrap_t = TCM_WRAP; 5289 } 5290 5291 samp_st.TCXAddressControlMode = wrap_s; 5292 samp_st.TCYAddressControlMode = wrap_t; 5293 samp_st.TCZAddressControlMode = wrap_r; 5294 5295 samp_st.ShadowFunction = 5296 sampler->Attrib.CompareMode == GL_COMPARE_R_TO_TEXTURE_ARB ? 5297 brw_translate_shadow_compare_func(sampler->Attrib.CompareFunc) : 0; 5298 5299#if GFX_VER >= 7 5300 /* Set shadow function. */ 5301 samp_st.AnisotropicAlgorithm = 5302 samp_st.MinModeFilter == MAPFILTER_ANISOTROPIC ? 5303 EWAApproximation : LEGACY; 5304#endif 5305 5306#if GFX_VER >= 6 5307 samp_st.NonnormalizedCoordinateEnable = target == GL_TEXTURE_RECTANGLE; 5308#endif 5309 5310 const float hw_max_lod = GFX_VER >= 7 ? 14 : 13; 5311 samp_st.MinLOD = CLAMP(sampler->Attrib.MinLod, 0, hw_max_lod); 5312 samp_st.MaxLOD = CLAMP(sampler->Attrib.MaxLod, 0, hw_max_lod); 5313 samp_st.TextureLODBias = 5314 CLAMP(tex_unit_lod_bias + sampler->Attrib.LodBias, -16, 15); 5315 5316#if GFX_VER == 6 5317 samp_st.BaseMipLevel = 5318 CLAMP(texObj->Attrib.MinLevel + texObj->Attrib.BaseLevel, 0, hw_max_lod); 5319 samp_st.MinandMagStateNotEqual = 5320 samp_st.MinModeFilter != samp_st.MagModeFilter; 5321#endif 5322 5323 /* Upload the border color if necessary. If not, just point it at 5324 * offset 0 (the start of the batch) - the color should be ignored, 5325 * but that address won't fault in case something reads it anyway. 5326 */ 5327 uint32_t border_color_offset = 0; 5328 if (wrap_mode_needs_border_color(wrap_s) || 5329 wrap_mode_needs_border_color(wrap_t) || 5330 wrap_mode_needs_border_color(wrap_r)) { 5331 genX(upload_default_color)(brw, sampler, format, base_format, 5332 texObj->_IsIntegerFormat, 5333 texObj->StencilSampling, 5334 &border_color_offset); 5335 } 5336#if GFX_VER < 6 5337 samp_st.BorderColorPointer = 5338 ro_bo(brw->batch.state.bo, border_color_offset); 5339#else 5340 samp_st.BorderColorPointer = border_color_offset; 5341#endif 5342 5343#if GFX_VER >= 8 5344 samp_st.LODPreClampMode = CLAMP_MODE_OGL; 5345#else 5346 samp_st.LODPreClampEnable = true; 5347#endif 5348 5349 GENX(SAMPLER_STATE_pack)(brw, sampler_state, &samp_st); 5350} 5351 5352static void 5353update_sampler_state(struct brw_context *brw, 5354 int unit, 5355 uint32_t *sampler_state) 5356{ 5357 struct gl_context *ctx = &brw->ctx; 5358 const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit]; 5359 const struct gl_texture_object *texObj = texUnit->_Current; 5360 const struct gl_sampler_object *sampler = _mesa_get_samplerobj(ctx, unit); 5361 5362 /* These don't use samplers at all. */ 5363 if (texObj->Target == GL_TEXTURE_BUFFER) 5364 return; 5365 5366 struct gl_texture_image *firstImage = texObj->Image[0][texObj->Attrib.BaseLevel]; 5367 genX(update_sampler_state)(brw, texObj->Target, 5368 ctx->Texture.CubeMapSeamless, 5369 texUnit->LodBias, 5370 firstImage->TexFormat, firstImage->_BaseFormat, 5371 texObj, sampler, 5372 sampler_state); 5373} 5374 5375static void 5376genX(upload_sampler_state_table)(struct brw_context *brw, 5377 struct gl_program *prog, 5378 struct brw_stage_state *stage_state) 5379{ 5380 struct gl_context *ctx = &brw->ctx; 5381 uint32_t sampler_count = stage_state->sampler_count; 5382 5383 GLbitfield SamplersUsed = prog->SamplersUsed; 5384 5385 if (sampler_count == 0) 5386 return; 5387 5388 /* SAMPLER_STATE is 4 DWords on all platforms. */ 5389 const int dwords = GENX(SAMPLER_STATE_length); 5390 const int size_in_bytes = dwords * sizeof(uint32_t); 5391 5392 uint32_t *sampler_state = brw_state_batch(brw, 5393 sampler_count * size_in_bytes, 5394 32, &stage_state->sampler_offset); 5395 /* memset(sampler_state, 0, sampler_count * size_in_bytes); */ 5396 5397 for (unsigned s = 0; s < sampler_count; s++) { 5398 if (SamplersUsed & (1 << s)) { 5399 const unsigned unit = prog->SamplerUnits[s]; 5400 if (ctx->Texture.Unit[unit]._Current) { 5401 update_sampler_state(brw, unit, sampler_state); 5402 } 5403 } 5404 5405 sampler_state += dwords; 5406 } 5407 5408 if (GFX_VER >= 7 && stage_state->stage != MESA_SHADER_COMPUTE) { 5409 /* Emit a 3DSTATE_SAMPLER_STATE_POINTERS_XS packet. */ 5410 genX(emit_sampler_state_pointers_xs)(brw, stage_state); 5411 } else { 5412 /* Flag that the sampler state table pointer has changed; later atoms 5413 * will handle it. 5414 */ 5415 brw->ctx.NewDriverState |= BRW_NEW_SAMPLER_STATE_TABLE; 5416 } 5417} 5418 5419static void 5420genX(upload_fs_samplers)(struct brw_context *brw) 5421{ 5422 /* BRW_NEW_FRAGMENT_PROGRAM */ 5423 struct gl_program *fs = brw->programs[MESA_SHADER_FRAGMENT]; 5424 genX(upload_sampler_state_table)(brw, fs, &brw->wm.base); 5425} 5426 5427static const struct brw_tracked_state genX(fs_samplers) = { 5428 .dirty = { 5429 .mesa = _NEW_TEXTURE, 5430 .brw = BRW_NEW_BATCH | 5431 BRW_NEW_BLORP | 5432 BRW_NEW_FRAGMENT_PROGRAM, 5433 }, 5434 .emit = genX(upload_fs_samplers), 5435}; 5436 5437static void 5438genX(upload_vs_samplers)(struct brw_context *brw) 5439{ 5440 /* BRW_NEW_VERTEX_PROGRAM */ 5441 struct gl_program *vs = brw->programs[MESA_SHADER_VERTEX]; 5442 genX(upload_sampler_state_table)(brw, vs, &brw->vs.base); 5443} 5444 5445static const struct brw_tracked_state genX(vs_samplers) = { 5446 .dirty = { 5447 .mesa = _NEW_TEXTURE, 5448 .brw = BRW_NEW_BATCH | 5449 BRW_NEW_BLORP | 5450 BRW_NEW_VERTEX_PROGRAM, 5451 }, 5452 .emit = genX(upload_vs_samplers), 5453}; 5454 5455#if GFX_VER >= 6 5456static void 5457genX(upload_gs_samplers)(struct brw_context *brw) 5458{ 5459 /* BRW_NEW_GEOMETRY_PROGRAM */ 5460 struct gl_program *gs = brw->programs[MESA_SHADER_GEOMETRY]; 5461 if (!gs) 5462 return; 5463 5464 genX(upload_sampler_state_table)(brw, gs, &brw->gs.base); 5465} 5466 5467 5468static const struct brw_tracked_state genX(gs_samplers) = { 5469 .dirty = { 5470 .mesa = _NEW_TEXTURE, 5471 .brw = BRW_NEW_BATCH | 5472 BRW_NEW_BLORP | 5473 BRW_NEW_GEOMETRY_PROGRAM, 5474 }, 5475 .emit = genX(upload_gs_samplers), 5476}; 5477#endif 5478 5479#if GFX_VER >= 7 5480static void 5481genX(upload_tcs_samplers)(struct brw_context *brw) 5482{ 5483 /* BRW_NEW_TESS_PROGRAMS */ 5484 struct gl_program *tcs = brw->programs[MESA_SHADER_TESS_CTRL]; 5485 if (!tcs) 5486 return; 5487 5488 genX(upload_sampler_state_table)(brw, tcs, &brw->tcs.base); 5489} 5490 5491static const struct brw_tracked_state genX(tcs_samplers) = { 5492 .dirty = { 5493 .mesa = _NEW_TEXTURE, 5494 .brw = BRW_NEW_BATCH | 5495 BRW_NEW_BLORP | 5496 BRW_NEW_TESS_PROGRAMS, 5497 }, 5498 .emit = genX(upload_tcs_samplers), 5499}; 5500#endif 5501 5502#if GFX_VER >= 7 5503static void 5504genX(upload_tes_samplers)(struct brw_context *brw) 5505{ 5506 /* BRW_NEW_TESS_PROGRAMS */ 5507 struct gl_program *tes = brw->programs[MESA_SHADER_TESS_EVAL]; 5508 if (!tes) 5509 return; 5510 5511 genX(upload_sampler_state_table)(brw, tes, &brw->tes.base); 5512} 5513 5514static const struct brw_tracked_state genX(tes_samplers) = { 5515 .dirty = { 5516 .mesa = _NEW_TEXTURE, 5517 .brw = BRW_NEW_BATCH | 5518 BRW_NEW_BLORP | 5519 BRW_NEW_TESS_PROGRAMS, 5520 }, 5521 .emit = genX(upload_tes_samplers), 5522}; 5523#endif 5524 5525#if GFX_VER >= 7 5526static void 5527genX(upload_cs_samplers)(struct brw_context *brw) 5528{ 5529 /* BRW_NEW_COMPUTE_PROGRAM */ 5530 struct gl_program *cs = brw->programs[MESA_SHADER_COMPUTE]; 5531 if (!cs) 5532 return; 5533 5534 genX(upload_sampler_state_table)(brw, cs, &brw->cs.base); 5535} 5536 5537const struct brw_tracked_state genX(cs_samplers) = { 5538 .dirty = { 5539 .mesa = _NEW_TEXTURE, 5540 .brw = BRW_NEW_BATCH | 5541 BRW_NEW_BLORP | 5542 BRW_NEW_COMPUTE_PROGRAM, 5543 }, 5544 .emit = genX(upload_cs_samplers), 5545}; 5546#endif 5547 5548/* ---------------------------------------------------------------------- */ 5549 5550#if GFX_VER <= 5 5551 5552static void genX(upload_blend_constant_color)(struct brw_context *brw) 5553{ 5554 struct gl_context *ctx = &brw->ctx; 5555 5556 brw_batch_emit(brw, GENX(3DSTATE_CONSTANT_COLOR), blend_cc) { 5557 blend_cc.BlendConstantColorRed = ctx->Color.BlendColorUnclamped[0]; 5558 blend_cc.BlendConstantColorGreen = ctx->Color.BlendColorUnclamped[1]; 5559 blend_cc.BlendConstantColorBlue = ctx->Color.BlendColorUnclamped[2]; 5560 blend_cc.BlendConstantColorAlpha = ctx->Color.BlendColorUnclamped[3]; 5561 } 5562} 5563 5564static const struct brw_tracked_state genX(blend_constant_color) = { 5565 .dirty = { 5566 .mesa = _NEW_COLOR, 5567 .brw = BRW_NEW_CONTEXT | 5568 BRW_NEW_BLORP, 5569 }, 5570 .emit = genX(upload_blend_constant_color) 5571}; 5572#endif 5573 5574/* ---------------------------------------------------------------------- */ 5575 5576void 5577genX(init_atoms)(struct brw_context *brw) 5578{ 5579#if GFX_VER < 6 5580 static const struct brw_tracked_state *render_atoms[] = 5581 { 5582 &genX(vf_statistics), 5583 5584 /* Once all the programs are done, we know how large urb entry 5585 * sizes need to be and can decide if we need to change the urb 5586 * layout. 5587 */ 5588 &brw_curbe_offsets, 5589 &brw_recalculate_urb_fence, 5590 5591 &genX(cc_vp), 5592 &genX(color_calc_state), 5593 5594 /* Surface state setup. Must come before the VS/WM unit. The binding 5595 * table upload must be last. 5596 */ 5597 &brw_vs_pull_constants, 5598 &brw_wm_pull_constants, 5599 &brw_renderbuffer_surfaces, 5600 &brw_renderbuffer_read_surfaces, 5601 &brw_texture_surfaces, 5602 &brw_vs_binding_table, 5603 &brw_wm_binding_table, 5604 5605 &genX(fs_samplers), 5606 &genX(vs_samplers), 5607 5608 /* These set up state for brw_psp_urb_cbs */ 5609 &genX(wm_state), 5610 &genX(sf_clip_viewport), 5611 &genX(sf_state), 5612 &genX(vs_state), /* always required, enabled or not */ 5613 &genX(clip_state), 5614 &genX(gs_state), 5615 5616 /* Command packets: 5617 */ 5618 &brw_binding_table_pointers, 5619 &genX(blend_constant_color), 5620 5621 &brw_depthbuffer, 5622 5623 &genX(polygon_stipple), 5624 &genX(polygon_stipple_offset), 5625 5626 &genX(line_stipple), 5627 5628 &brw_psp_urb_cbs, 5629 5630 &genX(drawing_rect), 5631 &brw_indices, /* must come before brw_vertices */ 5632 &genX(index_buffer), 5633 &genX(vertices), 5634 5635 &brw_constant_buffer 5636 }; 5637#elif GFX_VER == 6 5638 static const struct brw_tracked_state *render_atoms[] = 5639 { 5640 &genX(vf_statistics), 5641 5642 &genX(sf_clip_viewport), 5643 5644 /* Command packets: */ 5645 5646 &genX(cc_vp), 5647 5648 &gfx6_urb, 5649 &genX(blend_state), /* must do before cc unit */ 5650 &genX(color_calc_state), /* must do before cc unit */ 5651 &genX(depth_stencil_state), /* must do before cc unit */ 5652 5653 &genX(vs_push_constants), /* Before vs_state */ 5654 &genX(gs_push_constants), /* Before gs_state */ 5655 &genX(wm_push_constants), /* Before wm_state */ 5656 5657 /* Surface state setup. Must come before the VS/WM unit. The binding 5658 * table upload must be last. 5659 */ 5660 &brw_vs_pull_constants, 5661 &brw_vs_ubo_surfaces, 5662 &brw_gs_pull_constants, 5663 &brw_gs_ubo_surfaces, 5664 &brw_wm_pull_constants, 5665 &brw_wm_ubo_surfaces, 5666 &gfx6_renderbuffer_surfaces, 5667 &brw_renderbuffer_read_surfaces, 5668 &brw_texture_surfaces, 5669 &gfx6_sol_surface, 5670 &brw_vs_binding_table, 5671 &gfx6_gs_binding_table, 5672 &brw_wm_binding_table, 5673 5674 &genX(fs_samplers), 5675 &genX(vs_samplers), 5676 &genX(gs_samplers), 5677 &gfx6_sampler_state, 5678 &genX(multisample_state), 5679 5680 &genX(vs_state), 5681 &genX(gs_state), 5682 &genX(clip_state), 5683 &genX(sf_state), 5684 &genX(wm_state), 5685 5686 &genX(scissor_state), 5687 5688 &gfx6_binding_table_pointers, 5689 5690 &brw_depthbuffer, 5691 5692 &genX(polygon_stipple), 5693 &genX(polygon_stipple_offset), 5694 5695 &genX(line_stipple), 5696 5697 &genX(drawing_rect), 5698 5699 &brw_indices, /* must come before brw_vertices */ 5700 &genX(index_buffer), 5701 &genX(vertices), 5702 }; 5703#elif GFX_VER == 7 5704 static const struct brw_tracked_state *render_atoms[] = 5705 { 5706 &genX(vf_statistics), 5707 5708 /* Command packets: */ 5709 5710 &genX(cc_vp), 5711 &genX(sf_clip_viewport), 5712 5713 &gfx7_l3_state, 5714 &gfx7_push_constant_space, 5715 &gfx7_urb, 5716#if GFX_VERx10 == 75 5717 &genX(cc_and_blend_state), 5718#else 5719 &genX(blend_state), /* must do before cc unit */ 5720 &genX(color_calc_state), /* must do before cc unit */ 5721#endif 5722 &genX(depth_stencil_state), /* must do before cc unit */ 5723 5724 &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */ 5725 &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */ 5726 &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */ 5727 &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */ 5728 &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */ 5729 5730 &genX(vs_push_constants), /* Before vs_state */ 5731 &genX(tcs_push_constants), 5732 &genX(tes_push_constants), 5733 &genX(gs_push_constants), /* Before gs_state */ 5734 &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */ 5735 5736 /* Surface state setup. Must come before the VS/WM unit. The binding 5737 * table upload must be last. 5738 */ 5739 &brw_vs_pull_constants, 5740 &brw_vs_ubo_surfaces, 5741 &brw_tcs_pull_constants, 5742 &brw_tcs_ubo_surfaces, 5743 &brw_tes_pull_constants, 5744 &brw_tes_ubo_surfaces, 5745 &brw_gs_pull_constants, 5746 &brw_gs_ubo_surfaces, 5747 &brw_wm_pull_constants, 5748 &brw_wm_ubo_surfaces, 5749 &gfx6_renderbuffer_surfaces, 5750 &brw_renderbuffer_read_surfaces, 5751 &brw_texture_surfaces, 5752 5753 &genX(push_constant_packets), 5754 5755 &brw_vs_binding_table, 5756 &brw_tcs_binding_table, 5757 &brw_tes_binding_table, 5758 &brw_gs_binding_table, 5759 &brw_wm_binding_table, 5760 5761 &genX(fs_samplers), 5762 &genX(vs_samplers), 5763 &genX(tcs_samplers), 5764 &genX(tes_samplers), 5765 &genX(gs_samplers), 5766 &genX(multisample_state), 5767 5768 &genX(vs_state), 5769 &genX(hs_state), 5770 &genX(te_state), 5771 &genX(ds_state), 5772 &genX(gs_state), 5773 &genX(sol_state), 5774 &genX(clip_state), 5775 &genX(sbe_state), 5776 &genX(sf_state), 5777 &genX(wm_state), 5778 &genX(ps_state), 5779 5780 &genX(scissor_state), 5781 5782 &brw_depthbuffer, 5783 5784 &genX(polygon_stipple), 5785 &genX(polygon_stipple_offset), 5786 5787 &genX(line_stipple), 5788 5789 &genX(drawing_rect), 5790 5791 &brw_indices, /* must come before brw_vertices */ 5792 &genX(index_buffer), 5793 &genX(vertices), 5794 5795#if GFX_VERx10 == 75 5796 &genX(cut_index), 5797#endif 5798 }; 5799#elif GFX_VER >= 8 5800 static const struct brw_tracked_state *render_atoms[] = 5801 { 5802 &genX(vf_statistics), 5803 5804 &genX(cc_vp), 5805 &genX(sf_clip_viewport), 5806 5807 &gfx7_l3_state, 5808 &gfx7_push_constant_space, 5809 &gfx7_urb, 5810 &genX(blend_state), 5811 &genX(color_calc_state), 5812 5813 &brw_vs_image_surfaces, /* Before vs push/pull constants and binding table */ 5814 &brw_tcs_image_surfaces, /* Before tcs push/pull constants and binding table */ 5815 &brw_tes_image_surfaces, /* Before tes push/pull constants and binding table */ 5816 &brw_gs_image_surfaces, /* Before gs push/pull constants and binding table */ 5817 &brw_wm_image_surfaces, /* Before wm push/pull constants and binding table */ 5818 5819 &genX(vs_push_constants), /* Before vs_state */ 5820 &genX(tcs_push_constants), 5821 &genX(tes_push_constants), 5822 &genX(gs_push_constants), /* Before gs_state */ 5823 &genX(wm_push_constants), /* Before wm_surfaces and constant_buffer */ 5824 5825 /* Surface state setup. Must come before the VS/WM unit. The binding 5826 * table upload must be last. 5827 */ 5828 &brw_vs_pull_constants, 5829 &brw_vs_ubo_surfaces, 5830 &brw_tcs_pull_constants, 5831 &brw_tcs_ubo_surfaces, 5832 &brw_tes_pull_constants, 5833 &brw_tes_ubo_surfaces, 5834 &brw_gs_pull_constants, 5835 &brw_gs_ubo_surfaces, 5836 &brw_wm_pull_constants, 5837 &brw_wm_ubo_surfaces, 5838 &gfx6_renderbuffer_surfaces, 5839 &brw_renderbuffer_read_surfaces, 5840 &brw_texture_surfaces, 5841 5842 &genX(push_constant_packets), 5843 5844 &brw_vs_binding_table, 5845 &brw_tcs_binding_table, 5846 &brw_tes_binding_table, 5847 &brw_gs_binding_table, 5848 &brw_wm_binding_table, 5849 5850 &genX(fs_samplers), 5851 &genX(vs_samplers), 5852 &genX(tcs_samplers), 5853 &genX(tes_samplers), 5854 &genX(gs_samplers), 5855 &genX(multisample_state), 5856 5857 &genX(vs_state), 5858 &genX(hs_state), 5859 &genX(te_state), 5860 &genX(ds_state), 5861 &genX(gs_state), 5862 &genX(sol_state), 5863 &genX(clip_state), 5864 &genX(raster_state), 5865 &genX(sbe_state), 5866 &genX(sf_state), 5867 &genX(ps_blend), 5868 &genX(ps_extra), 5869 &genX(ps_state), 5870 &genX(depth_stencil_state), 5871 &genX(wm_state), 5872 5873 &genX(scissor_state), 5874 5875 &brw_depthbuffer, 5876 5877 &genX(polygon_stipple), 5878 &genX(polygon_stipple_offset), 5879 5880 &genX(line_stipple), 5881 5882 &genX(drawing_rect), 5883 5884 &genX(vf_topology), 5885 5886 &brw_indices, 5887 &genX(index_buffer), 5888 &genX(vertices), 5889 5890 &genX(cut_index), 5891 &gfx8_pma_fix, 5892 }; 5893#endif 5894 5895 STATIC_ASSERT(ARRAY_SIZE(render_atoms) <= ARRAY_SIZE(brw->render_atoms)); 5896 brw_copy_pipeline_atoms(brw, BRW_RENDER_PIPELINE, 5897 render_atoms, ARRAY_SIZE(render_atoms)); 5898 5899#if GFX_VER >= 7 5900 static const struct brw_tracked_state *compute_atoms[] = 5901 { 5902 &gfx7_l3_state, 5903 &brw_cs_image_surfaces, 5904 &genX(cs_push_constants), 5905 &genX(cs_pull_constants), 5906 &brw_cs_ubo_surfaces, 5907 &brw_cs_texture_surfaces, 5908 &brw_cs_work_groups_surface, 5909 &genX(cs_samplers), 5910 &genX(cs_state), 5911 }; 5912 5913 STATIC_ASSERT(ARRAY_SIZE(compute_atoms) <= ARRAY_SIZE(brw->compute_atoms)); 5914 brw_copy_pipeline_atoms(brw, BRW_COMPUTE_PIPELINE, 5915 compute_atoms, ARRAY_SIZE(compute_atoms)); 5916 5917 brw->vtbl.emit_mi_report_perf_count = genX(emit_mi_report_perf_count); 5918 brw->vtbl.emit_compute_walker = genX(emit_gpgpu_walker); 5919#endif 5920 5921 assert(brw->screen->devinfo.verx10 == GFX_VERx10); 5922} 5923