1/* 2 * Copyright © 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include "anv_private.h" 25 26#include "genxml/gen_macros.h" 27#include "genxml/genX_pack.h" 28 29#include "common/gen_l3_config.h" 30#include "common/gen_sample_positions.h" 31#include "nir/nir_xfb_info.h" 32#include "vk_util.h" 33#include "vk_format_info.h" 34 35static uint32_t 36vertex_element_comp_control(enum isl_format format, unsigned comp) 37{ 38 uint8_t bits; 39 switch (comp) { 40 case 0: bits = isl_format_layouts[format].channels.r.bits; break; 41 case 1: bits = isl_format_layouts[format].channels.g.bits; break; 42 case 2: bits = isl_format_layouts[format].channels.b.bits; break; 43 case 3: bits = isl_format_layouts[format].channels.a.bits; break; 44 default: unreachable("Invalid component"); 45 } 46 47 /* 48 * Take in account hardware restrictions when dealing with 64-bit floats. 
49 * 50 * From Broadwell spec, command reference structures, page 586: 51 * "When SourceElementFormat is set to one of the *64*_PASSTHRU formats, 52 * 64-bit components are stored * in the URB without any conversion. In 53 * this case, vertex elements must be written as 128 or 256 bits, with 54 * VFCOMP_STORE_0 being used to pad the output as required. E.g., if 55 * R64_PASSTHRU is used to copy a 64-bit Red component into the URB, 56 * Component 1 must be specified as VFCOMP_STORE_0 (with Components 2,3 57 * set to VFCOMP_NOSTORE) in order to output a 128-bit vertex element, or 58 * Components 1-3 must be specified as VFCOMP_STORE_0 in order to output 59 * a 256-bit vertex element. Likewise, use of R64G64B64_PASSTHRU requires 60 * Component 3 to be specified as VFCOMP_STORE_0 in order to output a 61 * 256-bit vertex element." 62 */ 63 if (bits) { 64 return VFCOMP_STORE_SRC; 65 } else if (comp >= 2 && 66 !isl_format_layouts[format].channels.b.bits && 67 isl_format_layouts[format].channels.r.type == ISL_RAW) { 68 /* When emitting 64-bit attributes, we need to write either 128 or 256 69 * bit chunks, using VFCOMP_NOSTORE when not writing the chunk, and 70 * VFCOMP_STORE_0 to pad the written chunk */ 71 return VFCOMP_NOSTORE; 72 } else if (comp < 3 || 73 isl_format_layouts[format].channels.r.type == ISL_RAW) { 74 /* Note we need to pad with value 0, not 1, due hardware restrictions 75 * (see comment above) */ 76 return VFCOMP_STORE_0; 77 } else if (isl_format_layouts[format].channels.r.type == ISL_UINT || 78 isl_format_layouts[format].channels.r.type == ISL_SINT) { 79 assert(comp == 3); 80 return VFCOMP_STORE_1_INT; 81 } else { 82 assert(comp == 3); 83 return VFCOMP_STORE_1_FP; 84 } 85} 86 87static void 88emit_vertex_input(struct anv_pipeline *pipeline, 89 const VkPipelineVertexInputStateCreateInfo *info) 90{ 91 const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline); 92 93 /* Pull inputs_read out of the VS prog data */ 94 const uint64_t inputs_read = 
vs_prog_data->inputs_read; 95 const uint64_t double_inputs_read = 96 vs_prog_data->double_inputs_read & inputs_read; 97 assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0); 98 const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0; 99 const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0; 100 const bool needs_svgs_elem = vs_prog_data->uses_vertexid || 101 vs_prog_data->uses_instanceid || 102 vs_prog_data->uses_firstvertex || 103 vs_prog_data->uses_baseinstance; 104 105 uint32_t elem_count = __builtin_popcount(elements) - 106 __builtin_popcount(elements_double) / 2; 107 108 const uint32_t total_elems = 109 MAX2(1, elem_count + needs_svgs_elem + vs_prog_data->uses_drawid); 110 111 uint32_t *p; 112 113 const uint32_t num_dwords = 1 + total_elems * 2; 114 p = anv_batch_emitn(&pipeline->batch, num_dwords, 115 GENX(3DSTATE_VERTEX_ELEMENTS)); 116 if (!p) 117 return; 118 119 for (uint32_t i = 0; i < total_elems; i++) { 120 /* The SKL docs for VERTEX_ELEMENT_STATE say: 121 * 122 * "All elements must be valid from Element[0] to the last valid 123 * element. (I.e. if Element[2] is valid then Element[1] and 124 * Element[0] must also be valid)." 125 * 126 * The SKL docs for 3D_Vertex_Component_Control say: 127 * 128 * "Don't store this component. (Not valid for Component 0, but can 129 * be used for Component 1-3)." 130 * 131 * So we can't just leave a vertex element blank and hope for the best. 132 * We have to tell the VF hardware to put something in it; so we just 133 * store a bunch of zero. 134 * 135 * TODO: Compact vertex elements so we never end up with holes. 
136 */ 137 struct GENX(VERTEX_ELEMENT_STATE) element = { 138 .Valid = true, 139 .Component0Control = VFCOMP_STORE_0, 140 .Component1Control = VFCOMP_STORE_0, 141 .Component2Control = VFCOMP_STORE_0, 142 .Component3Control = VFCOMP_STORE_0, 143 }; 144 GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + i * 2], &element); 145 } 146 147 for (uint32_t i = 0; i < info->vertexAttributeDescriptionCount; i++) { 148 const VkVertexInputAttributeDescription *desc = 149 &info->pVertexAttributeDescriptions[i]; 150 enum isl_format format = anv_get_isl_format(&pipeline->device->info, 151 desc->format, 152 VK_IMAGE_ASPECT_COLOR_BIT, 153 VK_IMAGE_TILING_LINEAR); 154 155 assert(desc->binding < MAX_VBS); 156 157 if ((elements & (1 << desc->location)) == 0) 158 continue; /* Binding unused */ 159 160 uint32_t slot = 161 __builtin_popcount(elements & ((1 << desc->location) - 1)) - 162 DIV_ROUND_UP(__builtin_popcount(elements_double & 163 ((1 << desc->location) -1)), 2); 164 165 struct GENX(VERTEX_ELEMENT_STATE) element = { 166 .VertexBufferIndex = desc->binding, 167 .Valid = true, 168 .SourceElementFormat = format, 169 .EdgeFlagEnable = false, 170 .SourceElementOffset = desc->offset, 171 .Component0Control = vertex_element_comp_control(format, 0), 172 .Component1Control = vertex_element_comp_control(format, 1), 173 .Component2Control = vertex_element_comp_control(format, 2), 174 .Component3Control = vertex_element_comp_control(format, 3), 175 }; 176 GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + slot * 2], &element); 177 178#if GEN_GEN >= 8 179 /* On Broadwell and later, we have a separate VF_INSTANCING packet 180 * that controls instancing. On Haswell and prior, that's part of 181 * VERTEX_BUFFER_STATE which we emit later. 
182 */ 183 anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_INSTANCING), vfi) { 184 vfi.InstancingEnable = pipeline->vb[desc->binding].instanced; 185 vfi.VertexElementIndex = slot; 186 vfi.InstanceDataStepRate = 187 pipeline->vb[desc->binding].instance_divisor; 188 } 189#endif 190 } 191 192 const uint32_t id_slot = elem_count; 193 if (needs_svgs_elem) { 194 /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum: 195 * "Within a VERTEX_ELEMENT_STATE structure, if a Component 196 * Control field is set to something other than VFCOMP_STORE_SRC, 197 * no higher-numbered Component Control fields may be set to 198 * VFCOMP_STORE_SRC" 199 * 200 * This means, that if we have BaseInstance, we need BaseVertex as 201 * well. Just do all or nothing. 202 */ 203 uint32_t base_ctrl = (vs_prog_data->uses_firstvertex || 204 vs_prog_data->uses_baseinstance) ? 205 VFCOMP_STORE_SRC : VFCOMP_STORE_0; 206 207 struct GENX(VERTEX_ELEMENT_STATE) element = { 208 .VertexBufferIndex = ANV_SVGS_VB_INDEX, 209 .Valid = true, 210 .SourceElementFormat = ISL_FORMAT_R32G32_UINT, 211 .Component0Control = base_ctrl, 212 .Component1Control = base_ctrl, 213#if GEN_GEN >= 8 214 .Component2Control = VFCOMP_STORE_0, 215 .Component3Control = VFCOMP_STORE_0, 216#else 217 .Component2Control = VFCOMP_STORE_VID, 218 .Component3Control = VFCOMP_STORE_IID, 219#endif 220 }; 221 GENX(VERTEX_ELEMENT_STATE_pack)(NULL, &p[1 + id_slot * 2], &element); 222 } 223 224#if GEN_GEN >= 8 225 anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_SGVS), sgvs) { 226 sgvs.VertexIDEnable = vs_prog_data->uses_vertexid; 227 sgvs.VertexIDComponentNumber = 2; 228 sgvs.VertexIDElementOffset = id_slot; 229 sgvs.InstanceIDEnable = vs_prog_data->uses_instanceid; 230 sgvs.InstanceIDComponentNumber = 3; 231 sgvs.InstanceIDElementOffset = id_slot; 232 } 233#endif 234 235 const uint32_t drawid_slot = elem_count + needs_svgs_elem; 236 if (vs_prog_data->uses_drawid) { 237 struct GENX(VERTEX_ELEMENT_STATE) element = { 238 
.VertexBufferIndex = ANV_DRAWID_VB_INDEX, 239 .Valid = true, 240 .SourceElementFormat = ISL_FORMAT_R32_UINT, 241 .Component0Control = VFCOMP_STORE_SRC, 242 .Component1Control = VFCOMP_STORE_0, 243 .Component2Control = VFCOMP_STORE_0, 244 .Component3Control = VFCOMP_STORE_0, 245 }; 246 GENX(VERTEX_ELEMENT_STATE_pack)(NULL, 247 &p[1 + drawid_slot * 2], 248 &element); 249 250#if GEN_GEN >= 8 251 anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_INSTANCING), vfi) { 252 vfi.VertexElementIndex = drawid_slot; 253 } 254#endif 255 } 256} 257 258void 259genX(emit_urb_setup)(struct anv_device *device, struct anv_batch *batch, 260 const struct gen_l3_config *l3_config, 261 VkShaderStageFlags active_stages, 262 const unsigned entry_size[4]) 263{ 264 const struct gen_device_info *devinfo = &device->info; 265#if GEN_IS_HASWELL 266 const unsigned push_constant_kb = devinfo->gt == 3 ? 32 : 16; 267#else 268 const unsigned push_constant_kb = GEN_GEN >= 8 ? 32 : 16; 269#endif 270 271 const unsigned urb_size_kb = gen_get_l3_config_urb_size(devinfo, l3_config); 272 273 unsigned entries[4]; 274 unsigned start[4]; 275 gen_get_urb_config(devinfo, 276 1024 * push_constant_kb, 1024 * urb_size_kb, 277 active_stages & 278 VK_SHADER_STAGE_TESSELLATION_EVALUATION_BIT, 279 active_stages & VK_SHADER_STAGE_GEOMETRY_BIT, 280 entry_size, entries, start); 281 282#if GEN_GEN == 7 && !GEN_IS_HASWELL 283 /* From the IVB PRM Vol. 2, Part 1, Section 3.2.1: 284 * 285 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth stall 286 * needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS, 287 * 3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS, 288 * 3DSTATE_SAMPLER_STATE_POINTER_VS command. Only one PIPE_CONTROL 289 * needs to be sent before any combination of VS associated 3DSTATE." 
290 */ 291 anv_batch_emit(batch, GEN7_PIPE_CONTROL, pc) { 292 pc.DepthStallEnable = true; 293 pc.PostSyncOperation = WriteImmediateData; 294 pc.Address = (struct anv_address) { &device->workaround_bo, 0 }; 295 } 296#endif 297 298 for (int i = 0; i <= MESA_SHADER_GEOMETRY; i++) { 299 anv_batch_emit(batch, GENX(3DSTATE_URB_VS), urb) { 300 urb._3DCommandSubOpcode += i; 301 urb.VSURBStartingAddress = start[i]; 302 urb.VSURBEntryAllocationSize = entry_size[i] - 1; 303 urb.VSNumberofURBEntries = entries[i]; 304 } 305 } 306} 307 308static void 309emit_urb_setup(struct anv_pipeline *pipeline) 310{ 311 unsigned entry_size[4]; 312 for (int i = MESA_SHADER_VERTEX; i <= MESA_SHADER_GEOMETRY; i++) { 313 const struct brw_vue_prog_data *prog_data = 314 !anv_pipeline_has_stage(pipeline, i) ? NULL : 315 (const struct brw_vue_prog_data *) pipeline->shaders[i]->prog_data; 316 317 entry_size[i] = prog_data ? prog_data->urb_entry_size : 1; 318 } 319 320 genX(emit_urb_setup)(pipeline->device, &pipeline->batch, 321 pipeline->urb.l3_config, 322 pipeline->active_stages, entry_size); 323} 324 325static void 326emit_3dstate_sbe(struct anv_pipeline *pipeline) 327{ 328 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); 329 330 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { 331 anv_batch_emit(&pipeline->batch, GENX(3DSTATE_SBE), sbe); 332#if GEN_GEN >= 8 333 anv_batch_emit(&pipeline->batch, GENX(3DSTATE_SBE_SWIZ), sbe); 334#endif 335 return; 336 } 337 338 const struct brw_vue_map *fs_input_map = 339 &anv_pipeline_get_last_vue_prog_data(pipeline)->vue_map; 340 341 struct GENX(3DSTATE_SBE) sbe = { 342 GENX(3DSTATE_SBE_header), 343 .AttributeSwizzleEnable = true, 344 .PointSpriteTextureCoordinateOrigin = UPPERLEFT, 345 .NumberofSFOutputAttributes = wm_prog_data->num_varying_inputs, 346 .ConstantInterpolationEnable = wm_prog_data->flat_inputs, 347 }; 348 349#if GEN_GEN >= 9 350 for (unsigned i = 0; i < 32; i++) 351 sbe.AttributeActiveComponentFormat[i] = 
ACF_XYZW; 352#endif 353 354#if GEN_GEN >= 8 355 /* On Broadwell, they broke 3DSTATE_SBE into two packets */ 356 struct GENX(3DSTATE_SBE_SWIZ) swiz = { 357 GENX(3DSTATE_SBE_SWIZ_header), 358 }; 359#else 360# define swiz sbe 361#endif 362 363 /* Skip the VUE header and position slots by default */ 364 unsigned urb_entry_read_offset = 1; 365 int max_source_attr = 0; 366 for (int attr = 0; attr < VARYING_SLOT_MAX; attr++) { 367 int input_index = wm_prog_data->urb_setup[attr]; 368 369 if (input_index < 0) 370 continue; 371 372 /* gl_Layer is stored in the VUE header */ 373 if (attr == VARYING_SLOT_LAYER) { 374 urb_entry_read_offset = 0; 375 continue; 376 } 377 378 if (attr == VARYING_SLOT_PNTC) { 379 sbe.PointSpriteTextureCoordinateEnable = 1 << input_index; 380 continue; 381 } 382 383 const int slot = fs_input_map->varying_to_slot[attr]; 384 385 if (input_index >= 16) 386 continue; 387 388 if (slot == -1) { 389 /* This attribute does not exist in the VUE--that means that the 390 * vertex shader did not write to it. It could be that it's a 391 * regular varying read by the fragment shader but not written by 392 * the vertex shader or it's gl_PrimitiveID. In the first case the 393 * value is undefined, in the second it needs to be 394 * gl_PrimitiveID. 395 */ 396 swiz.Attribute[input_index].ConstantSource = PRIM_ID; 397 swiz.Attribute[input_index].ComponentOverrideX = true; 398 swiz.Attribute[input_index].ComponentOverrideY = true; 399 swiz.Attribute[input_index].ComponentOverrideZ = true; 400 swiz.Attribute[input_index].ComponentOverrideW = true; 401 } else { 402 /* We have to subtract two slots to accout for the URB entry output 403 * read offset in the VS and GS stages. 
404 */ 405 const int source_attr = slot - 2 * urb_entry_read_offset; 406 assert(source_attr >= 0 && source_attr < 32); 407 max_source_attr = MAX2(max_source_attr, source_attr); 408 swiz.Attribute[input_index].SourceAttribute = source_attr; 409 } 410 } 411 412 sbe.VertexURBEntryReadOffset = urb_entry_read_offset; 413 sbe.VertexURBEntryReadLength = DIV_ROUND_UP(max_source_attr + 1, 2); 414#if GEN_GEN >= 8 415 sbe.ForceVertexURBEntryReadOffset = true; 416 sbe.ForceVertexURBEntryReadLength = true; 417#endif 418 419 uint32_t *dw = anv_batch_emit_dwords(&pipeline->batch, 420 GENX(3DSTATE_SBE_length)); 421 if (!dw) 422 return; 423 GENX(3DSTATE_SBE_pack)(&pipeline->batch, dw, &sbe); 424 425#if GEN_GEN >= 8 426 dw = anv_batch_emit_dwords(&pipeline->batch, GENX(3DSTATE_SBE_SWIZ_length)); 427 if (!dw) 428 return; 429 GENX(3DSTATE_SBE_SWIZ_pack)(&pipeline->batch, dw, &swiz); 430#endif 431} 432 433static const uint32_t vk_to_gen_cullmode[] = { 434 [VK_CULL_MODE_NONE] = CULLMODE_NONE, 435 [VK_CULL_MODE_FRONT_BIT] = CULLMODE_FRONT, 436 [VK_CULL_MODE_BACK_BIT] = CULLMODE_BACK, 437 [VK_CULL_MODE_FRONT_AND_BACK] = CULLMODE_BOTH 438}; 439 440static const uint32_t vk_to_gen_fillmode[] = { 441 [VK_POLYGON_MODE_FILL] = FILL_MODE_SOLID, 442 [VK_POLYGON_MODE_LINE] = FILL_MODE_WIREFRAME, 443 [VK_POLYGON_MODE_POINT] = FILL_MODE_POINT, 444}; 445 446static const uint32_t vk_to_gen_front_face[] = { 447 [VK_FRONT_FACE_COUNTER_CLOCKWISE] = 1, 448 [VK_FRONT_FACE_CLOCKWISE] = 0 449}; 450 451static void 452emit_rs_state(struct anv_pipeline *pipeline, 453 const VkPipelineRasterizationStateCreateInfo *rs_info, 454 const VkPipelineMultisampleStateCreateInfo *ms_info, 455 const struct anv_render_pass *pass, 456 const struct anv_subpass *subpass) 457{ 458 struct GENX(3DSTATE_SF) sf = { 459 GENX(3DSTATE_SF_header), 460 }; 461 462 sf.ViewportTransformEnable = true; 463 sf.StatisticsEnable = true; 464 sf.TriangleStripListProvokingVertexSelect = 0; 465 sf.LineStripListProvokingVertexSelect = 0; 466 
sf.TriangleFanProvokingVertexSelect = 1; 467 sf.VertexSubPixelPrecisionSelect = _8Bit; 468 469 const struct brw_vue_prog_data *last_vue_prog_data = 470 anv_pipeline_get_last_vue_prog_data(pipeline); 471 472 if (last_vue_prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) { 473 sf.PointWidthSource = Vertex; 474 } else { 475 sf.PointWidthSource = State; 476 sf.PointWidth = 1.0; 477 } 478 479#if GEN_GEN >= 8 480 struct GENX(3DSTATE_RASTER) raster = { 481 GENX(3DSTATE_RASTER_header), 482 }; 483#else 484# define raster sf 485#endif 486 487 /* For details on 3DSTATE_RASTER multisample state, see the BSpec table 488 * "Multisample Modes State". 489 */ 490#if GEN_GEN >= 8 491 raster.DXMultisampleRasterizationEnable = true; 492 /* NOTE: 3DSTATE_RASTER::ForcedSampleCount affects the BDW and SKL PMA fix 493 * computations. If we ever set this bit to a different value, they will 494 * need to be updated accordingly. 495 */ 496 raster.ForcedSampleCount = FSC_NUMRASTSAMPLES_0; 497 raster.ForceMultisampling = false; 498#else 499 raster.MultisampleRasterizationMode = 500 (ms_info && ms_info->rasterizationSamples > 1) ? 
501 MSRASTMODE_ON_PATTERN : MSRASTMODE_OFF_PIXEL; 502#endif 503 504 raster.FrontWinding = vk_to_gen_front_face[rs_info->frontFace]; 505 raster.CullMode = vk_to_gen_cullmode[rs_info->cullMode]; 506 raster.FrontFaceFillMode = vk_to_gen_fillmode[rs_info->polygonMode]; 507 raster.BackFaceFillMode = vk_to_gen_fillmode[rs_info->polygonMode]; 508 raster.ScissorRectangleEnable = true; 509 510#if GEN_GEN >= 9 511 /* GEN9+ splits ViewportZClipTestEnable into near and far enable bits */ 512 raster.ViewportZFarClipTestEnable = pipeline->depth_clip_enable; 513 raster.ViewportZNearClipTestEnable = pipeline->depth_clip_enable; 514#elif GEN_GEN >= 8 515 raster.ViewportZClipTestEnable = pipeline->depth_clip_enable; 516#endif 517 518 raster.GlobalDepthOffsetEnableSolid = rs_info->depthBiasEnable; 519 raster.GlobalDepthOffsetEnableWireframe = rs_info->depthBiasEnable; 520 raster.GlobalDepthOffsetEnablePoint = rs_info->depthBiasEnable; 521 522#if GEN_GEN == 7 523 /* Gen7 requires that we provide the depth format in 3DSTATE_SF so that it 524 * can get the depth offsets correct. 
525 */ 526 if (subpass->depth_stencil_attachment) { 527 VkFormat vk_format = 528 pass->attachments[subpass->depth_stencil_attachment->attachment].format; 529 assert(vk_format_is_depth_or_stencil(vk_format)); 530 if (vk_format_aspects(vk_format) & VK_IMAGE_ASPECT_DEPTH_BIT) { 531 enum isl_format isl_format = 532 anv_get_isl_format(&pipeline->device->info, vk_format, 533 VK_IMAGE_ASPECT_DEPTH_BIT, 534 VK_IMAGE_TILING_OPTIMAL); 535 sf.DepthBufferSurfaceFormat = 536 isl_format_get_depth_format(isl_format, false); 537 } 538 } 539#endif 540 541#if GEN_GEN >= 8 542 GENX(3DSTATE_SF_pack)(NULL, pipeline->gen8.sf, &sf); 543 GENX(3DSTATE_RASTER_pack)(NULL, pipeline->gen8.raster, &raster); 544#else 545# undef raster 546 GENX(3DSTATE_SF_pack)(NULL, &pipeline->gen7.sf, &sf); 547#endif 548} 549 550static void 551emit_ms_state(struct anv_pipeline *pipeline, 552 const VkPipelineMultisampleStateCreateInfo *info) 553{ 554 uint32_t samples = 1; 555 uint32_t log2_samples = 0; 556 557 /* From the Vulkan 1.0 spec: 558 * If pSampleMask is NULL, it is treated as if the mask has all bits 559 * enabled, i.e. no coverage is removed from fragments. 560 * 561 * 3DSTATE_SAMPLE_MASK.SampleMask is 16 bits. 562 */ 563#if GEN_GEN >= 8 564 uint32_t sample_mask = 0xffff; 565#else 566 uint32_t sample_mask = 0xff; 567#endif 568 569 if (info) { 570 samples = info->rasterizationSamples; 571 log2_samples = __builtin_ffs(samples) - 1; 572 } 573 574 if (info && info->pSampleMask) 575 sample_mask &= info->pSampleMask[0]; 576 577 anv_batch_emit(&pipeline->batch, GENX(3DSTATE_MULTISAMPLE), ms) { 578 ms.NumberofMultisamples = log2_samples; 579 580 ms.PixelLocation = CENTER; 581#if GEN_GEN >= 8 582 /* The PRM says that this bit is valid only for DX9: 583 * 584 * SW can choose to set this bit only for DX9 API. DX10/OGL API's 585 * should not have any effect by setting or not setting this bit. 
586 */ 587 ms.PixelPositionOffsetEnable = false; 588#else 589 590 switch (samples) { 591 case 1: 592 GEN_SAMPLE_POS_1X(ms.Sample); 593 break; 594 case 2: 595 GEN_SAMPLE_POS_2X(ms.Sample); 596 break; 597 case 4: 598 GEN_SAMPLE_POS_4X(ms.Sample); 599 break; 600 case 8: 601 GEN_SAMPLE_POS_8X(ms.Sample); 602 break; 603 default: 604 break; 605 } 606#endif 607 } 608 609 anv_batch_emit(&pipeline->batch, GENX(3DSTATE_SAMPLE_MASK), sm) { 610 sm.SampleMask = sample_mask; 611 } 612} 613 614static const uint32_t vk_to_gen_logic_op[] = { 615 [VK_LOGIC_OP_COPY] = LOGICOP_COPY, 616 [VK_LOGIC_OP_CLEAR] = LOGICOP_CLEAR, 617 [VK_LOGIC_OP_AND] = LOGICOP_AND, 618 [VK_LOGIC_OP_AND_REVERSE] = LOGICOP_AND_REVERSE, 619 [VK_LOGIC_OP_AND_INVERTED] = LOGICOP_AND_INVERTED, 620 [VK_LOGIC_OP_NO_OP] = LOGICOP_NOOP, 621 [VK_LOGIC_OP_XOR] = LOGICOP_XOR, 622 [VK_LOGIC_OP_OR] = LOGICOP_OR, 623 [VK_LOGIC_OP_NOR] = LOGICOP_NOR, 624 [VK_LOGIC_OP_EQUIVALENT] = LOGICOP_EQUIV, 625 [VK_LOGIC_OP_INVERT] = LOGICOP_INVERT, 626 [VK_LOGIC_OP_OR_REVERSE] = LOGICOP_OR_REVERSE, 627 [VK_LOGIC_OP_COPY_INVERTED] = LOGICOP_COPY_INVERTED, 628 [VK_LOGIC_OP_OR_INVERTED] = LOGICOP_OR_INVERTED, 629 [VK_LOGIC_OP_NAND] = LOGICOP_NAND, 630 [VK_LOGIC_OP_SET] = LOGICOP_SET, 631}; 632 633static const uint32_t vk_to_gen_blend[] = { 634 [VK_BLEND_FACTOR_ZERO] = BLENDFACTOR_ZERO, 635 [VK_BLEND_FACTOR_ONE] = BLENDFACTOR_ONE, 636 [VK_BLEND_FACTOR_SRC_COLOR] = BLENDFACTOR_SRC_COLOR, 637 [VK_BLEND_FACTOR_ONE_MINUS_SRC_COLOR] = BLENDFACTOR_INV_SRC_COLOR, 638 [VK_BLEND_FACTOR_DST_COLOR] = BLENDFACTOR_DST_COLOR, 639 [VK_BLEND_FACTOR_ONE_MINUS_DST_COLOR] = BLENDFACTOR_INV_DST_COLOR, 640 [VK_BLEND_FACTOR_SRC_ALPHA] = BLENDFACTOR_SRC_ALPHA, 641 [VK_BLEND_FACTOR_ONE_MINUS_SRC_ALPHA] = BLENDFACTOR_INV_SRC_ALPHA, 642 [VK_BLEND_FACTOR_DST_ALPHA] = BLENDFACTOR_DST_ALPHA, 643 [VK_BLEND_FACTOR_ONE_MINUS_DST_ALPHA] = BLENDFACTOR_INV_DST_ALPHA, 644 [VK_BLEND_FACTOR_CONSTANT_COLOR] = BLENDFACTOR_CONST_COLOR, 645 
[VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_COLOR]= BLENDFACTOR_INV_CONST_COLOR, 646 [VK_BLEND_FACTOR_CONSTANT_ALPHA] = BLENDFACTOR_CONST_ALPHA, 647 [VK_BLEND_FACTOR_ONE_MINUS_CONSTANT_ALPHA]= BLENDFACTOR_INV_CONST_ALPHA, 648 [VK_BLEND_FACTOR_SRC_ALPHA_SATURATE] = BLENDFACTOR_SRC_ALPHA_SATURATE, 649 [VK_BLEND_FACTOR_SRC1_COLOR] = BLENDFACTOR_SRC1_COLOR, 650 [VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR] = BLENDFACTOR_INV_SRC1_COLOR, 651 [VK_BLEND_FACTOR_SRC1_ALPHA] = BLENDFACTOR_SRC1_ALPHA, 652 [VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA] = BLENDFACTOR_INV_SRC1_ALPHA, 653}; 654 655static const uint32_t vk_to_gen_blend_op[] = { 656 [VK_BLEND_OP_ADD] = BLENDFUNCTION_ADD, 657 [VK_BLEND_OP_SUBTRACT] = BLENDFUNCTION_SUBTRACT, 658 [VK_BLEND_OP_REVERSE_SUBTRACT] = BLENDFUNCTION_REVERSE_SUBTRACT, 659 [VK_BLEND_OP_MIN] = BLENDFUNCTION_MIN, 660 [VK_BLEND_OP_MAX] = BLENDFUNCTION_MAX, 661}; 662 663static const uint32_t vk_to_gen_compare_op[] = { 664 [VK_COMPARE_OP_NEVER] = PREFILTEROPNEVER, 665 [VK_COMPARE_OP_LESS] = PREFILTEROPLESS, 666 [VK_COMPARE_OP_EQUAL] = PREFILTEROPEQUAL, 667 [VK_COMPARE_OP_LESS_OR_EQUAL] = PREFILTEROPLEQUAL, 668 [VK_COMPARE_OP_GREATER] = PREFILTEROPGREATER, 669 [VK_COMPARE_OP_NOT_EQUAL] = PREFILTEROPNOTEQUAL, 670 [VK_COMPARE_OP_GREATER_OR_EQUAL] = PREFILTEROPGEQUAL, 671 [VK_COMPARE_OP_ALWAYS] = PREFILTEROPALWAYS, 672}; 673 674static const uint32_t vk_to_gen_stencil_op[] = { 675 [VK_STENCIL_OP_KEEP] = STENCILOP_KEEP, 676 [VK_STENCIL_OP_ZERO] = STENCILOP_ZERO, 677 [VK_STENCIL_OP_REPLACE] = STENCILOP_REPLACE, 678 [VK_STENCIL_OP_INCREMENT_AND_CLAMP] = STENCILOP_INCRSAT, 679 [VK_STENCIL_OP_DECREMENT_AND_CLAMP] = STENCILOP_DECRSAT, 680 [VK_STENCIL_OP_INVERT] = STENCILOP_INVERT, 681 [VK_STENCIL_OP_INCREMENT_AND_WRAP] = STENCILOP_INCR, 682 [VK_STENCIL_OP_DECREMENT_AND_WRAP] = STENCILOP_DECR, 683}; 684 685/* This function sanitizes the VkStencilOpState by looking at the compare ops 686 * and trying to determine whether or not a given stencil op can ever actually 687 * occur. 
Stencil ops which can never occur are set to VK_STENCIL_OP_KEEP. 688 * This function returns true if, after sanitation, any of the stencil ops are 689 * set to something other than VK_STENCIL_OP_KEEP. 690 */ 691static bool 692sanitize_stencil_face(VkStencilOpState *face, 693 VkCompareOp depthCompareOp) 694{ 695 /* If compareOp is ALWAYS then the stencil test will never fail and failOp 696 * will never happen. Set failOp to KEEP in this case. 697 */ 698 if (face->compareOp == VK_COMPARE_OP_ALWAYS) 699 face->failOp = VK_STENCIL_OP_KEEP; 700 701 /* If compareOp is NEVER or depthCompareOp is NEVER then one of the depth 702 * or stencil tests will fail and passOp will never happen. 703 */ 704 if (face->compareOp == VK_COMPARE_OP_NEVER || 705 depthCompareOp == VK_COMPARE_OP_NEVER) 706 face->passOp = VK_STENCIL_OP_KEEP; 707 708 /* If compareOp is NEVER or depthCompareOp is ALWAYS then either the 709 * stencil test will fail or the depth test will pass. In either case, 710 * depthFailOp will never happen. 711 */ 712 if (face->compareOp == VK_COMPARE_OP_NEVER || 713 depthCompareOp == VK_COMPARE_OP_ALWAYS) 714 face->depthFailOp = VK_STENCIL_OP_KEEP; 715 716 return face->failOp != VK_STENCIL_OP_KEEP || 717 face->depthFailOp != VK_STENCIL_OP_KEEP || 718 face->passOp != VK_STENCIL_OP_KEEP; 719} 720 721/* Intel hardware is fairly sensitive to whether or not depth/stencil writes 722 * are enabled. In the presence of discards, it's fairly easy to get into the 723 * non-promoted case which means a fairly big performance hit. From the Iron 724 * Lake PRM, Vol 2, pt. 1, section 8.4.3.2, "Early Depth Test Cases": 725 * 726 * "Non-promoted depth (N) is active whenever the depth test can be done 727 * early but it cannot determine whether or not to write source depth to 728 * the depth buffer, therefore the depth write must be performed post pixel 729 * shader. 
This includes cases where the pixel shader can kill pixels, 730 * including via sampler chroma key, as well as cases where the alpha test 731 * function is enabled, which kills pixels based on a programmable alpha 732 * test. In this case, even if the depth test fails, the pixel cannot be 733 * killed if a stencil write is indicated. Whether or not the stencil write 734 * happens depends on whether or not the pixel is killed later. In these 735 * cases if stencil test fails and stencil writes are off, the pixels can 736 * also be killed early. If stencil writes are enabled, the pixels must be 737 * treated as Computed depth (described above)." 738 * 739 * The same thing as mentioned in the stencil case can happen in the depth 740 * case as well if it thinks it writes depth but, thanks to the depth test 741 * being GL_EQUAL, the write doesn't actually matter. A little extra work 742 * up-front to try and disable depth and stencil writes can make a big 743 * difference. 744 * 745 * Unfortunately, the way depth and stencil testing is specified, there are 746 * many case where, regardless of depth/stencil writes being enabled, nothing 747 * actually gets written due to some other bit of state being set. This 748 * function attempts to "sanitize" the depth stencil state and disable writes 749 * and sometimes even testing whenever possible. 750 */ 751static void 752sanitize_ds_state(VkPipelineDepthStencilStateCreateInfo *state, 753 bool *stencilWriteEnable, 754 VkImageAspectFlags ds_aspects) 755{ 756 *stencilWriteEnable = state->stencilTestEnable; 757 758 /* If the depth test is disabled, we won't be writing anything. Make sure we 759 * treat the test as always passing later on as well. 760 * 761 * Also, the Vulkan spec requires that if either depth or stencil is not 762 * present, the pipeline is to act as if the test silently passes. In that 763 * case we won't write either. 
    */
   if (!state->depthTestEnable || !(ds_aspects & VK_IMAGE_ASPECT_DEPTH_BIT)) {
      state->depthWriteEnable = false;
      state->depthCompareOp = VK_COMPARE_OP_ALWAYS;
   }

   /* No stencil aspect: stencil writes and tests are meaningless. */
   if (!(ds_aspects & VK_IMAGE_ASPECT_STENCIL_BIT)) {
      *stencilWriteEnable = false;
      state->front.compareOp = VK_COMPARE_OP_ALWAYS;
      state->back.compareOp = VK_COMPARE_OP_ALWAYS;
   }

   /* If the stencil test is enabled and always fails, then we will never get
    * to the depth test so we can just disable the depth test entirely.
    */
   if (state->stencilTestEnable &&
       state->front.compareOp == VK_COMPARE_OP_NEVER &&
       state->back.compareOp == VK_COMPARE_OP_NEVER) {
      state->depthTestEnable = false;
      state->depthWriteEnable = false;
   }

   /* If depthCompareOp is EQUAL then the value we would be writing to the
    * depth buffer is the same as the value that's already there so there's no
    * point in writing it.
    */
   if (state->depthCompareOp == VK_COMPARE_OP_EQUAL)
      state->depthWriteEnable = false;

   /* If the stencil ops are such that we don't actually ever modify the
    * stencil buffer, we should disable writes.
    */
   if (!sanitize_stencil_face(&state->front, state->depthCompareOp) &&
       !sanitize_stencil_face(&state->back, state->depthCompareOp))
      *stencilWriteEnable = false;

   /* If the depth test always passes and we never write out depth, that's the
    * same as if the depth test is disabled entirely.
    */
   if (state->depthCompareOp == VK_COMPARE_OP_ALWAYS &&
       !state->depthWriteEnable)
      state->depthTestEnable = false;

   /* If the stencil test always passes and we never write out stencil, that's
    * the same as if the stencil test is disabled entirely.
    */
   if (state->front.compareOp == VK_COMPARE_OP_ALWAYS &&
       state->back.compareOp == VK_COMPARE_OP_ALWAYS &&
       !*stencilWriteEnable)
      state->stencilTestEnable = false;
}

/* Pack the pipeline's static depth/stencil state.
 *
 * The Vulkan create info is first copied and sanitized (see
 * sanitize_ds_state above) so that redundant tests/writes are disabled,
 * then packed into the per-gen dword array selected by depth_stencil_dw.
 * Also records writes_depth / writes_stencil / *_test_enable flags on the
 * pipeline, which are consumed elsewhere (e.g. for HiZ decisions —
 * NOTE(review): consumers not visible in this chunk).
 */
static void
emit_ds_state(struct anv_pipeline *pipeline,
              const VkPipelineDepthStencilStateCreateInfo *pCreateInfo,
              const struct anv_render_pass *pass,
              const struct anv_subpass *subpass)
{
#if GEN_GEN == 7
# define depth_stencil_dw pipeline->gen7.depth_stencil_state
#elif GEN_GEN == 8
# define depth_stencil_dw pipeline->gen8.wm_depth_stencil
#else
# define depth_stencil_dw pipeline->gen9.wm_depth_stencil
#endif

   if (pCreateInfo == NULL) {
      /* We're going to OR this together with the dynamic state.  We need
       * to make sure it's initialized to something useful.
       *
       * NOTE(review): sizeof(depth_stencil_dw) assumes the macro expands to
       * an actual array member, not a pointer — confirm in anv_private.h.
       */
      pipeline->writes_stencil = false;
      pipeline->stencil_test_enable = false;
      pipeline->writes_depth = false;
      pipeline->depth_test_enable = false;
      memset(depth_stencil_dw, 0, sizeof(depth_stencil_dw));
      return;
   }

   /* Figure out which aspects the subpass's depth/stencil attachment
    * actually has; sanitize_ds_state uses this to kill meaningless state.
    */
   VkImageAspectFlags ds_aspects = 0;
   if (subpass->depth_stencil_attachment) {
      VkFormat depth_stencil_format =
         pass->attachments[subpass->depth_stencil_attachment->attachment].format;
      ds_aspects = vk_format_aspects(depth_stencil_format);
   }

   VkPipelineDepthStencilStateCreateInfo info = *pCreateInfo;
   sanitize_ds_state(&info, &pipeline->writes_stencil, ds_aspects);
   pipeline->stencil_test_enable = info.stencilTestEnable;
   pipeline->writes_depth = info.depthWriteEnable;
   pipeline->depth_test_enable = info.depthTestEnable;

   /* VkBool32 depthBoundsTestEnable; // optional (depth_bounds_test) */

#if GEN_GEN <= 7
   struct GENX(DEPTH_STENCIL_STATE) depth_stencil = {
#else
   struct GENX(3DSTATE_WM_DEPTH_STENCIL) depth_stencil = {
#endif
      .DepthTestEnable = info.depthTestEnable,
      .DepthBufferWriteEnable = info.depthWriteEnable,
      .DepthTestFunction = vk_to_gen_compare_op[info.depthCompareOp],
      .DoubleSidedStencilEnable = true,

      .StencilTestEnable = info.stencilTestEnable,
      .StencilFailOp = vk_to_gen_stencil_op[info.front.failOp],
      .StencilPassDepthPassOp = vk_to_gen_stencil_op[info.front.passOp],
      .StencilPassDepthFailOp = vk_to_gen_stencil_op[info.front.depthFailOp],
      .StencilTestFunction = vk_to_gen_compare_op[info.front.compareOp],
      .BackfaceStencilFailOp = vk_to_gen_stencil_op[info.back.failOp],
      .BackfaceStencilPassDepthPassOp = vk_to_gen_stencil_op[info.back.passOp],
      .BackfaceStencilPassDepthFailOp = vk_to_gen_stencil_op[info.back.depthFailOp],
      .BackfaceStencilTestFunction = vk_to_gen_compare_op[info.back.compareOp],
   };

#if GEN_GEN <= 7
   GENX(DEPTH_STENCIL_STATE_pack)(NULL, depth_stencil_dw, &depth_stencil);
#else
   GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, depth_stencil_dw, &depth_stencil);
#endif
}

/* True if the blend factor reads the second color source (SRC1), i.e.
 * requires the fragment shader to perform a dual-source RT write.
 */
MAYBE_UNUSED static bool
is_dual_src_blend_factor(VkBlendFactor factor)
{
   return factor == VK_BLEND_FACTOR_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_COLOR ||
          factor == VK_BLEND_FACTOR_SRC1_ALPHA ||
          factor == VK_BLEND_FACTOR_ONE_MINUS_SRC1_ALPHA;
}

/* Allocate and pack BLEND_STATE (one entry per color-attachment surface in
 * the fragment shader's binding table), emit 3DSTATE_PS_BLEND (gen8+) and
 * 3DSTATE_BLEND_STATE_POINTERS into the pipeline batch.
 */
static void
emit_cb_state(struct anv_pipeline *pipeline,
              const VkPipelineColorBlendStateCreateInfo *info,
              const VkPipelineMultisampleStateCreateInfo *ms_info)
{
   struct anv_device *device = pipeline->device;
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   struct GENX(BLEND_STATE) blend_state = {
#if GEN_GEN >= 8
      /* On gen8+ these live in the top-level BLEND_STATE; on gen7 they are
       * per-entry (see below).
       */
      .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
      .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
#endif
   };

   uint32_t surface_count = 0;
   struct anv_pipeline_bind_map *map;
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
      surface_count = map->surface_count;
   }

   /* BLEND_STATE header plus one entry per surface, in dwords. */
   const uint32_t num_dwords = GENX(BLEND_STATE_length) +
      GENX(BLEND_STATE_ENTRY_length) * surface_count;
   pipeline->blend_state =
      anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);

   bool has_writeable_rt = false;
   uint32_t *state_pos = pipeline->blend_state.map;
   state_pos += GENX(BLEND_STATE_length);
#if GEN_GEN >= 8
   /* Copy of entry 0, needed later to fill out 3DSTATE_PS_BLEND. */
   struct GENX(BLEND_STATE_ENTRY) bs0 = { 0 };
#endif
   for (unsigned i = 0; i < surface_count; i++) {
      struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];

      /* All color attachments are at the beginning of the binding table */
      if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
         break;

      /* We can have at most 8 attachments */
      assert(i < 8);

      if (info == NULL || binding->index >= info->attachmentCount) {
         /* Default everything to disabled */
         struct GENX(BLEND_STATE_ENTRY) entry = {
            .WriteDisableAlpha = true,
            .WriteDisableRed = true,
            .WriteDisableGreen = true,
            .WriteDisableBlue = true,
         };
         GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);
         state_pos += GENX(BLEND_STATE_ENTRY_length);
         continue;
      }

      assert(binding->binding == 0);
      const VkPipelineColorBlendAttachmentState *a =
         &info->pAttachments[binding->index];

      struct GENX(BLEND_STATE_ENTRY) entry = {
#if GEN_GEN < 8
         .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
         .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
#endif
         .LogicOpEnable = info->logicOpEnable,
         .LogicOpFunction = vk_to_gen_logic_op[info->logicOp],
         .ColorBufferBlendEnable = a->blendEnable,
         .ColorClampRange = COLORCLAMP_RTFORMAT,
         .PreBlendColorClampEnable = true,
         .PostBlendColorClampEnable = true,
         .SourceBlendFactor = vk_to_gen_blend[a->srcColorBlendFactor],
         .DestinationBlendFactor = vk_to_gen_blend[a->dstColorBlendFactor],
         .ColorBlendFunction = vk_to_gen_blend_op[a->colorBlendOp],
         .SourceAlphaBlendFactor = vk_to_gen_blend[a->srcAlphaBlendFactor],
         .DestinationAlphaBlendFactor = vk_to_gen_blend[a->dstAlphaBlendFactor],
         .AlphaBlendFunction = vk_to_gen_blend_op[a->alphaBlendOp],
         .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
         .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
         .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
         .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
      };

      /* IndependentAlphaBlend is global on gen8+ but per-entry on gen7. */
      if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||
          a->dstColorBlendFactor != a->dstAlphaBlendFactor ||
          a->colorBlendOp != a->alphaBlendOp) {
#if GEN_GEN >= 8
         blend_state.IndependentAlphaBlendEnable = true;
#else
         entry.IndependentAlphaBlendEnable = true;
#endif
      }

      /* The Dual Source Blending documentation says:
       *
       * "If SRC1 is included in a src/dst blend factor and
       * a DualSource RT Write message is not used, results
       * are UNDEFINED. (This reflects the same restriction in DX APIs,
       * where undefined results are produced if “o1” is not written
       * by a PS – there are no default values defined)."
       *
       * There is no way to gracefully fix this undefined situation
       * so we just disable the blending to prevent possible issues.
       */
      if (!wm_prog_data->dual_src_blend &&
          (is_dual_src_blend_factor(a->srcColorBlendFactor) ||
           is_dual_src_blend_factor(a->dstColorBlendFactor) ||
           is_dual_src_blend_factor(a->srcAlphaBlendFactor) ||
           is_dual_src_blend_factor(a->dstAlphaBlendFactor))) {
         vk_debug_report(&device->instance->debug_report_callbacks,
                         VK_DEBUG_REPORT_WARNING_BIT_EXT,
                         VK_DEBUG_REPORT_OBJECT_TYPE_DEVICE_EXT,
                         (uint64_t)(uintptr_t)device,
                         0, 0, "anv",
                         "Enabled dual-src blend factors without writing both targets "
                         "in the shader. Disabling blending to avoid GPU hangs.");
         entry.ColorBufferBlendEnable = false;
      }

      if (a->colorWriteMask != 0)
         has_writeable_rt = true;

      /* Our hardware applies the blend factor prior to the blend function
       * regardless of what function is used.  Technically, this means the
       * hardware can do MORE than GL or Vulkan specify.  However, it also
       * means that, for MIN and MAX, we have to stomp the blend factor to
       * ONE to make it a no-op.
       */
      if (a->colorBlendOp == VK_BLEND_OP_MIN ||
          a->colorBlendOp == VK_BLEND_OP_MAX) {
         entry.SourceBlendFactor = BLENDFACTOR_ONE;
         entry.DestinationBlendFactor = BLENDFACTOR_ONE;
      }
      if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
          a->alphaBlendOp == VK_BLEND_OP_MAX) {
         entry.SourceAlphaBlendFactor = BLENDFACTOR_ONE;
         entry.DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
      }
      GENX(BLEND_STATE_ENTRY_pack)(NULL, state_pos, &entry);
      state_pos += GENX(BLEND_STATE_ENTRY_length);
#if GEN_GEN >= 8
      if (i == 0)
         bs0 = entry;
#endif
   }

#if GEN_GEN >= 8
   /* 3DSTATE_PS_BLEND mirrors RT0's blend state. */
   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS_BLEND), blend) {
      blend.AlphaToCoverageEnable = blend_state.AlphaToCoverageEnable;
      blend.HasWriteableRT = has_writeable_rt;
      blend.ColorBufferBlendEnable = bs0.ColorBufferBlendEnable;
      blend.SourceAlphaBlendFactor = bs0.SourceAlphaBlendFactor;
      blend.DestinationAlphaBlendFactor = bs0.DestinationAlphaBlendFactor;
      blend.SourceBlendFactor = bs0.SourceBlendFactor;
      blend.DestinationBlendFactor = bs0.DestinationBlendFactor;
      blend.AlphaTestEnable = false;
      blend.IndependentAlphaBlendEnable =
         blend_state.IndependentAlphaBlendEnable;
   }
#else
   (void)has_writeable_rt;
#endif

   GENX(BLEND_STATE_pack)(NULL, pipeline->blend_state.map, &blend_state);

   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
      bsp.BlendStatePointer = pipeline->blend_state.offset;
#if GEN_GEN >= 8
      bsp.BlendStatePointerValid = true;
#endif
   }
}

/* Emit 3DSTATE_CLIP into the pipeline batch.  Viewport/layer limits come
 * from the last vertex-processing stage's VUE map per the Vulkan spec.
 */
static void
emit_3dstate_clip(struct anv_pipeline *pipeline,
                  const VkPipelineViewportStateCreateInfo *vp_info,
                  const VkPipelineRasterizationStateCreateInfo *rs_info)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
   (void) wm_prog_data; /* only used on gen8+; silence gen7 warning */
   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_CLIP), clip) {
      clip.ClipEnable = true;
      clip.StatisticsEnable = true;
      clip.EarlyCullEnable = true;
      clip.APIMode = APIMODE_D3D;
      clip.ViewportXYClipTestEnable = true;

#if GEN_GEN >= 8
      clip.VertexSubPixelPrecisionSelect = _8Bit;
#endif

      clip.ClipMode = CLIPMODE_NORMAL;

      clip.TriangleStripListProvokingVertexSelect = 0;
      clip.LineStripListProvokingVertexSelect = 0;
      clip.TriangleFanProvokingVertexSelect = 1;

      clip.MinimumPointWidth = 0.125;
      clip.MaximumPointWidth = 255.875;

      const struct brw_vue_prog_data *last =
         anv_pipeline_get_last_vue_prog_data(pipeline);

      /* From the Vulkan 1.0.45 spec:
       *
       *    "If the last active vertex processing stage shader entry point's
       *    interface does not include a variable decorated with
       *    ViewportIndex, then the first viewport is used."
       */
      if (vp_info && (last->vue_map.slots_valid & VARYING_BIT_VIEWPORT)) {
         clip.MaximumVPIndex = vp_info->viewportCount - 1;
      } else {
         clip.MaximumVPIndex = 0;
      }

      /* From the Vulkan 1.0.45 spec:
       *
       *    "If the last active vertex processing stage shader entry point's
       *    interface does not include a variable decorated with Layer, then
       *    the first layer is used."
       */
      clip.ForceZeroRTAIndexEnable =
         !(last->vue_map.slots_valid & VARYING_BIT_LAYER);

#if GEN_GEN == 7
      clip.FrontWinding = vk_to_gen_front_face[rs_info->frontFace];
      clip.CullMode = vk_to_gen_cullmode[rs_info->cullMode];
      clip.ViewportZClipTestEnable = pipeline->depth_clip_enable;
      clip.UserClipDistanceClipTestEnableBitmask = last->clip_distance_mask;
      clip.UserClipDistanceCullTestEnableBitmask = last->cull_distance_mask;
#else
      clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
         (wm_prog_data->barycentric_interp_modes &
          BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0 : 0;
#endif
   }
}

/* Emit transform-feedback state: 3DSTATE_STREAMOUT and, on gen8+, the
 * 3DSTATE_SO_DECL_LIST built from the NIR XFB info of the last geometry
 * stage (GS > TES > VS).  Also records pipeline->xfb_used.
 */
static void
emit_3dstate_streamout(struct anv_pipeline *pipeline,
                       const VkPipelineRasterizationStateCreateInfo *rs_info)
{
#if GEN_GEN >= 8
   const struct brw_vue_prog_data *prog_data =
      anv_pipeline_get_last_vue_prog_data(pipeline);
   const struct brw_vue_map *vue_map = &prog_data->vue_map;
#endif

   /* XFB comes from the last enabled pre-rasterization stage. */
   nir_xfb_info *xfb_info;
   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY))
      xfb_info = pipeline->shaders[MESA_SHADER_GEOMETRY]->xfb_info;
   else if (anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL))
      xfb_info = pipeline->shaders[MESA_SHADER_TESS_EVAL]->xfb_info;
   else
      xfb_info = pipeline->shaders[MESA_SHADER_VERTEX]->xfb_info;

   pipeline->xfb_used = xfb_info ? xfb_info->buffers_written : 0;

   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_STREAMOUT), so) {
      so.RenderingDisable = rs_info->rasterizerDiscardEnable;

#if GEN_GEN >= 8
      if (xfb_info) {
         so.SOFunctionEnable = true;
         so.SOStatisticsEnable = true;

         const VkPipelineRasterizationStateStreamCreateInfoEXT *stream_info =
            vk_find_struct_const(rs_info, PIPELINE_RASTERIZATION_STATE_STREAM_CREATE_INFO_EXT);
         so.RenderStreamSelect = stream_info ?
                                 stream_info->rasterizationStream : 0;

         so.Buffer0SurfacePitch = xfb_info->buffers[0].stride;
         so.Buffer1SurfacePitch = xfb_info->buffers[1].stride;
         so.Buffer2SurfacePitch = xfb_info->buffers[2].stride;
         so.Buffer3SurfacePitch = xfb_info->buffers[3].stride;

         int urb_entry_read_offset = 0;
         int urb_entry_read_length =
            (prog_data->vue_map.num_slots + 1) / 2 - urb_entry_read_offset;

         /* We always read the whole vertex.  This could be reduced at some
          * point by reading less and offsetting the register index in the
          * SO_DECLs.
          */
         so.Stream0VertexReadOffset = urb_entry_read_offset;
         so.Stream0VertexReadLength = urb_entry_read_length - 1;
         so.Stream1VertexReadOffset = urb_entry_read_offset;
         so.Stream1VertexReadLength = urb_entry_read_length - 1;
         so.Stream2VertexReadOffset = urb_entry_read_offset;
         so.Stream2VertexReadLength = urb_entry_read_length - 1;
         so.Stream3VertexReadOffset = urb_entry_read_offset;
         so.Stream3VertexReadLength = urb_entry_read_length - 1;
      }
#endif /* GEN_GEN >= 8 */
   }

#if GEN_GEN >= 8
   if (xfb_info) {
      struct GENX(SO_DECL) so_decl[MAX_XFB_STREAMS][128];
      int next_offset[MAX_XFB_BUFFERS] = {0, 0, 0, 0};
      int decls[MAX_XFB_STREAMS] = {0, 0, 0, 0};

      memset(so_decl, 0, sizeof(so_decl));

      for (unsigned i = 0; i < xfb_info->output_count; i++) {
         const nir_xfb_output_info *output = &xfb_info->outputs[i];
         unsigned buffer = output->buffer;
         unsigned stream = xfb_info->buffer_to_stream[buffer];

         /* Our hardware is unusual in that it requires us to program SO_DECLs
          * for fake "hole" components, rather than simply taking the offset
          * for each real varying.  Each hole can have size 1, 2, 3, or 4; we
          * program as many size = 4 holes as we can, then a final hole to
          * accommodate the final 1, 2, or 3 remaining.
          */
         int hole_dwords = (output->offset - next_offset[buffer]) / 4;
         while (hole_dwords > 0) {
            so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
               .HoleFlag = 1,
               .OutputBufferSlot = buffer,
               .ComponentMask = (1 << MIN2(hole_dwords, 4)) - 1,
            };
            hole_dwords -= 4;
         }

         int varying = output->location;
         uint8_t component_mask = output->component_mask;
         /* VARYING_SLOT_PSIZ contains three scalar fields packed together:
          * - VARYING_SLOT_LAYER    in VARYING_SLOT_PSIZ.y
          * - VARYING_SLOT_VIEWPORT in VARYING_SLOT_PSIZ.z
          * - VARYING_SLOT_PSIZ     in VARYING_SLOT_PSIZ.w
          */
         if (varying == VARYING_SLOT_LAYER) {
            varying = VARYING_SLOT_PSIZ;
            component_mask = 1 << 1; // SO_DECL_COMPMASK_Y
         } else if (varying == VARYING_SLOT_VIEWPORT) {
            varying = VARYING_SLOT_PSIZ;
            component_mask = 1 << 2; // SO_DECL_COMPMASK_Z
         } else if (varying == VARYING_SLOT_PSIZ) {
            component_mask = 1 << 3; // SO_DECL_COMPMASK_W
         }

         next_offset[buffer] = output->offset +
                               __builtin_popcount(component_mask) * 4;

         so_decl[stream][decls[stream]++] = (struct GENX(SO_DECL)) {
            .OutputBufferSlot = buffer,
            .RegisterIndex = vue_map->varying_to_slot[varying],
            .ComponentMask = component_mask,
         };
      }

      int max_decls = 0;
      for (unsigned s = 0; s < MAX_XFB_STREAMS; s++)
         max_decls = MAX2(max_decls, decls[s]);

      /* Per-stream bitmask of which buffers the stream writes. */
      uint8_t sbs[MAX_XFB_STREAMS] = { };
      for (unsigned b = 0; b < MAX_XFB_BUFFERS; b++) {
         if (xfb_info->buffers_written & (1 << b))
            sbs[xfb_info->buffer_to_stream[b]] |= 1 << b;
      }

      uint32_t *dw = anv_batch_emitn(&pipeline->batch, 3 + 2 * max_decls,
                                     GENX(3DSTATE_SO_DECL_LIST),
                                     .StreamtoBufferSelects0 = sbs[0],
                                     .StreamtoBufferSelects1 = sbs[1],
                                     .StreamtoBufferSelects2 = sbs[2],
                                     .StreamtoBufferSelects3 = sbs[3],
                                     .NumEntries0 = decls[0],
                                     .NumEntries1 = decls[1],
                                     .NumEntries2 = decls[2],
                                     .NumEntries3 = decls[3]);

      /* Each SO_DECL_ENTRY packs one decl per stream into 2 dwords. */
      for (int i = 0; i < max_decls; i++) {
         GENX(SO_DECL_ENTRY_pack)(NULL, dw + 3 + i * 2,
            &(struct GENX(SO_DECL_ENTRY)) {
               .Stream0Decl = so_decl[0][i],
               .Stream1Decl = so_decl[1][i],
               .Stream2Decl = so_decl[2][i],
               .Stream3Decl = so_decl[3][i],
            });
      }
   }
#endif /* GEN_GEN >= 8 */
}

/* Sampler prefetch count for 3DSTATE_XS, in units of 4 samplers. */
static uint32_t
get_sampler_count(const struct anv_shader_bin *bin)
{
   uint32_t count_by_4 = DIV_ROUND_UP(bin->bind_map.sampler_count, 4);

   /* We can potentially have way more than 32 samplers and that's ok.
    * However, the 3DSTATE_XS packets only have 3 bits to specify how
    * many to pre-fetch and all values above 4 are marked reserved.
    */
   return MIN2(count_by_4, 4);
}

/* Binding-table prefetch count for 3DSTATE_XS.
 * NOTE(review): returns surface_count / 32 rounded up — presumably the
 * prefetch granularity of the BindingTableEntryCount field; confirm against
 * the 3DSTATE_XS packet documentation.
 */
static uint32_t
get_binding_table_entry_count(const struct anv_shader_bin *bin)
{
   return DIV_ROUND_UP(bin->bind_map.surface_count, 32);
}

/* Allocate (or fetch a cached) scratch BO large enough for this stage's
 * total_scratch and return its address for ScratchSpaceBasePointer.
 */
static struct anv_address
get_scratch_address(struct anv_pipeline *pipeline,
                    gl_shader_stage stage,
                    const struct anv_shader_bin *bin)
{
   return (struct anv_address) {
      .bo = anv_scratch_pool_alloc(pipeline->device,
                                   &pipeline->device->scratch_pool,
                                   stage, bin->prog_data->total_scratch),
      .offset = 0,
   };
}

/* Encode per-thread scratch size for the PerThreadScratchSpace field.
 * ffs(total_scratch / 2048) yields a power-of-two exponent encoding
 * (0 when total_scratch < 2KB); presumably matches the hardware's
 * "1KB << n"-style field — confirm against the PRM.
 */
static uint32_t
get_scratch_space(const struct anv_shader_bin *bin)
{
   return ffs(bin->prog_data->total_scratch / 2048);
}

/* Emit 3DSTATE_VS for the (always-present) vertex shader stage. */
static void
emit_3dstate_vs(struct anv_pipeline *pipeline)
{
   const struct gen_device_info *devinfo = &pipeline->device->info;
   const struct brw_vs_prog_data *vs_prog_data = get_vs_prog_data(pipeline);
   const struct anv_shader_bin *vs_bin =
      pipeline->shaders[MESA_SHADER_VERTEX];

   assert(anv_pipeline_has_stage(pipeline, MESA_SHADER_VERTEX));

   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VS), vs) {
      vs.Enable = true;
      vs.StatisticsEnable = true;
      vs.KernelStartPointer = vs_bin->kernel.offset;
#if GEN_GEN >= 8
      vs.SIMD8DispatchEnable =
         vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
#endif

      assert(!vs_prog_data->base.base.use_alt_mode);
#if GEN_GEN < 11
      vs.SingleVertexDispatch = false;
#endif
      vs.VectorMaskEnable = false;
      /* WA_1606682166:
       * Incorrect TDL's SSP address shift in SARB for 16:6 & 18:8 modes.
       * Disable the Sampler state prefetch functionality in the SARB by
       * programming 0xB000[30] to '1'.
       */
      vs.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(vs_bin);
      /* Gen 11 workarounds table #2056 WABTPPrefetchDisable suggests to
       * disable prefetching of binding tables on A0 and B0 steppings.
       * TODO: Revisit this WA on newer steppings.
       */
      vs.BindingTableEntryCount = GEN_GEN == 11 ? 0 : get_binding_table_entry_count(vs_bin);
      vs.FloatingPointMode = IEEE754;
      vs.IllegalOpcodeExceptionEnable = false;
      vs.SoftwareExceptionEnable = false;
      vs.MaximumNumberofThreads = devinfo->max_vs_threads - 1;

      if (GEN_GEN == 9 && devinfo->gt == 4 &&
          anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
         /* On Sky Lake GT4, we have experienced some hangs related to the VS
          * cache and tessellation.  It is unknown exactly what is happening
          * but the Haswell docs for the "VS Reference Count Full Force Miss
          * Enable" field of the "Thread Mode" register refer to a HSW bug in
          * which the VUE handle reference count would overflow resulting in
          * internal reference counting bugs.  My (Jason's) best guess is that
          * this bug cropped back up on SKL GT4 when we suddenly had more
          * threads in play than any previous gen9 hardware.
          *
          * What we do know for sure is that setting this bit when
          * tessellation shaders are in use fixes a GPU hang in Batman: Arkham
          * City when playing with DXVK (https://bugs.freedesktop.org/107280).
          * Disabling the vertex cache with tessellation shaders should only
          * have a minor performance impact as the tessellation shaders are
          * likely generating and processing far more geometry than the vertex
          * stage.
          */
         vs.VertexCacheDisable = true;
      }

      vs.VertexURBEntryReadLength = vs_prog_data->base.urb_read_length;
      vs.VertexURBEntryReadOffset = 0;
      vs.DispatchGRFStartRegisterForURBData =
         vs_prog_data->base.base.dispatch_grf_start_reg;

#if GEN_GEN >= 8
      vs.UserClipDistanceClipTestEnableBitmask =
         vs_prog_data->base.clip_distance_mask;
      vs.UserClipDistanceCullTestEnableBitmask =
         vs_prog_data->base.cull_distance_mask;
#endif

      vs.PerThreadScratchSpace = get_scratch_space(vs_bin);
      vs.ScratchSpaceBasePointer =
         get_scratch_address(pipeline, MESA_SHADER_VERTEX, vs_bin);
   }
}

/* Emit 3DSTATE_HS, 3DSTATE_TE and 3DSTATE_DS.  When tessellation is not
 * enabled, all three packets are emitted zeroed (disabled).
 */
static void
emit_3dstate_hs_te_ds(struct anv_pipeline *pipeline,
                      const VkPipelineTessellationStateCreateInfo *tess_info)
{
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_TESS_EVAL)) {
      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_HS), hs);
      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_TE), te);
      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_DS), ds);
      return;
   }

   const struct gen_device_info *devinfo = &pipeline->device->info;
   const struct anv_shader_bin *tcs_bin =
      pipeline->shaders[MESA_SHADER_TESS_CTRL];
   const struct anv_shader_bin *tes_bin =
      pipeline->shaders[MESA_SHADER_TESS_EVAL];

   const struct brw_tcs_prog_data *tcs_prog_data = get_tcs_prog_data(pipeline);
   const struct brw_tes_prog_data *tes_prog_data = get_tes_prog_data(pipeline);

   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_HS), hs) {
      hs.Enable = true;
      hs.StatisticsEnable = true;
      hs.KernelStartPointer = tcs_bin->kernel.offset;
      /* WA_1606682166 */
      hs.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(tcs_bin);
      /* Gen 11 workarounds table #2056 WABTPPrefetchDisable */
      hs.BindingTableEntryCount = GEN_GEN == 11 ? 0 : get_binding_table_entry_count(tcs_bin);
      hs.MaximumNumberofThreads = devinfo->max_tcs_threads - 1;
      hs.IncludeVertexHandles = true;
      hs.InstanceCount = tcs_prog_data->instances - 1;

      hs.VertexURBEntryReadLength = 0;
      hs.VertexURBEntryReadOffset = 0;
      hs.DispatchGRFStartRegisterForURBData =
         tcs_prog_data->base.base.dispatch_grf_start_reg;

      hs.PerThreadScratchSpace = get_scratch_space(tcs_bin);
      hs.ScratchSpaceBasePointer =
         get_scratch_address(pipeline, MESA_SHADER_TESS_CTRL, tcs_bin);
   }

   const VkPipelineTessellationDomainOriginStateCreateInfo *domain_origin_state =
      tess_info ? vk_find_struct_const(tess_info, PIPELINE_TESSELLATION_DOMAIN_ORIGIN_STATE_CREATE_INFO) : NULL;

   VkTessellationDomainOrigin uv_origin =
      domain_origin_state ? domain_origin_state->domainOrigin :
                            VK_TESSELLATION_DOMAIN_ORIGIN_UPPER_LEFT;

   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_TE), te) {
      te.Partitioning = tes_prog_data->partitioning;

      if (uv_origin == VK_TESSELLATION_DOMAIN_ORIGIN_LOWER_LEFT) {
         te.OutputTopology = tes_prog_data->output_topology;
      } else {
         /* When the origin is upper-left, we have to flip the winding order */
         if (tes_prog_data->output_topology == OUTPUT_TRI_CCW) {
            te.OutputTopology = OUTPUT_TRI_CW;
         } else if (tes_prog_data->output_topology == OUTPUT_TRI_CW) {
            te.OutputTopology = OUTPUT_TRI_CCW;
         } else {
            te.OutputTopology = tes_prog_data->output_topology;
         }
      }

      te.TEDomain = tes_prog_data->domain;
      te.TEEnable = true;
      te.MaximumTessellationFactorOdd = 63.0;
      te.MaximumTessellationFactorNotOdd = 64.0;
   }

   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_DS), ds) {
      ds.Enable = true;
      ds.StatisticsEnable = true;
      ds.KernelStartPointer = tes_bin->kernel.offset;
      /* WA_1606682166 */
      ds.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(tes_bin);
      /* Gen 11 workarounds table #2056 WABTPPrefetchDisable */
      ds.BindingTableEntryCount = GEN_GEN == 11 ? 0 : get_binding_table_entry_count(tes_bin);
      ds.MaximumNumberofThreads = devinfo->max_tes_threads - 1;

      /* The W coordinate only exists for triangular domains. */
      ds.ComputeWCoordinateEnable =
         tes_prog_data->domain == BRW_TESS_DOMAIN_TRI;

      ds.PatchURBEntryReadLength = tes_prog_data->base.urb_read_length;
      ds.PatchURBEntryReadOffset = 0;
      ds.DispatchGRFStartRegisterForURBData =
         tes_prog_data->base.base.dispatch_grf_start_reg;

#if GEN_GEN >= 8
#if GEN_GEN < 11
      ds.DispatchMode =
         tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8 ?
            DISPATCH_MODE_SIMD8_SINGLE_PATCH :
            DISPATCH_MODE_SIMD4X2;
#else
      assert(tes_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8);
      ds.DispatchMode = DISPATCH_MODE_SIMD8_SINGLE_PATCH;
#endif

      ds.UserClipDistanceClipTestEnableBitmask =
         tes_prog_data->base.clip_distance_mask;
      ds.UserClipDistanceCullTestEnableBitmask =
         tes_prog_data->base.cull_distance_mask;
#endif

      ds.PerThreadScratchSpace = get_scratch_space(tes_bin);
      ds.ScratchSpaceBasePointer =
         get_scratch_address(pipeline, MESA_SHADER_TESS_EVAL, tes_bin);
   }
}

/* Emit 3DSTATE_GS; zeroed (disabled) when there is no geometry stage. */
static void
emit_3dstate_gs(struct anv_pipeline *pipeline)
{
   const struct gen_device_info *devinfo = &pipeline->device->info;
   const struct anv_shader_bin *gs_bin =
      pipeline->shaders[MESA_SHADER_GEOMETRY];

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_GEOMETRY)) {
      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_GS), gs);
      return;
   }

   const struct brw_gs_prog_data *gs_prog_data = get_gs_prog_data(pipeline);

   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_GS), gs) {
      gs.Enable = true;
      gs.StatisticsEnable = true;
      gs.KernelStartPointer = gs_bin->kernel.offset;
      gs.DispatchMode = gs_prog_data->base.dispatch_mode;

      gs.SingleProgramFlow = false;
      gs.VectorMaskEnable = false;
      /* WA_1606682166 */
      gs.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(gs_bin);
      /* Gen 11 workarounds table #2056 WABTPPrefetchDisable */
      gs.BindingTableEntryCount = GEN_GEN == 11 ? 0 : get_binding_table_entry_count(gs_bin);
      gs.IncludeVertexHandles = gs_prog_data->base.include_vue_handles;
      gs.IncludePrimitiveID = gs_prog_data->include_primitive_id;

      if (GEN_GEN == 8) {
         /* Broadwell is weird. It needs us to divide by 2. */
         gs.MaximumNumberofThreads = devinfo->max_gs_threads / 2 - 1;
      } else {
         gs.MaximumNumberofThreads = devinfo->max_gs_threads - 1;
      }

      gs.OutputVertexSize = gs_prog_data->output_vertex_size_hwords * 2 - 1;
      gs.OutputTopology = gs_prog_data->output_topology;
      gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
      gs.ControlDataFormat = gs_prog_data->control_data_format;
      gs.ControlDataHeaderSize = gs_prog_data->control_data_header_size_hwords;
      gs.InstanceControl = MAX2(gs_prog_data->invocations, 1) - 1;
      gs.ReorderMode = TRAILING;

#if GEN_GEN >= 8
      gs.ExpectedVertexCount = gs_prog_data->vertices_in;
      gs.StaticOutput = gs_prog_data->static_vertex_count >= 0;
      gs.StaticOutputVertexCount = gs_prog_data->static_vertex_count >= 0 ?
                                   gs_prog_data->static_vertex_count : 0;
#endif

      gs.VertexURBEntryReadOffset = 0;
      /* NOTE(review): VertexURBEntryReadLength is assigned the same value a
       * second time here (see above); harmless but redundant.
       */
      gs.VertexURBEntryReadLength = gs_prog_data->base.urb_read_length;
      gs.DispatchGRFStartRegisterForURBData =
         gs_prog_data->base.base.dispatch_grf_start_reg;

#if GEN_GEN >= 8
      gs.UserClipDistanceClipTestEnableBitmask =
         gs_prog_data->base.clip_distance_mask;
      gs.UserClipDistanceCullTestEnableBitmask =
         gs_prog_data->base.cull_distance_mask;
#endif

      gs.PerThreadScratchSpace = get_scratch_space(gs_bin);
      gs.ScratchSpaceBasePointer =
         get_scratch_address(pipeline, MESA_SHADER_GEOMETRY, gs_bin);
   }
}

/* True if any color attachment bound by the fragment shader has a non-zero
 * colorWriteMask, i.e. the pipeline can actually write a render target.
 */
static bool
has_color_buffer_write_enabled(const struct anv_pipeline *pipeline,
                               const VkPipelineColorBlendStateCreateInfo *blend)
{
   const struct anv_shader_bin *shader_bin =
      pipeline->shaders[MESA_SHADER_FRAGMENT];
   if (!shader_bin)
      return false;

   const struct anv_pipeline_bind_map *bind_map = &shader_bin->bind_map;
   for (int i = 0; i < bind_map->surface_count; i++) {
      struct anv_pipeline_binding *binding = &bind_map->surface_to_descriptor[i];

      if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
         continue;

      /* UINT32_MAX marks an unused attachment slot. NOTE(review): inferred
       * from usage here — confirm against the bind-map builder.
       */
      if (binding->index == UINT32_MAX)
         continue;

      if (blend && blend->pAttachments[binding->index].colorWriteMask != 0)
         return true;
   }

   return false;
}

/* Emit 3DSTATE_WM: early depth/stencil control, thread-dispatch enables and
 * (gen7) multisample rasterization/dispatch modes.
 */
static void
emit_3dstate_wm(struct anv_pipeline *pipeline, struct anv_subpass *subpass,
                const VkPipelineColorBlendStateCreateInfo *blend,
                const VkPipelineMultisampleStateCreateInfo *multisample)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   /* Only consumed by the gen7 path below. */
   MAYBE_UNUSED uint32_t samples =
      multisample ? multisample->rasterizationSamples : 1;

   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_WM), wm) {
      wm.StatisticsEnable = true;
      wm.LineEndCapAntialiasingRegionWidth = _05pixels;
      wm.LineAntialiasingRegionWidth = _10pixels;
      wm.PointRasterizationRule = RASTRULE_UPPER_RIGHT;

      if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
         if (wm_prog_data->early_fragment_tests) {
            wm.EarlyDepthStencilControl = EDSC_PREPS;
         } else if (wm_prog_data->has_side_effects) {
            wm.EarlyDepthStencilControl = EDSC_PSEXEC;
         } else {
            wm.EarlyDepthStencilControl = EDSC_NORMAL;
         }

#if GEN_GEN >= 8
         /* Gen8 hardware tries to compute ThreadDispatchEnable for us but
          * doesn't take into account KillPixels when no depth or stencil
          * writes are enabled.  In order for occlusion queries to work
          * correctly with no attachments, we need to force-enable PS thread
          * dispatch.
          *
          * The BDW docs are pretty clear that that this bit isn't validated
          * and probably shouldn't be used in production:
          *
          *    "This must always be set to Normal. This field should not be
          *    tested for functional validation."
          *
          * Unfortunately, however, the other mechanism we have for doing this
          * is 3DSTATE_PS_EXTRA::PixelShaderHasUAV which causes hangs on BDW.
          * Given two bad options, we choose the one which works.
          */
         if ((wm_prog_data->has_side_effects || wm_prog_data->uses_kill) &&
             !has_color_buffer_write_enabled(pipeline, blend))
            wm.ForceThreadDispatchEnable = ForceON;
#endif

         wm.BarycentricInterpolationMode =
            wm_prog_data->barycentric_interp_modes;

#if GEN_GEN < 8
         wm.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
         wm.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
         wm.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;
         wm.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;

         /* If the subpass has a depth or stencil self-dependency, then we
          * need to force the hardware to do the depth/stencil write *after*
          * fragment shader execution.  Otherwise, the writes may hit memory
          * before we get around to fetching from the input attachment and we
          * may get the depth or stencil value from the current draw rather
          * than the previous one.
          */
         wm.PixelShaderKillsPixel = subpass->has_ds_self_dep ||
                                    wm_prog_data->uses_kill;

         if (wm.PixelShaderComputedDepthMode != PSCDEPTH_OFF ||
             wm_prog_data->has_side_effects ||
             wm.PixelShaderKillsPixel ||
             has_color_buffer_write_enabled(pipeline, blend))
            wm.ThreadDispatchEnable = true;

         if (samples > 1) {
            wm.MultisampleRasterizationMode = MSRASTMODE_ON_PATTERN;
            if (wm_prog_data->persample_dispatch) {
               wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
            } else {
               wm.MultisampleDispatchMode = MSDISPMODE_PERPIXEL;
            }
         } else {
            wm.MultisampleRasterizationMode = MSRASTMODE_OFF_PIXEL;
            wm.MultisampleDispatchMode = MSDISPMODE_PERSAMPLE;
         }
#endif
      }
   }
}

/* Emit 3DSTATE_PS: kernel start pointers for the SIMD8/16/32 variants,
 * dispatch enables, and (gen7) dual-source blend enable.
 */
static void
emit_3dstate_ps(struct anv_pipeline *pipeline,
                const VkPipelineColorBlendStateCreateInfo *blend,
                const VkPipelineMultisampleStateCreateInfo *multisample)
{
   MAYBE_UNUSED const struct gen_device_info *devinfo =
&pipeline->device->info; 1698 const struct anv_shader_bin *fs_bin = 1699 pipeline->shaders[MESA_SHADER_FRAGMENT]; 1700 1701 if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) { 1702 anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) { 1703#if GEN_GEN == 7 1704 /* Even if no fragments are ever dispatched, gen7 hardware hangs if 1705 * we don't at least set the maximum number of threads. 1706 */ 1707 ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1; 1708#endif 1709 } 1710 return; 1711 } 1712 1713 const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline); 1714 1715#if GEN_GEN < 8 1716 /* The hardware wedges if you have this bit set but don't turn on any dual 1717 * source blend factors. 1718 */ 1719 bool dual_src_blend = false; 1720 if (wm_prog_data->dual_src_blend && blend) { 1721 for (uint32_t i = 0; i < blend->attachmentCount; i++) { 1722 const VkPipelineColorBlendAttachmentState *bstate = 1723 &blend->pAttachments[i]; 1724 1725 if (bstate->blendEnable && 1726 (is_dual_src_blend_factor(bstate->srcColorBlendFactor) || 1727 is_dual_src_blend_factor(bstate->dstColorBlendFactor) || 1728 is_dual_src_blend_factor(bstate->srcAlphaBlendFactor) || 1729 is_dual_src_blend_factor(bstate->dstAlphaBlendFactor))) { 1730 dual_src_blend = true; 1731 break; 1732 } 1733 } 1734 } 1735#endif 1736 1737 anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS), ps) { 1738 ps._8PixelDispatchEnable = wm_prog_data->dispatch_8; 1739 ps._16PixelDispatchEnable = wm_prog_data->dispatch_16; 1740 ps._32PixelDispatchEnable = wm_prog_data->dispatch_32; 1741 1742 /* From the Sky Lake PRM 3DSTATE_PS::32 Pixel Dispatch Enable: 1743 * 1744 * "When NUM_MULTISAMPLES = 16 or FORCE_SAMPLE_COUNT = 16, SIMD32 1745 * Dispatch must not be enabled for PER_PIXEL dispatch mode." 1746 * 1747 * Since 16x MSAA is first introduced on SKL, we don't need to apply 1748 * the workaround on any older hardware. 
1749 */ 1750 if (GEN_GEN >= 9 && !wm_prog_data->persample_dispatch && 1751 multisample && multisample->rasterizationSamples == 16) { 1752 assert(ps._8PixelDispatchEnable || ps._16PixelDispatchEnable); 1753 ps._32PixelDispatchEnable = false; 1754 } 1755 1756 ps.KernelStartPointer0 = fs_bin->kernel.offset + 1757 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 0); 1758 ps.KernelStartPointer1 = fs_bin->kernel.offset + 1759 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 1); 1760 ps.KernelStartPointer2 = fs_bin->kernel.offset + 1761 brw_wm_prog_data_prog_offset(wm_prog_data, ps, 2); 1762 1763 ps.SingleProgramFlow = false; 1764 ps.VectorMaskEnable = GEN_GEN >= 8; 1765 /* WA_1606682166 */ 1766 ps.SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(fs_bin); 1767 /* Gen 11 workarounds table #2056 WABTPPrefetchDisable */ 1768 ps.BindingTableEntryCount = GEN_GEN == 11 ? 0 : get_binding_table_entry_count(fs_bin); 1769 ps.PushConstantEnable = wm_prog_data->base.nr_params > 0 || 1770 wm_prog_data->base.ubo_ranges[0].length; 1771 ps.PositionXYOffsetSelect = wm_prog_data->uses_pos_offset ? 1772 POSOFFSET_SAMPLE: POSOFFSET_NONE; 1773#if GEN_GEN < 8 1774 ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0; 1775 ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask; 1776 ps.DualSourceBlendEnable = dual_src_blend; 1777#endif 1778 1779#if GEN_IS_HASWELL 1780 /* Haswell requires the sample mask to be set in this packet as well 1781 * as in 3DSTATE_SAMPLE_MASK; the values should match. 
       */
      ps.SampleMask = 0xff;
#endif

#if GEN_GEN >= 9
      ps.MaximumNumberofThreadsPerPSD = 64 - 1;
#elif GEN_GEN >= 8
      ps.MaximumNumberofThreadsPerPSD = 64 - 2;
#else
      ps.MaximumNumberofThreads = devinfo->max_wm_threads - 1;
#endif

      ps.DispatchGRFStartRegisterForConstantSetupData0 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 0);
      ps.DispatchGRFStartRegisterForConstantSetupData1 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 1);
      ps.DispatchGRFStartRegisterForConstantSetupData2 =
         brw_wm_prog_data_dispatch_grf_start_reg(wm_prog_data, ps, 2);

      ps.PerThreadScratchSpace = get_scratch_space(fs_bin);
      ps.ScratchSpaceBasePointer =
         get_scratch_address(pipeline, MESA_SHADER_FRAGMENT, fs_bin);
   }
}

#if GEN_GEN >= 8
/* Emit 3DSTATE_PS_EXTRA (gen8+): per-sample dispatch, computed depth/stencil
 * modes, kill-pixel, and input coverage mask state for the fragment shader.
 */
static void
emit_3dstate_ps_extra(struct anv_pipeline *pipeline,
                      struct anv_subpass *subpass,
                      const VkPipelineColorBlendStateCreateInfo *blend)
{
   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      /* No fragment stage: emit the packet with all-default (zero) fields. */
      anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS_EXTRA), ps);
      return;
   }

   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS_EXTRA), ps) {
      ps.PixelShaderValid = true;
      ps.AttributeEnable = wm_prog_data->num_varying_inputs > 0;
      ps.oMaskPresenttoRenderTarget = wm_prog_data->uses_omask;
      ps.PixelShaderIsPerSample = wm_prog_data->persample_dispatch;
      ps.PixelShaderComputedDepthMode = wm_prog_data->computed_depth_mode;
      ps.PixelShaderUsesSourceDepth = wm_prog_data->uses_src_depth;
      ps.PixelShaderUsesSourceW = wm_prog_data->uses_src_w;

      /* If the subpass has a depth or stencil self-dependency, then we need
       * to force the hardware to do the depth/stencil write *after* fragment
       * shader execution.  Otherwise, the writes may hit memory before we get
       * around to fetching from the input attachment and we may get the depth
       * or stencil value from the current draw rather than the previous one.
       */
      ps.PixelShaderKillsPixel = subpass->has_ds_self_dep ||
                                 wm_prog_data->uses_kill;

#if GEN_GEN >= 9
      ps.PixelShaderComputesStencil = wm_prog_data->computed_stencil;
      ps.PixelShaderPullsBary = wm_prog_data->pulls_bary;

      ps.InputCoverageMaskState = ICMS_NONE;
      if (wm_prog_data->uses_sample_mask) {
         if (wm_prog_data->post_depth_coverage)
            ps.InputCoverageMaskState = ICMS_DEPTH_COVERAGE;
         else
            ps.InputCoverageMaskState = ICMS_INNER_CONSERVATIVE;
      }
#else
      ps.PixelShaderUsesInputCoverageMask = wm_prog_data->uses_sample_mask;
#endif
   }
}

/* Emit 3DSTATE_VF_TOPOLOGY (gen8+) with the pipeline's primitive topology. */
static void
emit_3dstate_vf_topology(struct anv_pipeline *pipeline)
{
   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_TOPOLOGY), vft) {
      vft.PrimitiveTopologyType = pipeline->topology;
   }
}
#endif

/* Enable vertex-fetch statistics gathering for this pipeline. */
static void
emit_3dstate_vf_statistics(struct anv_pipeline *pipeline)
{
   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_VF_STATISTICS), vfs) {
      vfs.StatisticsEnable = true;
   }
}

/* Compute pipeline->kill_pixel, used as an input to the gen8/gen9 PMA fix
 * decision.  ms_info may be NULL (it is NULL-checked below).
 */
static void
compute_kill_pixel(struct anv_pipeline *pipeline,
                   const VkPipelineMultisampleStateCreateInfo *ms_info,
                   const struct anv_subpass *subpass)
{
   if (!anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
      pipeline->kill_pixel = false;
      return;
   }

   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);

   /* This computes the KillPixel portion of the computation for whether or
    * not we want to enable the PMA fix on gen8 or gen9.  It's given by this
    * chunk of the giant formula:
    *
    *    (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
    *     3DSTATE_PS_EXTRA::oMask Present to RenderTarget ||
    *     3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
    *     3DSTATE_PS_BLEND::AlphaTestEnable ||
    *     3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable)
    *
    * 3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable is always false and so is
    * 3DSTATE_PS_BLEND::AlphaTestEnable since Vulkan doesn't have a concept
    * of an alpha test.
    */
   pipeline->kill_pixel =
      subpass->has_ds_self_dep || wm_prog_data->uses_kill ||
      wm_prog_data->uses_omask ||
      (ms_info && ms_info->alphaToCoverageEnable);
}

/* Build a graphics pipeline: allocate and initialize the anv_pipeline, then
 * emit all of the 3D state packets into the pipeline's batch.  Returns the
 * batch status so any overflow during emission is reported to the caller.
 */
static VkResult
genX(graphics_pipeline_create)(
    VkDevice                                    _device,
    struct anv_pipeline_cache *                 cache,
    const VkGraphicsPipelineCreateInfo*         pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkPipeline*                                 pPipeline)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   ANV_FROM_HANDLE(anv_render_pass, pass, pCreateInfo->renderPass);
   struct anv_subpass *subpass = &pass->subpasses[pCreateInfo->subpass];
   struct anv_pipeline *pipeline;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_GRAPHICS_PIPELINE_CREATE_INFO);

   /* Use the default pipeline cache if none is specified */
   if (cache == NULL && device->instance->pipeline_cache_enabled)
      cache = &device->default_pipeline_cache;

   pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pipeline == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   result = anv_pipeline_init(pipeline, device, cache,
                              pCreateInfo, pAllocator);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pipeline);
      return result;
   }

   assert(pCreateInfo->pVertexInputState);
   emit_vertex_input(pipeline, pCreateInfo->pVertexInputState);

   assert(pCreateInfo->pRasterizationState);
   emit_rs_state(pipeline, pCreateInfo->pRasterizationState,
                 pCreateInfo->pMultisampleState, pass, subpass);
   emit_ms_state(pipeline, pCreateInfo->pMultisampleState);
   emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass);
   emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
                 pCreateInfo->pMultisampleState);
   compute_kill_pixel(pipeline, pCreateInfo->pMultisampleState, subpass);

   emit_urb_setup(pipeline);

   emit_3dstate_clip(pipeline, pCreateInfo->pViewportState,
                     pCreateInfo->pRasterizationState);
   emit_3dstate_streamout(pipeline, pCreateInfo->pRasterizationState);

   /* NOTE: the block below is disabled reference code carried over from the
    * i965 GL driver (gen7_vs_state.c); it documents an IVB workaround that
    * is not emitted here.
    */
#if 0
   /* From gen7_vs_state.c */

   /**
    * From Graphics BSpec: 3D-Media-GPGPU Engine > 3D Pipeline Stages >
    * Geometry > Geometry Shader > State:
    *
    *     "Note: Because of corruption in IVB:GT2, software needs to flush the
    *     whole fixed function pipeline when the GS enable changes value in
    *     the 3DSTATE_GS."
    *
    * The hardware architects have clarified that in this context "flush the
    * whole fixed function pipeline" means to emit a PIPE_CONTROL with the "CS
    * Stall" bit set.
    */
   if (!device->info.is_haswell && !device->info.is_baytrail)
      gen7_emit_vs_workaround_flush(brw);
#endif

   emit_3dstate_vs(pipeline);
   emit_3dstate_hs_te_ds(pipeline, pCreateInfo->pTessellationState);
   emit_3dstate_gs(pipeline);
   emit_3dstate_sbe(pipeline);
   emit_3dstate_wm(pipeline, subpass, pCreateInfo->pColorBlendState,
                   pCreateInfo->pMultisampleState);
   emit_3dstate_ps(pipeline, pCreateInfo->pColorBlendState,
                   pCreateInfo->pMultisampleState);
#if GEN_GEN >= 8
   emit_3dstate_ps_extra(pipeline, subpass, pCreateInfo->pColorBlendState);
   emit_3dstate_vf_topology(pipeline);
#endif
   emit_3dstate_vf_statistics(pipeline);

   *pPipeline = anv_pipeline_to_handle(pipeline);

   /* Report any batch overflow that occurred during emission. */
   return pipeline->batch.status;
}

/* Build a compute pipeline: compile (or look up) the CS kernel, size the
 * thread-group execution mask, and emit MEDIA_VFE_STATE plus the
 * INTERFACE_DESCRIPTOR_DATA template into the pipeline.
 */
static VkResult
compute_pipeline_create(
    VkDevice                                    _device,
    struct anv_pipeline_cache *                 cache,
    const VkComputePipelineCreateInfo*          pCreateInfo,
    const VkAllocationCallbacks*                pAllocator,
    VkPipeline*                                 pPipeline)
{
   ANV_FROM_HANDLE(anv_device, device, _device);
   const struct anv_physical_device *physical_device =
      &device->instance->physicalDevice;
   const struct gen_device_info *devinfo = &physical_device->info;
   struct anv_pipeline *pipeline;
   VkResult result;

   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO);

   /* Use the default pipeline cache if none is specified */
   if (cache == NULL && device->instance->pipeline_cache_enabled)
      cache = &device->default_pipeline_cache;

   pipeline = vk_alloc2(&device->alloc, pAllocator, sizeof(*pipeline), 8,
                        VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
   if (pipeline == NULL)
      return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);

   pipeline->device = device;

   pipeline->blend_state.map = NULL;

   result = anv_reloc_list_init(&pipeline->batch_relocs,
                                pAllocator ? pAllocator : &device->alloc);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pipeline);
      return result;
   }
   pipeline->batch.next = pipeline->batch.start = pipeline->batch_data;
   pipeline->batch.end = pipeline->batch.start + sizeof(pipeline->batch_data);
   pipeline->batch.relocs = &pipeline->batch_relocs;
   pipeline->batch.status = VK_SUCCESS;

   /* When we free the pipeline, we detect stages based on the NULL status
    * of various prog_data pointers.  Make them NULL by default.
    */
   memset(pipeline->shaders, 0, sizeof(pipeline->shaders));

   pipeline->needs_data_cache = false;

   assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);
   pipeline->active_stages |= VK_SHADER_STAGE_COMPUTE_BIT;
   ANV_FROM_HANDLE(anv_shader_module, module,  pCreateInfo->stage.module);
   result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,
                                    pCreateInfo->stage.pName,
                                    pCreateInfo->stage.pSpecializationInfo);
   if (result != VK_SUCCESS) {
      vk_free2(&device->alloc, pAllocator, pipeline);
      return result;
   }

   const struct brw_cs_prog_data *cs_prog_data = get_cs_prog_data(pipeline);

   anv_pipeline_setup_l3_config(pipeline, cs_prog_data->base.total_shared > 0);

   /* cs_right_mask is the execution mask for the last SIMD group of a
    * thread group: all lanes when the group size is a multiple of the SIMD
    * width, otherwise only the `remainder` low lanes.
    */
   uint32_t group_size = cs_prog_data->local_size[0] *
      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
   uint32_t remainder = group_size & (cs_prog_data->simd_size - 1);

   if (remainder > 0)
      pipeline->cs_right_mask = ~0u >> (32 - remainder);
   else
      pipeline->cs_right_mask = ~0u >> (32 - cs_prog_data->simd_size);

   const uint32_t vfe_curbe_allocation =
      ALIGN(cs_prog_data->push.per_thread.regs * cs_prog_data->threads +
            cs_prog_data->push.cross_thread.regs, 2);

   const uint32_t subslices = MAX2(physical_device->subslice_total, 1);

   const struct anv_shader_bin *cs_bin =
      pipeline->shaders[MESA_SHADER_COMPUTE];

   anv_batch_emit(&pipeline->batch, GENX(MEDIA_VFE_STATE), vfe) {
#if GEN_GEN > 7
      vfe.StackSize = 0;
#else
      vfe.GPGPUMode = true;
#endif
      vfe.MaximumNumberofThreads =
         devinfo->max_cs_threads * subslices - 1;
      vfe.NumberofURBEntries = GEN_GEN <= 7 ? 0 : 2;
#if GEN_GEN < 11
      vfe.ResetGatewayTimer = true;
#endif
#if GEN_GEN <= 8
      vfe.BypassGatewayControl = true;
#endif
      vfe.URBEntryAllocationSize = GEN_GEN <= 7 ? 0 : 2;
      vfe.CURBEAllocationSize = vfe_curbe_allocation;

      if (cs_bin->prog_data->total_scratch) {
         if (GEN_GEN >= 8) {
            /* Broadwell's Per Thread Scratch Space is in the range [0, 11]
             * where 0 = 1k, 1 = 2k, 2 = 4k, ..., 11 = 2M.
             */
            vfe.PerThreadScratchSpace =
               ffs(cs_bin->prog_data->total_scratch) - 11;
         } else if (GEN_IS_HASWELL) {
            /* Haswell's Per Thread Scratch Space is in the range [0, 10]
             * where 0 = 2k, 1 = 4k, 2 = 8k, ..., 10 = 2M.
             */
            vfe.PerThreadScratchSpace =
               ffs(cs_bin->prog_data->total_scratch) - 12;
         } else {
            /* IVB and BYT use the range [0, 11] to mean [1kB, 12kB]
             * where 0 = 1kB, 1 = 2kB, 2 = 3kB, ..., 11 = 12kB.
             */
            vfe.PerThreadScratchSpace =
               cs_bin->prog_data->total_scratch / 1024 - 1;
         }
         vfe.ScratchSpaceBasePointer =
            get_scratch_address(pipeline, MESA_SHADER_COMPUTE, cs_bin);
      }
   }

   /* Pre-pack the interface descriptor; it is copied into dynamic state at
    * dispatch time rather than emitted into this batch.
    */
   struct GENX(INTERFACE_DESCRIPTOR_DATA) desc = {
      .KernelStartPointer = cs_bin->kernel.offset,
      /* WA_1606682166 */
      .SamplerCount = GEN_GEN == 11 ? 0 : get_sampler_count(cs_bin),
      /* Gen 11 workarounds table #2056 WABTPPrefetchDisable
       *
       * We add 1 because the CS indirect parameters buffer isn't accounted
       * for in bind_map.surface_count.
       */
      .BindingTableEntryCount = GEN_GEN == 11 ?
         0 : 1 + MIN2(cs_bin->bind_map.surface_count, 30),
      .BarrierEnable = cs_prog_data->uses_barrier,
      .SharedLocalMemorySize =
         encode_slm_size(GEN_GEN, cs_prog_data->base.total_shared),

#if !GEN_IS_HASWELL
      .ConstantURBEntryReadOffset = 0,
#endif
      .ConstantURBEntryReadLength = cs_prog_data->push.per_thread.regs,
#if GEN_GEN >= 8 || GEN_IS_HASWELL
      .CrossThreadConstantDataReadLength =
         cs_prog_data->push.cross_thread.regs,
#endif

      .NumberofThreadsinGPGPUThreadGroup = cs_prog_data->threads,
   };
   GENX(INTERFACE_DESCRIPTOR_DATA_pack)(NULL,
                                        pipeline->interface_descriptor_data,
                                        &desc);

   *pPipeline = anv_pipeline_to_handle(pipeline);

   /* Report any batch overflow that occurred during emission. */
   return pipeline->batch.status;
}

/* vkCreateGraphicsPipelines: create pipelines one by one, stopping at the
 * first failure and nulling out the remaining handles.
 */
VkResult genX(CreateGraphicsPipelines)(
    VkDevice                                    _device,
    VkPipelineCache                             pipelineCache,
    uint32_t                                    count,
    const VkGraphicsPipelineCreateInfo*         pCreateInfos,
    const VkAllocationCallbacks*                pAllocator,
    VkPipeline*                                 pPipelines)
{
   ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);

   VkResult result = VK_SUCCESS;

   unsigned i;
   for (i = 0; i < count; i++) {
      result = genX(graphics_pipeline_create)(_device,
                                              pipeline_cache,
                                              &pCreateInfos[i],
                                              pAllocator, &pPipelines[i]);

      /* Bail out on the first error as it is not obvious what error should be
       * reported upon 2 different failures.
       */
      if (result != VK_SUCCESS)
         break;
   }

   /* Mark every pipeline we never attempted as VK_NULL_HANDLE so the caller
    * sees well-defined contents for the whole output array.
    */
   for (; i < count; i++)
      pPipelines[i] = VK_NULL_HANDLE;

   return result;
}

/* vkCreateComputePipelines: create pipelines one by one, stopping at the
 * first failure and nulling out the remaining handles.
 */
VkResult genX(CreateComputePipelines)(
    VkDevice                                    _device,
    VkPipelineCache                             pipelineCache,
    uint32_t                                    count,
    const VkComputePipelineCreateInfo*          pCreateInfos,
    const VkAllocationCallbacks*                pAllocator,
    VkPipeline*                                 pPipelines)
{
   ANV_FROM_HANDLE(anv_pipeline_cache, pipeline_cache, pipelineCache);

   VkResult result = VK_SUCCESS;

   unsigned i;
   for (i = 0; i < count; i++) {
      result = compute_pipeline_create(_device, pipeline_cache,
                                       &pCreateInfos[i],
                                       pAllocator, &pPipelines[i]);

      /* Bail out on the first error as it is not obvious what error should be
       * reported upon 2 different failures. */
      if (result != VK_SUCCESS)
         break;
   }

   /* Mark every pipeline we never attempted as VK_NULL_HANDLE so the caller
    * sees well-defined contents for the whole output array.
    */
   for (; i < count; i++)
      pPipelines[i] = VK_NULL_HANDLE;

   return result;
}