brw_compiler.h revision 7ec681f3
/*
 * Copyright © 2010 - 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#ifndef BRW_COMPILER_H
#define BRW_COMPILER_H

#include <stdio.h>
#include "dev/intel_device_info.h"
#include "main/macros.h"
#include "main/mtypes.h"
#include "util/ralloc.h"

#ifdef __cplusplus
extern "C" {
#endif

struct ra_regs;
struct nir_shader;
struct brw_program;

typedef struct nir_shader nir_shader;

struct brw_compiler {
   const struct intel_device_info *devinfo;

   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used.
       */
      struct ra_class **classes;
   } vec4_reg_set;

   struct {
      struct ra_regs *regs;

      /**
       * Array of the ra classes for the unaligned contiguous register
       * block sizes used, indexed by register size.
       */
      struct ra_class *classes[16];

      /**
       * ra class for the aligned barycentrics we use for PLN, which doesn't
       * appear in *classes.
       */
      struct ra_class *aligned_bary_class;
   } fs_reg_sets[3];

   void (*shader_debug_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);
   void (*shader_perf_log)(void *, unsigned *id, const char *str, ...) PRINTFLIKE(3, 4);

   bool scalar_stage[MESA_ALL_SHADER_STAGES];
   bool use_tcs_8_patch;
   struct gl_shader_compiler_options glsl_compiler_options[MESA_ALL_SHADER_STAGES];

   /**
    * Apply workarounds for SIN and COS output range problems.
    * This can negatively impact performance.
    */
   bool precise_trig;

   /**
    * Is 3DSTATE_CONSTANT_*'s Constant Buffer 0 relative to Dynamic State
    * Base Address?  (If not, it's a normal GPU address.)
    */
   bool constant_buffer_0_is_relative;

   /**
    * Whether or not the driver supports pull constants.  If not, the compiler
    * will attempt to push everything.
    */
   bool supports_pull_constants;

   /**
    * Whether or not the driver supports NIR shader constants.  This controls
    * whether nir_opt_large_constants will be run.
    */
   bool supports_shader_constants;

   /**
    * Whether or not the driver wants uniform params to be compacted by the
    * back-end compiler.
    */
   bool compact_params;

   /**
    * Whether or not the driver wants variable group size to be lowered by the
    * back-end compiler.
    */
   bool lower_variable_group_size;

   /**
    * Whether indirect UBO loads should use the sampler or go through the
    * data/constant cache.  For the sampler, UBO surface states have to be set
    * up with VK_FORMAT_R32G32B32A32_FLOAT whereas if it's going through the
    * constant or data cache, UBOs must use VK_FORMAT_RAW.
    */
   bool indirect_ubos_use_sampler;
};

#define brw_shader_debug_log(compiler, data, fmt, ... ) do {    \
   static unsigned id = 0;                                      \
   compiler->shader_debug_log(data, &id, fmt, ##__VA_ARGS__);   \
} while (0)

#define brw_shader_perf_log(compiler, data, fmt, ... ) do {     \
   static unsigned id = 0;                                      \
   compiler->shader_perf_log(data, &id, fmt, ##__VA_ARGS__);    \
} while (0)
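/*
 * Usage sketch for the logging macros above (illustrative only; "driver_ctx"
 * and "spill_count" are hypothetical caller-side names):
 *
 *    brw_shader_perf_log(compiler, driver_ctx,
 *                        "VS spilled %u registers\n", spill_count);
 *
 * Each expansion site gets its own static message id, so the callback
 * receives a stable pointer it can use to identify repeated messages from
 * the same location.
 */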
/**
 * We use a constant subgroup size of 32.  It really only needs to be a
 * maximum and, since we do SIMD32 for compute shaders in some cases, it
 * needs to be at least 32.  SIMD8 and SIMD16 shaders will still claim a
 * subgroup size of 32 but will act as if 16 or 24 of those channels are
 * disabled.
 */
#define BRW_SUBGROUP_SIZE 32

static inline bool
brw_shader_stage_is_bindless(gl_shader_stage stage)
{
   return stage >= MESA_SHADER_RAYGEN &&
          stage <= MESA_SHADER_CALLABLE;
}

/**
 * Program key structures.
 *
 * When drawing, we look for the currently bound shaders in the program
 * cache.  This is essentially a hash table lookup, and these are the keys.
 *
 * Sometimes OpenGL features specified as state need to be simulated via
 * shader code, due to a mismatch between the API and the hardware.  This
 * is often referred to as "non-orthogonal state" or "NOS".  We store NOS
 * in the program key so it's considered when searching for a program.  If
 * we haven't seen a particular combination before, we have to recompile a
 * new specialized version.
 *
 * Shader compilation should not look up state in gl_context directly, but
 * instead use the copy in the program key.  This guarantees recompiles will
 * happen correctly.
 *
 * @{
 */

enum PACKED gfx6_gather_sampler_wa {
   WA_SIGN = 1,   /* whether we need to sign extend */
   WA_8BIT = 2,   /* if we have an 8-bit format needing wa */
   WA_16BIT = 4,  /* if we have a 16-bit format needing wa */
};

/**
 * Sampler information needed by VS, WM, and GS program cache keys.
 */
struct brw_sampler_prog_key_data {
   /**
    * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles.
    */
   uint16_t swizzles[MAX_SAMPLERS];

   uint32_t gl_clamp_mask[3];

   /**
    * For RG32F, gather4's channel select is broken.
    */
   uint32_t gather_channel_quirk_mask;

   /**
    * Whether this sampler uses the compressed multisample surface layout.
    */
   uint32_t compressed_multisample_layout_mask;

   /**
    * Whether this sampler is using 16x multisampling.  If so fetching from
    * this sampler will be handled with a different instruction, ld2dms_w
    * instead of ld2dms.
    */
   uint32_t msaa_16;

   /**
    * For Sandybridge, which shader w/a we need for gather quirks.
    */
   enum gfx6_gather_sampler_wa gfx6_gather_wa[MAX_SAMPLERS];

   /**
    * Texture units that have a YUV image bound.
    */
   uint32_t y_u_v_image_mask;
   uint32_t y_uv_image_mask;
   uint32_t yx_xuxv_image_mask;
   uint32_t xy_uxvx_image_mask;
   uint32_t ayuv_image_mask;
   uint32_t xyuv_image_mask;
   uint32_t bt709_mask;
   uint32_t bt2020_mask;

   /* Scale factor for each texture. */
   float scale_factors[32];
};
/** An enum representing what kind of input gl_SubgroupSize is. */
enum PACKED brw_subgroup_size_type
{
   BRW_SUBGROUP_SIZE_API_CONSTANT,  /**< Default Vulkan behavior */
   BRW_SUBGROUP_SIZE_UNIFORM,       /**< OpenGL behavior */
   BRW_SUBGROUP_SIZE_VARYING,       /**< VK_EXT_subgroup_size_control */

   /* These enums are specifically chosen so that the value of the enum is
    * also the subgroup size.  If any new values are added, they must respect
    * this invariant.
    */
   BRW_SUBGROUP_SIZE_REQUIRE_8 = 8,    /**< VK_EXT_subgroup_size_control */
   BRW_SUBGROUP_SIZE_REQUIRE_16 = 16,  /**< VK_EXT_subgroup_size_control */
   BRW_SUBGROUP_SIZE_REQUIRE_32 = 32,  /**< VK_EXT_subgroup_size_control */
};

struct brw_base_prog_key {
   unsigned program_string_id;

   enum brw_subgroup_size_type subgroup_size_type;
   bool robust_buffer_access;
   struct brw_sampler_prog_key_data tex;
};

/**
 * The VF can't natively handle certain types of attributes, such as GL_FIXED
 * or most 10_10_10_2 types.  These flags enable various VS workarounds to
 * "fix" attributes at the beginning of shaders.
 */
#define BRW_ATTRIB_WA_COMPONENT_MASK    7  /* mask for GL_FIXED scale channel count */
#define BRW_ATTRIB_WA_NORMALIZE         8  /* normalize in shader */
#define BRW_ATTRIB_WA_BGRA              16 /* swap r/b channels in shader */
#define BRW_ATTRIB_WA_SIGN              32 /* interpret as signed in shader */
#define BRW_ATTRIB_WA_SCALE             64 /* interpret as scaled in shader */
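/*
 * Worked example (hypothetical attribute): a three-component GL_FIXED
 * attribute that also needs normalization would use
 *
 *    wa_flags = (3 & BRW_ATTRIB_WA_COMPONENT_MASK) | BRW_ATTRIB_WA_NORMALIZE;
 *
 * i.e. 3 | 8 == 0xb.  The low three bits carry the GL_FIXED component count;
 * the remaining bits are independent workaround toggles.
 */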
/**
 * OpenGL attribute slots fall in [0, VERT_ATTRIB_MAX - 1] with the range
 * [VERT_ATTRIB_GENERIC0, VERT_ATTRIB_MAX - 1] reserved for up to 16 user
 * input vertex attributes.  In Vulkan, we expose up to 28 user vertex input
 * attributes that are mapped to slots also starting at VERT_ATTRIB_GENERIC0.
 */
#define MAX_GL_VERT_ATTRIB     VERT_ATTRIB_MAX
#define MAX_VK_VERT_ATTRIB     (VERT_ATTRIB_GENERIC0 + 28)

/**
 * Max number of binding table entries used for stream output.
 *
 * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the
 * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64.
 *
 * On Gfx6, the size of transform feedback data is limited not by the number
 * of components but by the number of binding table entries we set aside.  We
 * use one binding table entry for a float, one entry for a vector, and one
 * entry per matrix column.  Since the only way we can communicate our
 * transform feedback capabilities to the client is via
 * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the
 * worst case, in which all the varyings are floats, so we use up one binding
 * table entry per component.  Therefore we need to set aside at least 64
 * binding table entries for use by transform feedback.
 *
 * Note: since we don't currently pack varyings, it is currently impossible
 * for the client to actually use up all of these binding table entries--if
 * all of their varyings were floats, they would run out of varying slots and
 * fail to link.  But that's a bug, so it seems prudent to go ahead and
 * allocate the number of binding table entries we will need once the bug is
 * fixed.
 */
#define BRW_MAX_SOL_BINDINGS 64

/** The program key for Vertex Shaders. */
struct brw_vs_prog_key {
   struct brw_base_prog_key base;

   /**
    * Per-attribute workaround flags
    *
    * For each attribute, a combination of BRW_ATTRIB_WA_*.
    *
    * For OpenGL, where we expose a maximum of 16 user input attributes,
    * we only need up to VERT_ATTRIB_MAX slots.  In Vulkan, however, the
    * slots preceding VERT_ATTRIB_GENERIC0 are unused and we can expose up
    * to 28 user input vertex attributes that are mapped to slots starting
    * at VERT_ATTRIB_GENERIC0, so this array needs to be large enough to
    * hold that many slots.
    */
   uint8_t gl_attrib_wa_flags[MAX2(MAX_GL_VERT_ATTRIB, MAX_VK_VERT_ATTRIB)];

   bool copy_edgeflag:1;

   bool clamp_vertex_color:1;

   /**
    * How many user clipping planes are being uploaded to the vertex shader as
    * push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;

   /**
    * For pre-Gfx6 hardware, a bitfield indicating which texture coordinates
    * are going to be replaced with point coordinates (as a consequence of a
    * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)).  Because
    * our SF thread requires exact matching between VS outputs and FS inputs,
    * these texture coordinates will need to be unconditionally included in
    * the VUE, even if they aren't written by the vertex shader.
    */
   uint8_t point_coord_replace;
};

/** The program key for Tessellation Control Shaders. */
struct brw_tcs_prog_key
{
   struct brw_base_prog_key base;

   GLenum tes_primitive_mode;

   unsigned input_vertices;

   /** A bitfield of per-patch outputs written. */
   uint32_t patch_outputs_written;

   /** A bitfield of per-vertex outputs written. */
   uint64_t outputs_written;

   bool quads_workaround;
};

/** The program key for Tessellation Evaluation Shaders. */
struct brw_tes_prog_key
{
   struct brw_base_prog_key base;

   /** A bitfield of per-patch inputs read. */
   uint32_t patch_inputs_read;

   /** A bitfield of per-vertex inputs read. */
   uint64_t inputs_read;

   /**
    * How many user clipping planes are being uploaded to the tessellation
    * evaluation shader as push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;
};

/** The program key for Geometry Shaders. */
struct brw_gs_prog_key
{
   struct brw_base_prog_key base;

   /**
    * How many user clipping planes are being uploaded to the geometry shader
    * as push constants.
    *
    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
    * clip distances.
    */
   unsigned nr_userclip_plane_consts:4;
};
enum brw_sf_primitive {
   BRW_SF_PRIM_POINTS = 0,
   BRW_SF_PRIM_LINES = 1,
   BRW_SF_PRIM_TRIANGLES = 2,
   BRW_SF_PRIM_UNFILLED_TRIS = 3,
};

struct brw_sf_prog_key {
   uint64_t attrs;
   bool contains_flat_varying;
   unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */
   uint8_t point_sprite_coord_replace;
   enum brw_sf_primitive primitive:2;
   bool do_twoside_color:1;
   bool frontface_ccw:1;
   bool do_point_sprite:1;
   bool do_point_coord:1;
   bool sprite_origin_lower_left:1;
   bool userclip_active:1;
};

enum brw_clip_mode {
   BRW_CLIP_MODE_NORMAL             = 0,
   BRW_CLIP_MODE_CLIP_ALL           = 1,
   BRW_CLIP_MODE_CLIP_NON_REJECTED  = 2,
   BRW_CLIP_MODE_REJECT_ALL         = 3,
   BRW_CLIP_MODE_ACCEPT_ALL         = 4,
   BRW_CLIP_MODE_KERNEL_CLIP        = 5,
};

enum brw_clip_fill_mode {
   BRW_CLIP_FILL_MODE_LINE = 0,
   BRW_CLIP_FILL_MODE_POINT = 1,
   BRW_CLIP_FILL_MODE_FILL = 2,
   BRW_CLIP_FILL_MODE_CULL = 3,
};

/* Note that if unfilled primitives are being emitted, we have to fix
 * up polygon offset and flatshading at this point:
 */
struct brw_clip_prog_key {
   uint64_t attrs;
   bool contains_flat_varying;
   bool contains_noperspective_varying;
   unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */
   unsigned primitive:4;
   unsigned nr_userclip:4;
   bool pv_first:1;
   bool do_unfilled:1;
   enum brw_clip_fill_mode fill_cw:2;  /* includes cull information */
   enum brw_clip_fill_mode fill_ccw:2; /* includes cull information */
   bool offset_cw:1;
   bool offset_ccw:1;
   bool copy_bfc_cw:1;
   bool copy_bfc_ccw:1;
   enum brw_clip_mode clip_mode:3;

   float offset_factor;
   float offset_units;
   float offset_clamp;
};

/* A big lookup table is used to figure out which and how many
 * additional regs will be inserted before the main payload in the WM
 * program execution.  These mainly relate to depth and stencil
 * processing and the early-depth-test optimization.
 */
enum brw_wm_iz_bits {
   BRW_WM_IZ_PS_KILL_ALPHATEST_BIT     = 0x1,
   BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT     = 0x2,
   BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT    = 0x4,
   BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT     = 0x8,
   BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT  = 0x10,
   BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT   = 0x20,
   BRW_WM_IZ_BIT_MAX                   = 0x40
};
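/*
 * Illustrative combination (not taken from real state): a fragment shader
 * key's iz_lookup for depth test plus depth write would be
 *
 *    BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT | BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT
 *
 * i.e. 0x8 | 0x4 == 0xc, one index into the lookup table described above.
 */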
enum brw_wm_aa_enable {
   BRW_WM_AA_NEVER,
   BRW_WM_AA_SOMETIMES,
   BRW_WM_AA_ALWAYS
};

/** The program key for Fragment/Pixel Shaders. */
struct brw_wm_prog_key {
   struct brw_base_prog_key base;

   /* Some collection of BRW_WM_IZ_* */
   uint8_t iz_lookup;
   bool stats_wm:1;
   bool flat_shade:1;
   unsigned nr_color_regions:5;
   bool alpha_test_replicate_alpha:1;
   bool alpha_to_coverage:1;
   bool clamp_fragment_color:1;
   bool persample_interp:1;
   bool multisample_fbo:1;
   bool frag_coord_adds_sample_pos:1;
   enum brw_wm_aa_enable line_aa:2;
   bool high_quality_derivatives:1;
   bool force_dual_color_blend:1;
   bool coherent_fb_fetch:1;
   bool ignore_sample_mask_out:1;
   bool coarse_pixel:1;

   uint8_t color_outputs_valid;
   uint64_t input_slots_valid;
   GLenum alpha_test_func;  /**< For Gfx4/5 MRT alpha test */
   float alpha_test_ref;
};

struct brw_cs_prog_key {
   struct brw_base_prog_key base;
};

struct brw_bs_prog_key {
   struct brw_base_prog_key base;
};

struct brw_ff_gs_prog_key {
   uint64_t attrs;

   /**
    * Hardware primitive type being drawn, e.g. _3DPRIM_TRILIST.
    */
   unsigned primitive:8;

   unsigned pv_first:1;
   unsigned need_gs_prog:1;

   /**
    * Number of varyings that are output to transform feedback.
    */
   unsigned num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */

   /**
    * Map from the index of a transform feedback binding table entry to the
    * gl_varying_slot that should be streamed out through that binding table
    * entry.
    */
   unsigned char transform_feedback_bindings[BRW_MAX_SOL_BINDINGS];

   /**
    * Map from the index of a transform feedback binding table entry to the
    * swizzles that should be used when streaming out data through that
    * binding table entry.
    */
   unsigned char transform_feedback_swizzles[BRW_MAX_SOL_BINDINGS];
};

/* brw_any_prog_key is any of the keys that map to an API stage */
union brw_any_prog_key {
   struct brw_base_prog_key base;
   struct brw_vs_prog_key vs;
   struct brw_tcs_prog_key tcs;
   struct brw_tes_prog_key tes;
   struct brw_gs_prog_key gs;
   struct brw_wm_prog_key wm;
   struct brw_cs_prog_key cs;
   struct brw_bs_prog_key bs;
};

/*
 * Image metadata structure as laid out in the shader parameter
 * buffer.  Entries have to be 16B-aligned for the vec4 back-end to be
 * able to use them.  That's okay because the padding and any unused
 * entries [most of them except when we're doing untyped surface
 * access] will be removed by the uniform packing pass.
 */
#define BRW_IMAGE_PARAM_OFFSET_OFFSET           0
#define BRW_IMAGE_PARAM_SIZE_OFFSET             4
#define BRW_IMAGE_PARAM_STRIDE_OFFSET           8
#define BRW_IMAGE_PARAM_TILING_OFFSET           12
#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET        16
#define BRW_IMAGE_PARAM_SIZE                    20

struct brw_image_param {
   /** Offset applied to the X and Y surface coordinates. */
   uint32_t offset[2];

   /** Surface X, Y and Z dimensions. */
   uint32_t size[3];

   /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in
    * pixels, vertical slice stride in pixels.
    */
   uint32_t stride[4];

   /** Log2 of the tiling modulus in the X, Y and Z dimension. */
   uint32_t tiling[3];

   /**
    * Right shift to apply for bit 6 address swizzling.  Two different
    * swizzles can be specified and will be applied one after the other.  The
    * resulting address will be:
    *
    *    addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^
    *                                (addr >> swizzling[1])))
    *
    * Use \c 0xff if any of the swizzles is not required.
    */
   uint32_t swizzling[2];
};
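/*
 * Worked example for the swizzling formula above (hypothetical values): with
 * swizzling[] = { 9, 10 }, bit 6 of the address is XOR'd with bits 15 and 16
 * of the original address:
 *
 *    addr' = addr ^ ((1 << 6) & ((addr >> 9) ^ (addr >> 10)));
 *
 * Per the comment above, an entry that isn't needed is set to 0xff.
 */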
/** Max number of render targets in a shader */
#define BRW_MAX_DRAW_BUFFERS 8

/**
 * Binding table index for the first gfx6 SOL binding.
 */
#define BRW_GFX6_SOL_BINDING_START 0

/**
 * Stride in bytes between shader_time entries.
 *
 * We separate entries by a cacheline to reduce traffic between EUs writing to
 * different entries.
 */
#define BRW_SHADER_TIME_STRIDE 64

struct brw_ubo_range
{
   uint16_t block;
   uint8_t start;
   uint8_t length;
};

/* We reserve the first 2^16 values for builtins */
#define BRW_PARAM_IS_BUILTIN(param) (((param) & 0xffff0000) == 0)

enum brw_param_builtin {
   BRW_PARAM_BUILTIN_ZERO,

   BRW_PARAM_BUILTIN_CLIP_PLANE_0_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_0_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_1_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_2_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_3_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_4_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_5_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_6_W,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_X,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_Y,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_Z,
   BRW_PARAM_BUILTIN_CLIP_PLANE_7_W,

   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_X,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Y,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_Z,
   BRW_PARAM_BUILTIN_TESS_LEVEL_OUTER_W,
   BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_X,
   BRW_PARAM_BUILTIN_TESS_LEVEL_INNER_Y,

   BRW_PARAM_BUILTIN_PATCH_VERTICES_IN,

   BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X,
   BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y,
   BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z,
   BRW_PARAM_BUILTIN_SUBGROUP_ID,
   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X,
   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y,
   BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z,
   BRW_PARAM_BUILTIN_WORK_DIM,
};

#define BRW_PARAM_BUILTIN_CLIP_PLANE(idx, comp) \
   (BRW_PARAM_BUILTIN_CLIP_PLANE_0_X + ((idx) << 2) + (comp))

#define BRW_PARAM_BUILTIN_IS_CLIP_PLANE(param)     \
   ((param) >= BRW_PARAM_BUILTIN_CLIP_PLANE_0_X && \
    (param) <= BRW_PARAM_BUILTIN_CLIP_PLANE_7_W)

#define BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(param) \
   (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) >> 2)

#define BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(param) \
   (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) & 0x3)
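/*
 * Worked example: BRW_PARAM_BUILTIN_CLIP_PLANE(2, 1) evaluates to
 * BRW_PARAM_BUILTIN_CLIP_PLANE_0_X + (2 << 2) + 1, which is
 * BRW_PARAM_BUILTIN_CLIP_PLANE_2_Y, and the inverse macros recover the
 * pieces:
 *
 *    BRW_PARAM_BUILTIN_CLIP_PLANE_IDX(BRW_PARAM_BUILTIN_CLIP_PLANE_2_Y) == 2
 *    BRW_PARAM_BUILTIN_CLIP_PLANE_COMP(BRW_PARAM_BUILTIN_CLIP_PLANE_2_Y) == 1
 */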
enum brw_shader_reloc_id {
   BRW_SHADER_RELOC_CONST_DATA_ADDR_LOW,
   BRW_SHADER_RELOC_CONST_DATA_ADDR_HIGH,
   BRW_SHADER_RELOC_SHADER_START_OFFSET,
   BRW_SHADER_RELOC_RESUME_SBT_ADDR_LOW,
   BRW_SHADER_RELOC_RESUME_SBT_ADDR_HIGH,
};

enum brw_shader_reloc_type {
   /** An arbitrary 32-bit value */
   BRW_SHADER_RELOC_TYPE_U32,
   /** A MOV instruction with an immediate source */
   BRW_SHADER_RELOC_TYPE_MOV_IMM,
};

/** Represents a code relocation
 *
 * Relocatable constants are immediates in the code which we want to be able
 * to replace post-compile with the actual value.
 */
struct brw_shader_reloc {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** Type of this relocation */
   enum brw_shader_reloc_type type;

   /** The offset in the shader to the relocated value
    *
    * For MOV_IMM relocs, this is an offset to the MOV instruction.  This
    * allows us to do some sanity checking while we update the value.
    */
   uint32_t offset;

   /** Value to be added to the relocated value before it is written */
   uint32_t delta;
};

/** A value to write to a relocation */
struct brw_shader_reloc_value {
   /** The 32-bit ID of the relocatable constant */
   uint32_t id;

   /** The value with which to replace the relocated immediate */
   uint32_t value;
};

struct brw_stage_prog_data {
   struct {
      /** size of our binding table. */
      uint32_t size_bytes;

      /** @{
       * surface indices for the various groups of surfaces
       */
      uint32_t pull_constants_start;
      uint32_t texture_start;
      uint32_t gather_texture_start;
      uint32_t ubo_start;
      uint32_t ssbo_start;
      uint32_t image_start;
      uint32_t shader_time_start;
      uint32_t plane_start[3];
      /** @} */
   } binding_table;

   struct brw_ubo_range ubo_ranges[4];

   GLuint nr_params;       /**< number of float params/constants */
   GLuint nr_pull_params;

   gl_shader_stage stage;

   /* zero_push_reg is a bitfield which indicates what push registers (if any)
    * should be zeroed by SW at the start of the shader.  The corresponding
    * push_reg_mask_param specifies the param index (in 32-bit units) where
    * the actual runtime 64-bit mask will be pushed.  The shader will zero
    * push reg i if
    *
    *    reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << i)
    *
    * If this field is set, brw_compiler::compact_params must be false.
    */
   uint64_t zero_push_reg;
   unsigned push_reg_mask_param;

   unsigned curb_read_length;
   unsigned total_scratch;
   unsigned total_shared;

   unsigned program_size;

   unsigned const_data_size;
   unsigned const_data_offset;

   unsigned num_relocs;
   const struct brw_shader_reloc *relocs;

   /** Does this program pull from any UBO or other constant buffers? */
   bool has_ubo_pull;

   /**
    * Register where the thread expects to find input data from the URB
    * (typically uniforms, followed by vertex or fragment attributes).
    */
   unsigned dispatch_grf_start_reg;

   bool use_alt_mode; /**< Use ALT floating point mode?  Otherwise, IEEE. */

   /* 32-bit identifiers for all push/pull parameters.  These can be anything
    * the driver wishes them to be; the core of the back-end compiler simply
    * re-arranges them.  The one restriction is that the bottom 2^16 values
    * are reserved for builtins defined in the brw_param_builtin enum defined
    * above.
    */
   uint32_t *param;
   uint32_t *pull_param;

   /* Whether shader uses atomic operations. */
   bool uses_atomic_load_store;
};
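/*
 * Worked example for zero_push_reg (hypothetical values): suppose the shader
 * reads push register 3, zero_push_reg has bit 3 set, and the runtime mask
 * pushed at push_reg_mask_param has bit 3 clear.  Then
 *
 *    reg_used & zero_push_reg & ~*push_reg_mask_param & (1ull << 3)
 *
 * is non-zero, so the shader zeroes register 3 before using it.
 */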
static inline uint32_t *
brw_stage_prog_data_add_params(struct brw_stage_prog_data *prog_data,
                               unsigned nr_new_params)
{
   unsigned old_nr_params = prog_data->nr_params;
   prog_data->nr_params += nr_new_params;
   prog_data->param = reralloc(ralloc_parent(prog_data->param),
                               prog_data->param, uint32_t,
                               prog_data->nr_params);
   return prog_data->param + old_nr_params;
}
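/*
 * Usage sketch (illustrative): reserving two new uniform slots and filling
 * them with builtin IDs.  Assumes prog_data->param was allocated with
 * ralloc, as the reralloc above requires:
 *
 *    uint32_t *param = brw_stage_prog_data_add_params(prog_data, 2);
 *    param[0] = BRW_PARAM_BUILTIN_SUBGROUP_ID;
 *    param[1] = BRW_PARAM_BUILTIN_WORK_DIM;
 */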
enum brw_barycentric_mode {
   BRW_BARYCENTRIC_PERSPECTIVE_PIXEL       = 0,
   BRW_BARYCENTRIC_PERSPECTIVE_CENTROID    = 1,
   BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE      = 2,
   BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL    = 3,
   BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
   BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE   = 5,
   BRW_BARYCENTRIC_MODE_COUNT              = 6
};
#define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \
   ((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE))

enum brw_pixel_shader_computed_depth_mode {
   BRW_PSCDEPTH_OFF   = 0, /* PS does not compute depth */
   BRW_PSCDEPTH_ON    = 1, /* PS computes depth; no guarantee about value */
   BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */
   BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
};

/* Data about a particular attempt to compile a program.  Note that
 * there can be many of these, each in a different GL state
 * corresponding to a different brw_wm_prog_key struct, with different
 * compiled programs.
 */
struct brw_wm_prog_data {
   struct brw_stage_prog_data base;

   GLuint num_varying_inputs;

   uint8_t reg_blocks_8;
   uint8_t reg_blocks_16;
   uint8_t reg_blocks_32;

   uint8_t dispatch_grf_start_reg_16;
   uint8_t dispatch_grf_start_reg_32;
   uint32_t prog_offset_16;
   uint32_t prog_offset_32;

   struct {
      /** @{
       * surface indices for the WM-specific surfaces
       */
      uint32_t render_target_read_start;
      /** @} */
   } binding_table;

   uint8_t computed_depth_mode;
   bool computed_stencil;

   bool early_fragment_tests;
   bool post_depth_coverage;
   bool inner_coverage;
   bool dispatch_8;
   bool dispatch_16;
   bool dispatch_32;
   bool dual_src_blend;
   bool persample_dispatch;
   bool uses_pos_offset;
   bool uses_omask;
   bool uses_kill;
   bool uses_src_depth;
   bool uses_src_w;
   bool uses_depth_w_coefficients;
   bool uses_sample_mask;
   bool has_render_target_reads;
   bool has_side_effects;
   bool pulls_bary;

   bool contains_flat_varying;
   bool contains_noperspective_varying;

   /**
    * Shader is run at the coarse pixel shading dispatch rate (3DSTATE_CPS).
    */
   bool per_coarse_pixel_dispatch;

   /**
    * Mask of which interpolation modes are required by the fragment shader.
    * Used in hardware setup on gfx6+.
    */
   uint32_t barycentric_interp_modes;

   /**
    * Mask of which FS inputs are marked flat by the shader source.  This is
    * needed for setting up 3DSTATE_SF/SBE.
    */
   uint32_t flat_inputs;

   /**
    * Bitfield of the FS inputs.
    */
   uint64_t inputs;

   /* Mapping of VUE slots to interpolation modes.
    * Used by the Gfx4-5 clip/sf/wm stages.
    */
   unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */

   /**
    * Map from gl_varying_slot to the position within the FS setup data
    * payload where the varying's attribute vertex deltas should be delivered.
    * For varying slots that are not used by the FS, the value is -1.
    */
   int urb_setup[VARYING_SLOT_MAX];

   /**
    * Cache into the urb_setup array above: the attribute numbers of the
    * varyings that are actually active in urb_setup.  The count of valid
    * entries is stored in urb_setup_attribs_count.
    */
   uint8_t urb_setup_attribs[VARYING_SLOT_MAX];
   uint8_t urb_setup_attribs_count;
};

/** Returns the SIMD width corresponding to a given KSP index
 *
 * The "Variable Pixel Dispatch" table in the PRM (which can be found, for
 * example in Vol. 7 of the SKL PRM) has a mapping from dispatch widths to
 * kernel start pointer (KSP) indices that is based on what dispatch widths
 * are enabled.  This function provides, effectively, the reverse mapping.
 *
 * If the given KSP is valid with respect to the SIMD8/16/32 enables, a SIMD
 * width of 8, 16, or 32 is returned.  If the KSP is invalid, 0 is returned.
 */
static inline unsigned
brw_fs_simd_width_for_ksp(unsigned ksp_idx, bool simd8_enabled,
                          bool simd16_enabled, bool simd32_enabled)
{
   /* This function strictly ignores contiguous dispatch */
   switch (ksp_idx) {
   case 0:
      return simd8_enabled ? 8 :
             (simd16_enabled && !simd32_enabled) ? 16 :
             (simd32_enabled && !simd16_enabled) ? 32 : 0;
   case 1:
      return (simd32_enabled && (simd16_enabled || simd8_enabled)) ? 32 : 0;
   case 2:
      return (simd16_enabled && (simd32_enabled || simd8_enabled)) ? 16 : 0;
   default:
      unreachable("Invalid KSP index");
   }
}

#define brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx) \
   brw_fs_simd_width_for_ksp((ksp_idx), (wm_state)._8PixelDispatchEnable, \
                             (wm_state)._16PixelDispatchEnable, \
                             (wm_state)._32PixelDispatchEnable)

#define brw_wm_state_has_ksp(wm_state, ksp_idx) \
   (brw_wm_state_simd_width_for_ksp((wm_state), (ksp_idx)) != 0)
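/*
 * Worked example: with SIMD8 and SIMD32 enabled but SIMD16 disabled,
 * brw_fs_simd_width_for_ksp() returns
 *
 *    ksp 0 -> 8,   ksp 1 -> 32,   ksp 2 -> 0
 *
 * matching the "Variable Pixel Dispatch" table referenced above.
 */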
static inline uint32_t
_brw_wm_prog_data_prog_offset(const struct brw_wm_prog_data *prog_data,
                              unsigned simd_width)
{
   switch (simd_width) {
   case 8: return 0;
   case 16: return prog_data->prog_offset_16;
   case 32: return prog_data->prog_offset_32;
   default: return 0;
   }
}

#define brw_wm_prog_data_prog_offset(prog_data, wm_state, ksp_idx) \
   _brw_wm_prog_data_prog_offset(prog_data, \
      brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx))

static inline uint8_t
_brw_wm_prog_data_dispatch_grf_start_reg(const struct brw_wm_prog_data *prog_data,
                                         unsigned simd_width)
{
   switch (simd_width) {
   case 8: return prog_data->base.dispatch_grf_start_reg;
   case 16: return prog_data->dispatch_grf_start_reg_16;
   case 32: return prog_data->dispatch_grf_start_reg_32;
   default: return 0;
   }
}

#define brw_wm_prog_data_dispatch_grf_start_reg(prog_data, wm_state, ksp_idx) \
   _brw_wm_prog_data_dispatch_grf_start_reg(prog_data, \
      brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx))

static inline uint8_t
_brw_wm_prog_data_reg_blocks(const struct brw_wm_prog_data *prog_data,
                             unsigned simd_width)
{
   switch (simd_width) {
   case 8: return prog_data->reg_blocks_8;
   case 16: return prog_data->reg_blocks_16;
   case 32: return prog_data->reg_blocks_32;
   default: return 0;
   }
}

#define brw_wm_prog_data_reg_blocks(prog_data, wm_state, ksp_idx) \
   _brw_wm_prog_data_reg_blocks(prog_data, \
      brw_wm_state_simd_width_for_ksp(wm_state, ksp_idx))

struct brw_push_const_block {
   unsigned dwords;  /* Dword count, not reg aligned */
   unsigned regs;
   unsigned size;    /* Bytes, register aligned */
};

struct brw_cs_prog_data {
   struct brw_stage_prog_data base;

   unsigned local_size[3];

   /* Program offsets for the 8/16/32 SIMD variants.  Multiple variants are
    * kept when using variable group size, and the right one can only be
    * decided at dispatch time.
    */
   unsigned prog_offset[3];

   /* Bitmask indicating which program offsets are valid. */
   unsigned prog_mask;

   /* Bitmask indicating which programs have spilled. */
   unsigned prog_spilled;

   bool uses_barrier;
   bool uses_num_work_groups;
   bool uses_inline_data;
   bool uses_btd_stack_ids;

   struct {
      struct brw_push_const_block cross_thread;
      struct brw_push_const_block per_thread;
   } push;

   struct {
      /** @{
       * surface indices for the CS-specific surfaces
       */
      uint32_t work_groups_start;
      /** @} */
   } binding_table;
};

static inline uint32_t
brw_cs_prog_data_prog_offset(const struct brw_cs_prog_data *prog_data,
                             unsigned dispatch_width)
{
   assert(dispatch_width == 8 ||
          dispatch_width == 16 ||
          dispatch_width == 32);
   const unsigned index = dispatch_width / 16;
   assert(prog_data->prog_mask & (1 << index));
   return prog_data->prog_offset[index];
}
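/*
 * Worked example: dispatch widths 8, 16, and 32 map to prog_offset[] indices
 * 8/16 == 0, 16/16 == 1, and 32/16 == 2, so the SIMD16 variant (valid when
 * prog_mask bit 1 is set) is fetched with
 *
 *    uint32_t offset = brw_cs_prog_data_prog_offset(prog_data, 16);
 */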
struct brw_bs_prog_data {
   struct brw_stage_prog_data base;

   /** SIMD size of the root shader */
   uint8_t simd_size;

   /** Maximum stack size of all shaders */
   uint32_t max_stack_size;

   /** Offset into the shader where the resume SBT is located */
   uint32_t resume_sbt_offset;
};

struct brw_ff_gs_prog_data {
   unsigned urb_read_length;
   unsigned total_grf;

   /**
    * Gfx6 transform feedback: Amount by which the streaming vertex buffer
    * indices should be incremented each time the GS is invoked.
    */
   unsigned svbi_postincrement_value;
};

/**
 * Enum representing the i965-specific vertex results that don't correspond
 * exactly to any element of gl_varying_slot.  The values of this enum are
 * assigned such that they don't conflict with gl_varying_slot.
 */
typedef enum
{
   BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX,
   BRW_VARYING_SLOT_PAD,
   /**
    * Technically this is not a varying but just a placeholder that
    * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord
    * builtin variable to be compiled correctly.  See compile_sf_prog() for
    * more info.
    */
   BRW_VARYING_SLOT_PNTC,
   BRW_VARYING_SLOT_COUNT
} brw_varying_slot;

/**
 * We always program SF to start reading at an offset of 1 (2 varying slots)
 * from the start of the vertex URB entry.  This causes it to skip:
 * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gfx4-5
 * - VARYING_SLOT_PSIZ and VARYING_SLOT_POS on gfx6+
 */
#define BRW_SF_URB_ENTRY_READ_OFFSET 1

/**
 * Bitmask indicating which fragment shader inputs represent varyings (and
 * hence have to be delivered to the fragment shader by the SF/SBE stage).
 */
#define BRW_FS_VARYING_INPUT_MASK \
   (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \
    ~VARYING_BIT_POS & ~VARYING_BIT_FACE)

/**
 * Data structure recording the relationship between the gl_varying_slot enum
 * and "slots" within the vertex URB entry (VUE).  A "slot" is defined as a
 * single octaword within the VUE (128 bits).
 *
 * Note that each BRW register contains 256 bits (2 octawords), so when
 * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two
 * consecutive VUE slots.  When accessing the VUE in URB_INTERLEAVED mode (as
 * in a vertex shader), each register corresponds to a single VUE slot, since
 * it contains data for two separate vertices.
 */
struct brw_vue_map {
   /**
    * Bitfield representing all varying slots that are (a) stored in this VUE
    * map, and (b) actually written by the shader.  Does not include any of
    * the additional varying slots defined in brw_varying_slot.
    */
   uint64_t slots_valid;

   /**
    * Is this VUE map for a separate shader pipeline?
    *
    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and
    * matched without the linker having a chance to dead code eliminate unused
    * varyings.
    *
    * This means that we have to use a fixed slot layout, based on the output's
    * location field, rather than assigning slots in a compact contiguous block.
    */
   bool separate;

   /**
    * Map from gl_varying_slot value to VUE slot.  For gl_varying_slots that
    * are not stored in a slot (because they are not written, or because
    * additional processing is applied before storing them in the VUE), the
    * value is -1.
    */
   signed char varying_to_slot[VARYING_SLOT_TESS_MAX];

   /**
    * Map from VUE slot to gl_varying_slot value.  For slots that do not
    * directly correspond to a gl_varying_slot, the value comes from
    * brw_varying_slot.
    *
    * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD.
    */
   signed char slot_to_varying[VARYING_SLOT_TESS_MAX];

   /**
    * Total number of VUE slots in use
    */
   int num_slots;

   /**
    * Number of per-patch VUE slots.  Only valid for tessellation control
    * shader outputs and tessellation evaluation shader inputs.
    */
   int num_per_patch_slots;

   /**
    * Number of per-vertex VUE slots.  Only valid for tessellation control
    * shader outputs and tessellation evaluation shader inputs.
    */
   int num_per_vertex_slots;
};

void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map,
                       gl_shader_stage stage);

/**
 * Convert a VUE slot number into a byte offset within the VUE.
 */
static inline GLuint brw_vue_slot_to_offset(GLuint slot)
{
   return 16 * slot;
}

/**
 * Convert a vertex output (brw_varying_slot) into a byte offset within the
 * VUE.
 */
static inline
GLuint brw_varying_to_offset(const struct brw_vue_map *vue_map, GLuint varying)
{
   return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
}
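/*
 * Worked example (hypothetical VUE map): a varying mapped to VUE slot 3
 * lives at byte offset 3 * 16 == 48, since each slot is a 128-bit octaword.
 * So if vue_map->varying_to_slot[VARYING_SLOT_COL0] == 3, then
 *
 *    brw_varying_to_offset(vue_map, VARYING_SLOT_COL0) == 48
 */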
void brw_compute_vue_map(const struct intel_device_info *devinfo,
                         struct brw_vue_map *vue_map,
                         uint64_t slots_valid,
                         bool separate_shader,
                         uint32_t pos_slots);

void brw_compute_tess_vue_map(struct brw_vue_map *const vue_map,
                              uint64_t slots_valid,
                              uint32_t is_patch);

/* brw_interpolation_map.c */
void brw_setup_vue_interpolation(const struct brw_vue_map *vue_map,
                                 struct nir_shader *nir,
                                 struct brw_wm_prog_data *prog_data);

enum shader_dispatch_mode {
   DISPATCH_MODE_4X1_SINGLE = 0,
   DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
   DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
   DISPATCH_MODE_SIMD8 = 3,

   DISPATCH_MODE_TCS_SINGLE_PATCH = 0,
   DISPATCH_MODE_TCS_8_PATCH = 2,
};

/**
 * @defgroup Tessellator parameter enumerations.
 *
 * These correspond to the hardware values in 3DSTATE_TE, and are provided
 * as part of the tessellation evaluation shader.
 *
 * @{
 */
enum brw_tess_partitioning {
   BRW_TESS_PARTITIONING_INTEGER         = 0,
   BRW_TESS_PARTITIONING_ODD_FRACTIONAL  = 1,
   BRW_TESS_PARTITIONING_EVEN_FRACTIONAL = 2,
};

enum brw_tess_output_topology {
   BRW_TESS_OUTPUT_TOPOLOGY_POINT   = 0,
   BRW_TESS_OUTPUT_TOPOLOGY_LINE    = 1,
   BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW  = 2,
   BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3,
};

enum brw_tess_domain {
   BRW_TESS_DOMAIN_QUAD    = 0,
   BRW_TESS_DOMAIN_TRI     = 1,
   BRW_TESS_DOMAIN_ISOLINE = 2,
};
/** @} */

struct brw_vue_prog_data {
   struct brw_stage_prog_data base;
   struct brw_vue_map vue_map;

   /** Should the hardware deliver input VUE handles for URB pull loads? */
   bool include_vue_handles;

   GLuint urb_read_length;
   GLuint total_grf;

   uint32_t clip_distance_mask;
   uint32_t cull_distance_mask;

   /* Used for calculating urb partitions.  In the VS, this is the size of the
    * URB entry used for both input and output to the thread.  In the GS, this
    * is the size of the URB entry used for output.
    */
   GLuint urb_entry_size;

   enum shader_dispatch_mode dispatch_mode;
};

struct brw_vs_prog_data {
   struct brw_vue_prog_data base;

   GLbitfield64 inputs_read;
   GLbitfield64 double_inputs_read;

   unsigned nr_attribute_slots;

   bool uses_vertexid;
   bool uses_instanceid;
   bool uses_is_indexed_draw;
   bool uses_firstvertex;
   bool uses_baseinstance;
   bool uses_drawid;
};

struct brw_tcs_prog_data
{
   struct brw_vue_prog_data base;

   /** Should the non-SINGLE_PATCH payload provide primitive ID? */
   bool include_primitive_id;

   /** Number of vertices in the output patch */
   int instances;

   /** Track patch count threshold */
   int patch_count_threshold;
};


struct brw_tes_prog_data
{
   struct brw_vue_prog_data base;

   enum brw_tess_partitioning partitioning;
   enum brw_tess_output_topology output_topology;
   enum brw_tess_domain domain;
};

struct brw_gs_prog_data
{
   struct brw_vue_prog_data base;

   unsigned vertices_in;

   /**
    * Size of an output vertex, measured in HWORDS (32 bytes).
    */
   unsigned output_vertex_size_hwords;

   unsigned output_topology;

   /**
    * Size of the control data (cut bits or StreamID bits), in hwords (32
    * bytes).  0 if there is no control data.
    */
   unsigned control_data_header_size_hwords;

   /**
    * Format of the control data (either GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
    * if the control data is StreamID bits, or
    * GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits).
    * Ignored if control_data_header_size is 0.
    */
   unsigned control_data_format;

   bool include_primitive_id;

   /**
    * The number of vertices emitted, if constant; otherwise -1.
    */
   int static_vertex_count;

   int invocations;

   /**
    * Gfx6: Provoking vertex convention for odd-numbered triangles
    * in tristrips.
    */
   GLuint pv_first:1;

   /**
    * Gfx6: Number of varyings that are output to transform feedback.
    */
   GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */

   /**
    * Gfx6: Map from the index of a transform feedback binding table entry to
    * the gl_varying_slot that should be streamed out through that binding
    * table entry.
    */
   unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */];

   /**
    * Gfx6: Map from the index of a transform feedback binding table entry to
    * the swizzles that should be used when streaming out data through that
    * binding table entry.
    */
   unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */];
};
struct brw_sf_prog_data {
   uint32_t urb_read_length;
   uint32_t total_grf;

   /* Each vertex may have up to 12 attributes, 4 components each, except
    * WPOS, which requires only 2.  (11*4 + 2) == 46 components ==> 12
    * rows.
    *
    * In practice we allocate 4 components for every attribute, so call it
    * 12 rows.
    */
   unsigned urb_entry_size;
};

struct brw_clip_prog_data {
   uint32_t curb_read_length; /* user planes? */
   uint32_t clip_mode;
   uint32_t urb_read_length;
   uint32_t total_grf;
};

/* brw_any_prog_data is prog_data for any stage that maps to an API stage */
union brw_any_prog_data {
   struct brw_stage_prog_data base;
   struct brw_vue_prog_data vue;
   struct brw_vs_prog_data vs;
   struct brw_tcs_prog_data tcs;
   struct brw_tes_prog_data tes;
   struct brw_gs_prog_data gs;
   struct brw_wm_prog_data wm;
   struct brw_cs_prog_data cs;
   struct brw_bs_prog_data bs;
};

#define DEFINE_PROG_DATA_DOWNCAST(STAGE, CHECK)                             \
static inline struct brw_##STAGE##_prog_data *                              \
brw_##STAGE##_prog_data(struct brw_stage_prog_data *prog_data)              \
{                                                                           \
   if (prog_data)                                                           \
      assert(CHECK);                                                        \
   return (struct brw_##STAGE##_prog_data *) prog_data;                     \
}                                                                           \
static inline const struct brw_##STAGE##_prog_data *                        \
brw_##STAGE##_prog_data_const(const struct brw_stage_prog_data *prog_data)  \
{                                                                           \
   if (prog_data)                                                           \
      assert(CHECK);                                                        \
   return (const struct brw_##STAGE##_prog_data *) prog_data;               \
}

DEFINE_PROG_DATA_DOWNCAST(vs,  prog_data->stage == MESA_SHADER_VERTEX)
DEFINE_PROG_DATA_DOWNCAST(tcs, prog_data->stage == MESA_SHADER_TESS_CTRL)
DEFINE_PROG_DATA_DOWNCAST(tes, prog_data->stage == MESA_SHADER_TESS_EVAL)
DEFINE_PROG_DATA_DOWNCAST(gs,  prog_data->stage == MESA_SHADER_GEOMETRY)
DEFINE_PROG_DATA_DOWNCAST(wm,  prog_data->stage == MESA_SHADER_FRAGMENT)
DEFINE_PROG_DATA_DOWNCAST(cs,  prog_data->stage == MESA_SHADER_COMPUTE)
DEFINE_PROG_DATA_DOWNCAST(bs,  brw_shader_stage_is_bindless(prog_data->stage))

DEFINE_PROG_DATA_DOWNCAST(vue, prog_data->stage == MESA_SHADER_VERTEX ||
                               prog_data->stage == MESA_SHADER_TESS_CTRL ||
                               prog_data->stage == MESA_SHADER_TESS_EVAL ||
                               prog_data->stage == MESA_SHADER_GEOMETRY)

/* These are not really brw_stage_prog_data. */
DEFINE_PROG_DATA_DOWNCAST(ff_gs, true)
DEFINE_PROG_DATA_DOWNCAST(clip,  true)
DEFINE_PROG_DATA_DOWNCAST(sf,    true)
#undef DEFINE_PROG_DATA_DOWNCAST
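/*
 * Usage sketch (illustrative): downcasting a generic prog_data once the
 * stage is known.  The assert in the generated helper catches stage
 * mismatches in debug builds:
 *
 *    struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);
 *    // asserts prog_data->stage == MESA_SHADER_FRAGMENT
 */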
struct brw_compile_stats {
   uint32_t dispatch_width; /**< 0 for vec4 */
   uint32_t instructions;
   uint32_t sends;
   uint32_t loops;
   uint32_t cycles;
   uint32_t spills;
   uint32_t fills;
};

/** @} */

struct brw_compiler *
brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo);

/**
 * Returns a compiler configuration for use with disk shader cache
 *
 * This value only needs to change for settings that can cause different
 * program generation between two runs on the same hardware.
 *
 * For example, it doesn't need to be different for gen 8 and gen 9 hardware,
 * but it does need to be different if INTEL_DEBUG=nocompact is or isn't used.
 */
uint64_t
brw_get_compiler_config_value(const struct brw_compiler *compiler);

unsigned
brw_prog_data_size(gl_shader_stage stage);

unsigned
brw_prog_key_size(gl_shader_stage stage);

void
brw_prog_key_set_id(union brw_any_prog_key *key, gl_shader_stage, unsigned id);

/**
 * Parameters for compiling a vertex shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_vs_params {
   nir_shader *nir;

   const struct brw_vs_prog_key *key;
   struct brw_vs_prog_data *prog_data;

   bool edgeflag_is_last; /* true for gallium */
   bool shader_time;
   int shader_time_index;

   struct brw_compile_stats *stats;

   void *log_data;

   char *error_str;

   /* If unset, DEBUG_VS is used. */
   uint64_t debug_flag;
};

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_vs(const struct brw_compiler *compiler,
               void *mem_ctx,
               struct brw_compile_vs_params *params);
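/*
 * Usage sketch (illustrative, error handling elided; "driver_ctx" is a
 * hypothetical log_data pointer and "mem_ctx" a ralloc context owned by the
 * caller; fields not shown keep their zeroed defaults):
 *
 *    struct brw_compile_vs_params params = {
 *       .nir = nir,
 *       .key = &key,
 *       .prog_data = &prog_data,
 *       .log_data = driver_ctx,
 *    };
 *    const unsigned *assembly = brw_compile_vs(compiler, mem_ctx, &params);
 *    if (assembly == NULL)
 *       fprintf(stderr, "VS compile failed: %s\n", params.error_str);
 */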
/**
 * Compile a tessellation control shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_tcs(const struct brw_compiler *compiler,
                void *log_data,
                void *mem_ctx,
                const struct brw_tcs_prog_key *key,
                struct brw_tcs_prog_data *prog_data,
                nir_shader *nir,
                int shader_time_index,
                struct brw_compile_stats *stats,
                char **error_str);

/**
 * Compile a tessellation evaluation shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_tes(const struct brw_compiler *compiler, void *log_data,
                void *mem_ctx,
                const struct brw_tes_prog_key *key,
                const struct brw_vue_map *input_vue_map,
                struct brw_tes_prog_data *prog_data,
                nir_shader *nir,
                int shader_time_index,
                struct brw_compile_stats *stats,
                char **error_str);

/**
 * Compile a geometry shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_gs_prog_key *key,
               struct brw_gs_prog_data *prog_data,
               nir_shader *nir,
               int shader_time_index,
               struct brw_compile_stats *stats,
               char **error_str);

/**
 * Compile a strips and fans shader.
 *
 * This is a fixed-function shader determined entirely by the shader key and
 * a VUE map.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_sf(const struct brw_compiler *compiler,
               void *mem_ctx,
               const struct brw_sf_prog_key *key,
               struct brw_sf_prog_data *prog_data,
               struct brw_vue_map *vue_map,
               unsigned *final_assembly_size);

/**
 * Compile a clipper shader.
 *
 * This is a fixed-function shader determined entirely by the shader key and
 * a VUE map.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_clip(const struct brw_compiler *compiler,
                 void *mem_ctx,
                 const struct brw_clip_prog_key *key,
                 struct brw_clip_prog_data *prog_data,
                 struct brw_vue_map *vue_map,
                 unsigned *final_assembly_size);

/**
 * Parameters for compiling a fragment shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_fs_params {
   nir_shader *nir;

   const struct brw_wm_prog_key *key;
   struct brw_wm_prog_data *prog_data;
   const struct brw_vue_map *vue_map;

   bool shader_time;
   int shader_time_index8;
   int shader_time_index16;
   int shader_time_index32;

   bool allow_spilling;
   bool use_rep_send;

   struct brw_compile_stats *stats;

   void *log_data;

   char *error_str;

   /* If unset, DEBUG_WM is used. */
   uint64_t debug_flag;
};

/**
 * Compile a fragment shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_fs(const struct brw_compiler *compiler,
               void *mem_ctx,
               struct brw_compile_fs_params *params);
/**
 * Parameters for compiling a compute shader.
 *
 * Some of these will be modified during the shader compilation.
 */
struct brw_compile_cs_params {
   nir_shader *nir;

   const struct brw_cs_prog_key *key;
   struct brw_cs_prog_data *prog_data;

   bool shader_time;
   int shader_time_index;

   struct brw_compile_stats *stats;

   void *log_data;

   char *error_str;

   /* If unset, DEBUG_CS is used. */
   uint64_t debug_flag;
};

/**
 * Compile a compute shader.
 *
 * Returns the final assembly and updates the parameters structure.
 */
const unsigned *
brw_compile_cs(const struct brw_compiler *compiler,
               void *mem_ctx,
               struct brw_compile_cs_params *params);

/**
 * Compile a Ray Tracing shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_bs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_bs_prog_key *key,
               struct brw_bs_prog_data *prog_data,
               struct nir_shader *shader,
               unsigned num_resume_shaders,
               struct nir_shader **resume_shaders,
               struct brw_compile_stats *stats,
               char **error_str);

/**
 * Compile a fixed function geometry shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_compile_ff_gs_prog(struct brw_compiler *compiler,
                       void *mem_ctx,
                       const struct brw_ff_gs_prog_key *key,
                       struct brw_ff_gs_prog_data *prog_data,
                       struct brw_vue_map *vue_map,
                       unsigned *final_assembly_size);

void brw_debug_key_recompile(const struct brw_compiler *c, void *log,
                             gl_shader_stage stage,
                             const struct brw_base_prog_key *old_key,
                             const struct brw_base_prog_key *key);

/* Shared Local Memory Size is specified as powers of two, and also has a
 * Gen-dependent minimum value when not zero.
 */
static inline uint32_t
intel_calculate_slm_size(unsigned gen, uint32_t bytes)
{
   assert(bytes <= 64 * 1024);
   if (bytes > 0)
      return MAX2(util_next_power_of_two(bytes), gen >= 9 ? 1024 : 4096);
   else
      return 0;
}

static inline uint32_t
encode_slm_size(unsigned gen, uint32_t bytes)
{
   uint32_t slm_size = 0;

   /* Shared Local Memory is specified as powers of two, and encoded in
    * INTERFACE_DESCRIPTOR_DATA with the following representations:
    *
    * Size   | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
    * -------------------------------------------------------------------
    * Gfx7-8 |    0 | none | none |    1 |    2 |     4 |     8 |    16 |
    * -------------------------------------------------------------------
    * Gfx9+  |    0 |    1 |    2 |    3 |    4 |     5 |     6 |     7 |
    */

   if (bytes > 0) {
      slm_size = intel_calculate_slm_size(gen, bytes);
      assert(util_is_power_of_two_nonzero(slm_size));

      if (gen >= 9) {
         /* Turn an exponent of 10 (1 KB == 1024 bytes) into 1. */
         assert(slm_size >= 1024);
         slm_size = ffs(slm_size) - 10;
      } else {
         assert(slm_size >= 4096);
         /* Convert to the pre-Gfx9 representation. */
         slm_size = slm_size / 4096;
      }
   }

   return slm_size;
}
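/*
 * Worked example: 6 KB of shared memory first rounds up to the next power of
 * two (8 KB), then encodes per the table above:
 *
 *    encode_slm_size(9, 6 * 1024) == 4   (Gfx9+:  ffs(8192) - 10)
 *    encode_slm_size(7, 6 * 1024) == 2   (Gfx7-8: 8192 / 4096)
 */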
unsigned
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
                             unsigned threads);

void
brw_write_shader_relocs(const struct intel_device_info *devinfo,
                        void *program,
                        const struct brw_stage_prog_data *prog_data,
                        struct brw_shader_reloc_value *values,
                        unsigned num_values);

struct brw_cs_dispatch_info {
   uint32_t group_size;
   uint32_t simd_size;
   uint32_t threads;

   /* RightExecutionMask field used in GPGPU_WALKER. */
   uint32_t right_mask;
};

/**
 * Get the dispatch information for a shader to be used with GPGPU_WALKER and
 * similar instructions.
 *
 * If override_local_size is not NULL, it must point to a 3-element array
 * that will override the value from prog_data->local_size.  This is used by
 * ARB_compute_variable_group_size, where the size is set only at dispatch
 * time (so prog_data is outdated).
 */
struct brw_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct brw_cs_prog_data *prog_data,
                         const unsigned *override_local_size);

/**
 * Return true if the given shader stage is dispatched contiguously by the
 * relevant fixed function starting from channel 0 of the SIMD thread, which
 * implies that the dispatch mask of a thread can be assumed to have the form
 * '2^n - 1' for some n.
 */
static inline bool
brw_stage_has_packed_dispatch(ASSERTED const struct intel_device_info *devinfo,
                              gl_shader_stage stage,
                              const struct brw_stage_prog_data *prog_data)
{
   /* The code below makes assumptions about the hardware's thread dispatch
    * behavior that could be proven wrong in future generations -- Make sure
    * to do a full test run with brw_fs_test_dispatch_packing() hooked up to
    * the NIR front-end before changing this assertion.
    */
   assert(devinfo->ver <= 12);

   switch (stage) {
   case MESA_SHADER_FRAGMENT: {
      /* The PSD discards subspans coming in with no lit samples, which in the
       * per-pixel shading case implies that each subspan will either be fully
       * lit (due to the VMask being used to allow derivative computations),
       * or not dispatched at all.  In per-sample dispatch mode individual
       * samples from the same subspan have a fixed relative location within
       * the SIMD thread, so dispatch of unlit samples cannot be avoided in
       * general and we should return false.
       */
      const struct brw_wm_prog_data *wm_prog_data =
         (const struct brw_wm_prog_data *)prog_data;
      return !wm_prog_data->persample_dispatch;
   }
   case MESA_SHADER_COMPUTE:
      /* Compute shaders will be spawned with either a fully enabled dispatch
       * mask or with whatever bottom/right execution mask was given to the
       * GPGPU walker command to be used along the workgroup edges -- In both
       * cases the dispatch mask is required to be tightly packed for our
       * invocation index calculations to work.
       */
      return true;
   default:
      /* Most remaining fixed functions are limited to use a packed dispatch
       * mask due to the hardware representation of the dispatch mask as a
       * single counter representing the number of enabled channels.
       */
      return true;
   }
}

/**
 * Computes the first varying slot in the URB produced by the previous stage
 * that is used in the next stage.  We do this by testing the varying slots
 * in the previous stage's vue map against the inputs read in the next stage.
 *
 * Note that:
 *
 * - Each URB offset contains two varying slots and we can only skip a
 *   full offset if both slots are unused, so the value we return here is
 *   always rounded down to the closest multiple of two.
 *
 * - gl_Layer and gl_ViewportIndex don't have their own varying slots, they
 *   are part of the vue header, so if these are read we can't skip anything.
 */
static inline int
brw_compute_first_urb_slot_required(uint64_t inputs_read,
                                    const struct brw_vue_map *prev_stage_vue_map)
{
   if ((inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT)) == 0) {
      for (int i = 0; i < prev_stage_vue_map->num_slots; i++) {
         int varying = prev_stage_vue_map->slot_to_varying[i];
         if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0)
            return ROUND_DOWN_TO(i, 2);
      }
   }

   return 0;
}
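/*
 * Worked example (hypothetical VUE map): if the first varying read by the
 * next stage sits in slot 5 of the previous stage's VUE map, the result is
 * ROUND_DOWN_TO(5, 2) == 4, since each URB offset covers two slots.  If
 * gl_Layer or gl_ViewportIndex is read, the function returns 0 instead.
 */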
#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* BRW_COMPILER_H */