/* ir3_shader.h revision 7ec681f3 */
1/* 2 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 
22 * 23 * Authors: 24 * Rob Clark <robclark@freedesktop.org> 25 */ 26 27#ifndef IR3_SHADER_H_ 28#define IR3_SHADER_H_ 29 30#include <stdio.h> 31 32#include "c11/threads.h" 33#include "compiler/nir/nir.h" 34#include "compiler/shader_enums.h" 35#include "util/bitscan.h" 36#include "util/disk_cache.h" 37 38#include "ir3_compiler.h" 39 40struct glsl_type; 41 42/* driver param indices: */ 43enum ir3_driver_param { 44 /* compute shader driver params: */ 45 IR3_DP_NUM_WORK_GROUPS_X = 0, 46 IR3_DP_NUM_WORK_GROUPS_Y = 1, 47 IR3_DP_NUM_WORK_GROUPS_Z = 2, 48 IR3_DP_BASE_GROUP_X = 4, 49 IR3_DP_BASE_GROUP_Y = 5, 50 IR3_DP_BASE_GROUP_Z = 6, 51 IR3_DP_SUBGROUP_SIZE = 7, 52 IR3_DP_LOCAL_GROUP_SIZE_X = 8, 53 IR3_DP_LOCAL_GROUP_SIZE_Y = 9, 54 IR3_DP_LOCAL_GROUP_SIZE_Z = 10, 55 IR3_DP_SUBGROUP_ID_SHIFT = 11, 56 /* NOTE: gl_NumWorkGroups should be vec4 aligned because 57 * glDispatchComputeIndirect() needs to load these from 58 * the info->indirect buffer. Keep that in mind when/if 59 * adding any addition CS driver params. 60 */ 61 IR3_DP_CS_COUNT = 12, /* must be aligned to vec4 */ 62 63 /* vertex shader driver params: */ 64 IR3_DP_DRAWID = 0, 65 IR3_DP_VTXID_BASE = 1, 66 IR3_DP_INSTID_BASE = 2, 67 IR3_DP_VTXCNT_MAX = 3, 68 /* user-clip-plane components, up to 8x vec4's: */ 69 IR3_DP_UCP0_X = 4, 70 /* .... */ 71 IR3_DP_UCP7_W = 35, 72 IR3_DP_VS_COUNT = 36 /* must be aligned to vec4 */ 73}; 74 75#define IR3_MAX_SHADER_BUFFERS 32 76#define IR3_MAX_SHADER_IMAGES 32 77#define IR3_MAX_SO_BUFFERS 4 78#define IR3_MAX_SO_STREAMS 4 79#define IR3_MAX_SO_OUTPUTS 64 80#define IR3_MAX_UBO_PUSH_RANGES 32 81 82/* mirrors SYSTEM_VALUE_BARYCENTRIC_ but starting from 0 */ 83enum ir3_bary { 84 IJ_PERSP_PIXEL, 85 IJ_PERSP_SAMPLE, 86 IJ_PERSP_CENTROID, 87 IJ_PERSP_SIZE, 88 IJ_LINEAR_PIXEL, 89 IJ_LINEAR_CENTROID, 90 IJ_LINEAR_SAMPLE, 91 IJ_COUNT, 92}; 93 94/** 95 * Description of a lowered UBO. 
96 */ 97struct ir3_ubo_info { 98 uint32_t block; /* Which constant block */ 99 uint16_t bindless_base; /* For bindless, which base register is used */ 100 bool bindless; 101}; 102 103/** 104 * Description of a range of a lowered UBO access. 105 * 106 * Drivers should not assume that there are not multiple disjoint 107 * lowered ranges of a single UBO. 108 */ 109struct ir3_ubo_range { 110 struct ir3_ubo_info ubo; 111 uint32_t offset; /* start offset to push in the const register file */ 112 uint32_t start, end; /* range of block that's actually used */ 113}; 114 115struct ir3_ubo_analysis_state { 116 struct ir3_ubo_range range[IR3_MAX_UBO_PUSH_RANGES]; 117 uint32_t num_enabled; 118 uint32_t size; 119 uint32_t 120 cmdstream_size; /* for per-gen backend to stash required cmdstream size */ 121}; 122 123/** 124 * Describes the layout of shader consts. This includes: 125 * + User consts + driver lowered UBO ranges 126 * + SSBO sizes 127 * + Image sizes/dimensions 128 * + Driver params (ie. IR3_DP_*) 129 * + TFBO addresses (for generations that do not have hardware streamout) 130 * + Lowered immediates 131 * 132 * For consts needed to pass internal values to shader which may or may not 133 * be required, rather than allocating worst-case const space, we scan the 134 * shader and allocate consts as-needed: 135 * 136 * + SSBO sizes: only needed if shader has a get_ssbo_size intrinsic 137 * for a given SSBO 138 * 139 * + Image dimensions: needed to calculate pixel offset, but only for 140 * images that have a image_store intrinsic 141 * 142 * Layout of constant registers, each section aligned to vec4. Note 143 * that pointer size (ubo, etc) changes depending on generation. 
144 * 145 * user consts 146 * UBO addresses 147 * SSBO sizes 148 * if (vertex shader) { 149 * driver params (IR3_DP_*) 150 * if (stream_output.num_outputs > 0) 151 * stream-out addresses 152 * } else if (compute_shader) { 153 * driver params (IR3_DP_*) 154 * } 155 * immediates 156 * 157 * Immediates go last mostly because they are inserted in the CP pass 158 * after the nir -> ir3 frontend. 159 * 160 * Note UBO size in bytes should be aligned to vec4 161 */ 162struct ir3_const_state { 163 unsigned num_ubos; 164 unsigned num_driver_params; /* scalar */ 165 166 /* UBO that should be mapped to the NIR shader's constant_data (or -1). */ 167 int32_t constant_data_ubo; 168 169 struct { 170 /* user const start at zero */ 171 unsigned ubo; 172 unsigned image_dims; 173 unsigned driver_param; 174 unsigned tfbo; 175 unsigned primitive_param; 176 unsigned primitive_map; 177 unsigned immediate; 178 } offsets; 179 180 struct { 181 uint32_t mask; /* bitmask of images that have image_store */ 182 uint32_t count; /* number of consts allocated */ 183 /* three const allocated per image which has image_store: 184 * + cpp (bytes per pixel) 185 * + pitch (y pitch) 186 * + array_pitch (z pitch) 187 */ 188 uint32_t off[IR3_MAX_SHADER_IMAGES]; 189 } image_dims; 190 191 unsigned immediates_count; 192 unsigned immediates_size; 193 uint32_t *immediates; 194 195 /* State of ubo access lowered to push consts: */ 196 struct ir3_ubo_analysis_state ubo_state; 197}; 198 199/** 200 * A single output for vertex transform feedback. 201 */ 202struct ir3_stream_output { 203 unsigned register_index : 6; /**< 0 to 63 (OUT index) */ 204 unsigned start_component : 2; /** 0 to 3 */ 205 unsigned num_components : 3; /** 1 to 4 */ 206 unsigned output_buffer : 3; /**< 0 to PIPE_MAX_SO_BUFFERS */ 207 unsigned dst_offset : 16; /**< offset into the buffer in dwords */ 208 unsigned stream : 2; /**< 0 to 3 */ 209}; 210 211/** 212 * Stream output for vertex transform feedback. 
213 */ 214struct ir3_stream_output_info { 215 unsigned num_outputs; 216 /** stride for an entire vertex for each buffer in dwords */ 217 uint16_t stride[IR3_MAX_SO_BUFFERS]; 218 219 /* These correspond to the VPC_SO_STREAM_CNTL fields */ 220 uint8_t streams_written; 221 uint8_t buffer_to_stream[IR3_MAX_SO_BUFFERS]; 222 223 /** 224 * Array of stream outputs, in the order they are to be written in. 225 * Selected components are tightly packed into the output buffer. 226 */ 227 struct ir3_stream_output output[IR3_MAX_SO_OUTPUTS]; 228}; 229 230/** 231 * Starting from a4xx, HW supports pre-dispatching texture sampling 232 * instructions prior to scheduling a shader stage, when the 233 * coordinate maps exactly to an output of the previous stage. 234 */ 235 236/** 237 * There is a limit in the number of pre-dispatches allowed for any 238 * given stage. 239 */ 240#define IR3_MAX_SAMPLER_PREFETCH 4 241 242/** 243 * This is the output stream value for 'cmd', as used by blob. It may 244 * encode the return type (in 3 bits) but it hasn't been verified yet. 245 */ 246#define IR3_SAMPLER_PREFETCH_CMD 0x4 247#define IR3_SAMPLER_BINDLESS_PREFETCH_CMD 0x6 248 249/** 250 * Stream output for texture sampling pre-dispatches. 251 */ 252struct ir3_sampler_prefetch { 253 uint8_t src; 254 uint8_t samp_id; 255 uint8_t tex_id; 256 uint16_t samp_bindless_id; 257 uint16_t tex_bindless_id; 258 uint8_t dst; 259 uint8_t wrmask; 260 uint8_t half_precision; 261 uint8_t cmd; 262}; 263 264/* Configuration key used to identify a shader variant.. different 265 * shader variants can be used to implement features not supported 266 * in hw (two sided color), binning-pass vertex shader, etc. 267 * 268 * When adding to this struct, please update ir3_shader_variant()'s debug 269 * output. 270 */ 271struct ir3_shader_key { 272 union { 273 struct { 274 /* 275 * Combined Vertex/Fragment shader parameters: 276 */ 277 unsigned ucp_enables : 8; 278 279 /* do we need to check {v,f}saturate_{s,t,r}? 
*/ 280 unsigned has_per_samp : 1; 281 282 /* 283 * Fragment shader variant parameters: 284 */ 285 unsigned sample_shading : 1; 286 unsigned msaa : 1; 287 /* used when shader needs to handle flat varyings (a4xx) 288 * for front/back color inputs to frag shader: 289 */ 290 unsigned rasterflat : 1; 291 292 /* Indicates that this is a tessellation pipeline which requires a 293 * whole different kind of vertex shader. In case of 294 * tessellation, this field also tells us which kind of output 295 * topology the TES uses, which the TCS needs to know. 296 */ 297#define IR3_TESS_NONE 0 298#define IR3_TESS_TRIANGLES 1 299#define IR3_TESS_QUADS 2 300#define IR3_TESS_ISOLINES 3 301 unsigned tessellation : 2; 302 303 unsigned has_gs : 1; 304 305 /* Whether stages after TCS read gl_PrimitiveID, used to determine 306 * whether the TCS has to store it in the tess factor BO. 307 */ 308 unsigned tcs_store_primid : 1; 309 310 /* Whether this variant sticks to the "safe" maximum constlen, 311 * which guarantees that the combined stages will never go over 312 * the limit: 313 */ 314 unsigned safe_constlen : 1; 315 316 /* Whether gl_Layer must be forced to 0 because it isn't written. */ 317 unsigned layer_zero : 1; 318 319 /* Whether gl_ViewportIndex must be forced to 0 because it isn't 320 * written. 
*/ 321 unsigned view_zero : 1; 322 }; 323 uint32_t global; 324 }; 325 326 /* bitmask of ms shifts (a3xx) */ 327 uint32_t vsamples, fsamples; 328 329 /* bitmask of samplers which need astc srgb workaround (a4xx+a5xx): */ 330 uint16_t vastc_srgb, fastc_srgb; 331}; 332 333static inline unsigned 334ir3_tess_mode(unsigned gl_tess_mode) 335{ 336 switch (gl_tess_mode) { 337 case GL_ISOLINES: 338 return IR3_TESS_ISOLINES; 339 case GL_TRIANGLES: 340 return IR3_TESS_TRIANGLES; 341 case GL_QUADS: 342 return IR3_TESS_QUADS; 343 default: 344 unreachable("bad tessmode"); 345 } 346} 347 348static inline bool 349ir3_shader_key_equal(const struct ir3_shader_key *a, 350 const struct ir3_shader_key *b) 351{ 352 /* slow-path if we need to check {v,f}saturate_{s,t,r} */ 353 if (a->has_per_samp || b->has_per_samp) 354 return memcmp(a, b, sizeof(struct ir3_shader_key)) == 0; 355 return a->global == b->global; 356} 357 358/* will the two keys produce different lowering for a fragment shader? */ 359static inline bool 360ir3_shader_key_changes_fs(struct ir3_shader_key *key, 361 struct ir3_shader_key *last_key) 362{ 363 if (last_key->has_per_samp || key->has_per_samp) { 364 if ((last_key->fsamples != key->fsamples) || 365 (last_key->fastc_srgb != key->fastc_srgb)) 366 return true; 367 } 368 369 if (last_key->rasterflat != key->rasterflat) 370 return true; 371 372 if (last_key->layer_zero != key->layer_zero) 373 return true; 374 375 if (last_key->ucp_enables != key->ucp_enables) 376 return true; 377 378 if (last_key->safe_constlen != key->safe_constlen) 379 return true; 380 381 return false; 382} 383 384/* will the two keys produce different lowering for a vertex shader? 
*/
static inline bool
ir3_shader_key_changes_vs(struct ir3_shader_key *key,
                          struct ir3_shader_key *last_key)
{
   /* per-sampler state only matters if either key asks for it */
   if (last_key->has_per_samp || key->has_per_samp) {
      if ((last_key->vsamples != key->vsamples) ||
          (last_key->vastc_srgb != key->vastc_srgb))
         return true;
   }

   if (last_key->ucp_enables != key->ucp_enables)
      return true;

   if (last_key->safe_constlen != key->safe_constlen)
      return true;

   return false;
}

/**
 * On a4xx+a5xx, Images share state with textures and SSBOs:
 *
 *   + Uses texture (cat5) state/instruction (isam) to read
 *   + Uses SSBO state and instructions (cat6) to write and for atomics
 *
 * Starting with a6xx, Images and SSBOs are basically the same thing,
 * with texture state and isam also used for SSBO reads.
 *
 * On top of that, gallium makes the SSBO (shader_buffers) state semi
 * sparse, with the first half of the state space used for atomic
 * counters lowered to atomic buffers.  We could ignore this, but I
 * don't think we could *really* handle the case of a single shader
 * that used the max # of textures + images + SSBOs.  And once we are
 * offsetting images by num_ssbos (or vice versa) to map them into
 * the same hardware state, the hardware state has become coupled to
 * the shader state, so at this point we might as well just use a
 * mapping table to remap things from image/SSBO idx to hw idx.
 *
 * To make things less (more?)
confusing, for the hw "SSBO" state 424 * (since it is really both SSBO and Image) I'll use the name "IBO" 425 */ 426struct ir3_ibo_mapping { 427#define IBO_INVALID 0xff 428 /* Maps logical SSBO state to hw tex state: */ 429 uint8_t ssbo_to_tex[IR3_MAX_SHADER_BUFFERS]; 430 431 /* Maps logical Image state to hw tex state: */ 432 uint8_t image_to_tex[IR3_MAX_SHADER_IMAGES]; 433 434 /* Maps hw state back to logical SSBO or Image state: 435 * 436 * note IBO_SSBO ORd into values to indicate that the 437 * hw slot is used for SSBO state vs Image state. 438 */ 439#define IBO_SSBO 0x80 440 uint8_t tex_to_image[32]; 441 442 /* including real textures */ 443 uint8_t num_tex; 444 /* the number of real textures, ie. image/ssbo start here */ 445 uint8_t tex_base; 446}; 447 448struct ir3_disasm_info { 449 bool write_disasm; 450 char *nir; 451 char *disasm; 452}; 453 454/* Represents half register in regid */ 455#define HALF_REG_ID 0x100 456 457/** 458 * Shader variant which contains the actual hw shader instructions, 459 * and necessary info for shader state setup. 460 */ 461struct ir3_shader_variant { 462 struct fd_bo *bo; 463 464 /* variant id (for debug) */ 465 uint32_t id; 466 467 struct ir3_shader_key key; 468 469 /* vertex shaders can have an extra version for hwbinning pass, 470 * which is pointed to by so->binning: 471 */ 472 bool binning_pass; 473 // union { 474 struct ir3_shader_variant *binning; 475 struct ir3_shader_variant *nonbinning; 476 // }; 477 478 struct ir3 *ir; /* freed after assembling machine instructions */ 479 480 /* shader variants form a linked list: */ 481 struct ir3_shader_variant *next; 482 483 /* replicated here to avoid passing extra ptrs everywhere: */ 484 gl_shader_stage type; 485 struct ir3_shader *shader; 486 487 /* variant's copy of nir->constant_data (since we don't track the NIR in 488 * the variant, and shader->nir is before the opt pass). Moves to v->bin 489 * after assembly. 
490 */ 491 void *constant_data; 492 493 /* 494 * Below here is serialized when written to disk cache: 495 */ 496 497 /* The actual binary shader instructions, size given by info.sizedwords: */ 498 uint32_t *bin; 499 500 struct ir3_const_state *const_state; 501 502 /* 503 * The following macros are used by the shader disk cache save/ 504 * restore paths to serialize/deserialize the variant. Any 505 * pointers that require special handling in store_variant() 506 * and retrieve_variant() should go above here. 507 */ 508#define VARIANT_CACHE_START offsetof(struct ir3_shader_variant, info) 509#define VARIANT_CACHE_PTR(v) (((char *)v) + VARIANT_CACHE_START) 510#define VARIANT_CACHE_SIZE \ 511 (sizeof(struct ir3_shader_variant) - VARIANT_CACHE_START) 512 513 struct ir3_info info; 514 515 uint32_t constant_data_size; 516 517 /* Levels of nesting of flow control: 518 */ 519 unsigned branchstack; 520 521 unsigned max_sun; 522 unsigned loops; 523 524 /* the instructions length is in units of instruction groups 525 * (4 instructions for a3xx, 16 instructions for a4xx.. each 526 * instruction is 2 dwords): 527 */ 528 unsigned instrlen; 529 530 /* the constants length is in units of vec4's, and is the sum of 531 * the uniforms and the built-in compiler constants 532 */ 533 unsigned constlen; 534 535 /* The private memory size in bytes */ 536 unsigned pvtmem_size; 537 /* Whether we should use the new per-wave layout rather than per-fiber. */ 538 bool pvtmem_per_wave; 539 540 /* Size in bytes of required shared memory */ 541 unsigned shared_size; 542 543 /* About Linkage: 544 * + Let the frag shader determine the position/compmask for the 545 * varyings, since it is the place where we know if the varying 546 * is actually used, and if so, which components are used. So 547 * what the hw calls "outloc" is taken from the "inloc" of the 548 * frag shader. 
549 * + From the vert shader, we only need the output regid 550 */ 551 552 bool frag_face, color0_mrt; 553 uint8_t fragcoord_compmask; 554 555 /* NOTE: for input/outputs, slot is: 556 * gl_vert_attrib - for VS inputs 557 * gl_varying_slot - for VS output / FS input 558 * gl_frag_result - for FS output 559 */ 560 561 /* varyings/outputs: */ 562 unsigned outputs_count; 563 struct { 564 uint8_t slot; 565 uint8_t regid; 566 uint8_t view; 567 bool half : 1; 568 } outputs[32 + 2]; /* +POSITION +PSIZE */ 569 bool writes_pos, writes_smask, writes_psize, writes_stencilref; 570 571 /* Size in dwords of all outputs for VS, size of entire patch for HS. */ 572 uint32_t output_size; 573 574 /* Expected size of incoming output_loc for HS, DS, and GS */ 575 uint32_t input_size; 576 577 /* Map from location to offset in per-primitive storage. In dwords for 578 * HS, where varyings are read in the next stage via ldg with a dword 579 * offset, and in bytes for all other stages. 580 */ 581 unsigned output_loc[32 + 4]; /* +POSITION +PSIZE +CLIP_DIST0 +CLIP_DIST1 */ 582 583 /* attributes (VS) / varyings (FS): 584 * Note that sysval's should come *after* normal inputs. 585 */ 586 unsigned inputs_count; 587 struct { 588 uint8_t slot; 589 uint8_t regid; 590 uint8_t compmask; 591 /* location of input (ie. offset passed to bary.f, etc). This 592 * matches the SP_VS_VPC_DST_REG.OUTLOCn value (a3xx and a4xx 593 * have the OUTLOCn value offset by 8, presumably to account 594 * for gl_Position/gl_PointSize) 595 */ 596 uint8_t inloc; 597 /* vertex shader specific: */ 598 bool sysval : 1; /* slot is a gl_system_value */ 599 /* fragment shader specific: */ 600 bool bary : 1; /* fetched varying (vs one loaded into reg) */ 601 bool rasterflat : 1; /* special handling for emit->rasterflat */ 602 bool half : 1; 603 bool flat : 1; 604 } inputs[32 + 2]; /* +POSITION +FACE */ 605 606 /* sum of input components (scalar). 
For frag shaders, it only counts 607 * the varying inputs: 608 */ 609 unsigned total_in; 610 611 /* sum of sysval input components (scalar). */ 612 unsigned sysval_in; 613 614 /* For frag shaders, the total number of inputs (not scalar, 615 * ie. SP_VS_PARAM_REG.TOTALVSOUTVAR) 616 */ 617 unsigned varying_in; 618 619 /* Remapping table to map Image and SSBO to hw state: */ 620 struct ir3_ibo_mapping image_mapping; 621 622 /* number of samplers/textures (which are currently 1:1): */ 623 int num_samp; 624 625 /* is there an implicit sampler to read framebuffer (FS only).. if 626 * so the sampler-idx is 'num_samp - 1' (ie. it is appended after 627 * the last "real" texture) 628 */ 629 bool fb_read; 630 631 /* do we have one or more SSBO instructions: */ 632 bool has_ssbo; 633 634 /* Which bindless resources are used, for filling out sp_xs_config */ 635 bool bindless_tex; 636 bool bindless_samp; 637 bool bindless_ibo; 638 bool bindless_ubo; 639 640 /* do we need derivatives: */ 641 bool need_pixlod; 642 643 bool need_fine_derivatives; 644 645 /* do we need VS driver params? */ 646 bool need_driver_params; 647 648 /* do we have image write, etc (which prevents early-z): */ 649 bool no_earlyz; 650 651 /* do we have kill, which also prevents early-z, but not necessarily 652 * early-lrz (as long as lrz-write is disabled, which must be handled 653 * outside of ir3. Unlike other no_earlyz cases, kill doesn't have 654 * side effects that prevent early-lrz discard. 655 */ 656 bool has_kill; 657 658 bool per_samp; 659 660 /* Are we using split or merged register file? 
*/ 661 bool mergedregs; 662 663 uint8_t clip_mask, cull_mask; 664 665 /* for astc srgb workaround, the number/base of additional 666 * alpha tex states we need, and index of original tex states 667 */ 668 struct { 669 unsigned base, count; 670 unsigned orig_idx[16]; 671 } astc_srgb; 672 673 /* texture sampler pre-dispatches */ 674 uint32_t num_sampler_prefetch; 675 struct ir3_sampler_prefetch sampler_prefetch[IR3_MAX_SAMPLER_PREFETCH]; 676 677 uint16_t local_size[3]; 678 bool local_size_variable; 679 680 struct ir3_disasm_info disasm_info; 681}; 682 683static inline const char * 684ir3_shader_stage(struct ir3_shader_variant *v) 685{ 686 switch (v->type) { 687 case MESA_SHADER_VERTEX: 688 return v->binning_pass ? "BVERT" : "VERT"; 689 case MESA_SHADER_TESS_CTRL: 690 return "TCS"; 691 case MESA_SHADER_TESS_EVAL: 692 return "TES"; 693 case MESA_SHADER_GEOMETRY: 694 return "GEOM"; 695 case MESA_SHADER_FRAGMENT: 696 return "FRAG"; 697 case MESA_SHADER_COMPUTE: 698 return "CL"; 699 default: 700 unreachable("invalid type"); 701 return NULL; 702 } 703} 704 705/* Currently we do not do binning for tess. And for GS there is no 706 * cross-stage VS+GS optimization, so the full VS+GS is used in 707 * the binning pass. 708 */ 709static inline bool 710ir3_has_binning_vs(const struct ir3_shader_key *key) 711{ 712 if (key->tessellation || key->has_gs) 713 return false; 714 return true; 715} 716 717/** 718 * Represents a shader at the API level, before state-specific variants are 719 * generated. 720 */ 721struct ir3_shader { 722 gl_shader_stage type; 723 724 /* shader id (for debug): */ 725 uint32_t id; 726 uint32_t variant_count; 727 728 /* Set by freedreno after shader_state_create, so we can emit debug info 729 * when recompiling a shader at draw time. 
730 */ 731 bool initial_variants_done; 732 733 struct ir3_compiler *compiler; 734 735 unsigned num_reserved_user_consts; 736 737 bool nir_finalized; 738 struct nir_shader *nir; 739 struct ir3_stream_output_info stream_output; 740 741 struct ir3_shader_variant *variants; 742 mtx_t variants_lock; 743 744 cache_key cache_key; /* shader disk-cache key */ 745 746 /* Bitmask of bits of the shader key used by this shader. Used to avoid 747 * recompiles for GL NOS that doesn't actually apply to the shader. 748 */ 749 struct ir3_shader_key key_mask; 750}; 751 752/** 753 * In order to use the same cmdstream, in particular constlen setup and const 754 * emit, for both binning and draw pass (a6xx+), the binning pass re-uses it's 755 * corresponding draw pass shaders const_state. 756 */ 757static inline struct ir3_const_state * 758ir3_const_state(const struct ir3_shader_variant *v) 759{ 760 if (v->binning_pass) 761 return v->nonbinning->const_state; 762 return v->const_state; 763} 764 765/* Given a variant, calculate the maximum constlen it can have. 
766 */ 767 768static inline unsigned 769ir3_max_const(const struct ir3_shader_variant *v) 770{ 771 const struct ir3_compiler *compiler = v->shader->compiler; 772 773 if (v->shader->type == MESA_SHADER_COMPUTE) { 774 return compiler->max_const_compute; 775 } else if (v->key.safe_constlen) { 776 return compiler->max_const_safe; 777 } else if (v->shader->type == MESA_SHADER_FRAGMENT) { 778 return compiler->max_const_frag; 779 } else { 780 return compiler->max_const_geom; 781 } 782} 783 784void *ir3_shader_assemble(struct ir3_shader_variant *v); 785struct ir3_shader_variant * 786ir3_shader_get_variant(struct ir3_shader *shader, 787 const struct ir3_shader_key *key, bool binning_pass, 788 bool keep_ir, bool *created); 789struct ir3_shader * 790ir3_shader_from_nir(struct ir3_compiler *compiler, nir_shader *nir, 791 unsigned reserved_user_consts, 792 struct ir3_stream_output_info *stream_output); 793uint32_t ir3_trim_constlen(struct ir3_shader_variant **variants, 794 const struct ir3_compiler *compiler); 795void ir3_shader_destroy(struct ir3_shader *shader); 796void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out); 797uint64_t ir3_shader_outputs(const struct ir3_shader *so); 798 799int ir3_glsl_type_size(const struct glsl_type *type, bool bindless); 800 801/* 802 * Helper/util: 803 */ 804 805/* clears shader-key flags which don't apply to the given shader. 
806 */ 807static inline void 808ir3_key_clear_unused(struct ir3_shader_key *key, struct ir3_shader *shader) 809{ 810 uint32_t *key_bits = (uint32_t *)key; 811 uint32_t *key_mask = (uint32_t *)&shader->key_mask; 812 STATIC_ASSERT(sizeof(*key) % 4 == 0); 813 for (int i = 0; i < sizeof(*key) >> 2; i++) 814 key_bits[i] &= key_mask[i]; 815} 816 817static inline int 818ir3_find_output(const struct ir3_shader_variant *so, gl_varying_slot slot) 819{ 820 int j; 821 822 for (j = 0; j < so->outputs_count; j++) 823 if (so->outputs[j].slot == slot) 824 return j; 825 826 /* it seems optional to have a OUT.BCOLOR[n] for each OUT.COLOR[n] 827 * in the vertex shader.. but the fragment shader doesn't know this 828 * so it will always have both IN.COLOR[n] and IN.BCOLOR[n]. So 829 * at link time if there is no matching OUT.BCOLOR[n], we must map 830 * OUT.COLOR[n] to IN.BCOLOR[n]. And visa versa if there is only 831 * a OUT.BCOLOR[n] but no matching OUT.COLOR[n] 832 */ 833 if (slot == VARYING_SLOT_BFC0) { 834 slot = VARYING_SLOT_COL0; 835 } else if (slot == VARYING_SLOT_BFC1) { 836 slot = VARYING_SLOT_COL1; 837 } else if (slot == VARYING_SLOT_COL0) { 838 slot = VARYING_SLOT_BFC0; 839 } else if (slot == VARYING_SLOT_COL1) { 840 slot = VARYING_SLOT_BFC1; 841 } else { 842 return -1; 843 } 844 845 for (j = 0; j < so->outputs_count; j++) 846 if (so->outputs[j].slot == slot) 847 return j; 848 849 debug_assert(0); 850 851 return -1; 852} 853 854static inline int 855ir3_next_varying(const struct ir3_shader_variant *so, int i) 856{ 857 while (++i < so->inputs_count) 858 if (so->inputs[i].compmask && so->inputs[i].bary) 859 break; 860 return i; 861} 862 863struct ir3_shader_linkage { 864 /* Maximum location either consumed by the fragment shader or produced by 865 * the last geometry stage, i.e. the size required for each vertex in the 866 * VPC in DWORD's. 867 */ 868 uint8_t max_loc; 869 870 /* Number of entries in var. 
*/ 871 uint8_t cnt; 872 873 /* Bitset of locations used, including ones which are only used by the FS. 874 */ 875 uint32_t varmask[4]; 876 877 /* Map from VS output to location. */ 878 struct { 879 uint8_t regid; 880 uint8_t compmask; 881 uint8_t loc; 882 } var[32]; 883 884 /* location for fixed-function gl_PrimitiveID passthrough */ 885 uint8_t primid_loc; 886 887 /* location for fixed-function gl_ViewIndex passthrough */ 888 uint8_t viewid_loc; 889 890 /* location for combined clip/cull distance arrays */ 891 uint8_t clip0_loc, clip1_loc; 892}; 893 894static inline void 895ir3_link_add(struct ir3_shader_linkage *l, uint8_t regid_, uint8_t compmask, 896 uint8_t loc) 897{ 898 for (int j = 0; j < util_last_bit(compmask); j++) { 899 uint8_t comploc = loc + j; 900 l->varmask[comploc / 32] |= 1 << (comploc % 32); 901 } 902 903 l->max_loc = MAX2(l->max_loc, loc + util_last_bit(compmask)); 904 905 if (regid_ != regid(63, 0)) { 906 int i = l->cnt++; 907 debug_assert(i < ARRAY_SIZE(l->var)); 908 909 l->var[i].regid = regid_; 910 l->var[i].compmask = compmask; 911 l->var[i].loc = loc; 912 } 913} 914 915static inline void 916ir3_link_shaders(struct ir3_shader_linkage *l, 917 const struct ir3_shader_variant *vs, 918 const struct ir3_shader_variant *fs, bool pack_vs_out) 919{ 920 /* On older platforms, varmask isn't programmed at all, and it appears 921 * that the hardware generates a mask of used VPC locations using the VS 922 * output map, and hangs if a FS bary instruction references a location 923 * not in the list. This means that we need to have a dummy entry in the 924 * VS out map for things like gl_PointCoord which aren't written by the 925 * VS. Furthermore we can't use r63.x, so just pick a random register to 926 * use if there is no VS output. 927 */ 928 const unsigned default_regid = pack_vs_out ? 
regid(63, 0) : regid(0, 0); 929 int j = -1, k; 930 931 l->primid_loc = 0xff; 932 l->viewid_loc = 0xff; 933 l->clip0_loc = 0xff; 934 l->clip1_loc = 0xff; 935 936 while (l->cnt < ARRAY_SIZE(l->var)) { 937 j = ir3_next_varying(fs, j); 938 939 if (j >= fs->inputs_count) 940 break; 941 942 if (fs->inputs[j].inloc >= fs->total_in) 943 continue; 944 945 k = ir3_find_output(vs, fs->inputs[j].slot); 946 947 if (k < 0 && fs->inputs[j].slot == VARYING_SLOT_PRIMITIVE_ID) { 948 l->primid_loc = fs->inputs[j].inloc; 949 } 950 951 if (fs->inputs[j].slot == VARYING_SLOT_VIEW_INDEX) { 952 assert(k < 0); 953 l->viewid_loc = fs->inputs[j].inloc; 954 } 955 956 if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST0) 957 l->clip0_loc = fs->inputs[j].inloc; 958 959 if (fs->inputs[j].slot == VARYING_SLOT_CLIP_DIST1) 960 l->clip1_loc = fs->inputs[j].inloc; 961 962 ir3_link_add(l, k >= 0 ? vs->outputs[k].regid : default_regid, 963 fs->inputs[j].compmask, fs->inputs[j].inloc); 964 } 965} 966 967static inline uint32_t 968ir3_find_output_regid(const struct ir3_shader_variant *so, unsigned slot) 969{ 970 int j; 971 for (j = 0; j < so->outputs_count; j++) 972 if (so->outputs[j].slot == slot) { 973 uint32_t regid = so->outputs[j].regid; 974 if (so->outputs[j].half) 975 regid |= HALF_REG_ID; 976 return regid; 977 } 978 return regid(63, 0); 979} 980 981void ir3_link_stream_out(struct ir3_shader_linkage *l, 982 const struct ir3_shader_variant *v); 983 984#define VARYING_SLOT_GS_HEADER_IR3 (VARYING_SLOT_MAX + 0) 985#define VARYING_SLOT_GS_VERTEX_FLAGS_IR3 (VARYING_SLOT_MAX + 1) 986#define VARYING_SLOT_TCS_HEADER_IR3 (VARYING_SLOT_MAX + 2) 987#define VARYING_SLOT_REL_PATCH_ID_IR3 (VARYING_SLOT_MAX + 3) 988 989static inline uint32_t 990ir3_find_sysval_regid(const struct ir3_shader_variant *so, unsigned slot) 991{ 992 int j; 993 for (j = 0; j < so->inputs_count; j++) 994 if (so->inputs[j].sysval && (so->inputs[j].slot == slot)) 995 return so->inputs[j].regid; 996 return regid(63, 0); 997} 998 999/* calculate 
register footprint in terms of half-regs (ie. one full 1000 * reg counts as two half-regs). 1001 */ 1002static inline uint32_t 1003ir3_shader_halfregs(const struct ir3_shader_variant *v) 1004{ 1005 return (2 * (v->info.max_reg + 1)) + (v->info.max_half_reg + 1); 1006} 1007 1008static inline uint32_t 1009ir3_shader_nibo(const struct ir3_shader_variant *v) 1010{ 1011 /* The dummy variant used in binning mode won't have an actual shader. */ 1012 if (!v->shader) 1013 return 0; 1014 1015 return v->shader->nir->info.num_ssbos + v->shader->nir->info.num_images; 1016} 1017 1018static inline uint32_t 1019ir3_shader_branchstack_hw(const struct ir3_shader_variant *v) 1020{ 1021 /* Dummy shader */ 1022 if (!v->shader) 1023 return 0; 1024 1025 if (v->shader->compiler->gen < 5) 1026 return v->branchstack; 1027 1028 if (v->branchstack > 0) { 1029 uint32_t branchstack = v->branchstack / 2 + 1; 1030 return MIN2(branchstack, v->shader->compiler->branchstack_size / 2); 1031 } else { 1032 return 0; 1033 } 1034} 1035 1036#endif /* IR3_SHADER_H_ */ 1037