101e04c3fSmrg/* 201e04c3fSmrg * Copyright © 2011 Intel Corporation 301e04c3fSmrg * 401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 501e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 601e04c3fSmrg * to deal in the Software without restriction, including without limitation 701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 901e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1001e04c3fSmrg * 1101e04c3fSmrg * The above copyright notice and this permission notice (including the next 1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1301e04c3fSmrg * Software. 1401e04c3fSmrg * 1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 2101e04c3fSmrg * IN THE SOFTWARE. 2201e04c3fSmrg */ 2301e04c3fSmrg 2401e04c3fSmrg/** 2501e04c3fSmrg * @file brw_vue_map.c 2601e04c3fSmrg * 2701e04c3fSmrg * This file computes the "VUE map" for a (non-fragment) shader stage, which 2801e04c3fSmrg * describes the layout of its output varyings. The VUE map is used to match 2901e04c3fSmrg * outputs from one stage with the inputs of the next. 3001e04c3fSmrg * 3101e04c3fSmrg * Largely, varyings can be placed however we like - producers/consumers simply 3201e04c3fSmrg * have to agree on the layout. However, there is also a "VUE Header" that 3301e04c3fSmrg * prescribes a fixed-layout for items that interact with fixed function 3401e04c3fSmrg * hardware, such as the clipper and rasterizer. 3501e04c3fSmrg * 3601e04c3fSmrg * Authors: 3701e04c3fSmrg * Paul Berry <stereotype441@gmail.com> 3801e04c3fSmrg * Chris Forbes <chrisf@ijw.co.nz> 3901e04c3fSmrg * Eric Anholt <eric@anholt.net> 4001e04c3fSmrg */ 4101e04c3fSmrg 4201e04c3fSmrg 4301e04c3fSmrg#include "brw_compiler.h" 441463c08dSmrg#include "dev/intel_debug.h" 4501e04c3fSmrg 4601e04c3fSmrgstatic inline void 4701e04c3fSmrgassign_vue_slot(struct brw_vue_map *vue_map, int varying, int slot) 4801e04c3fSmrg{ 4901e04c3fSmrg /* Make sure this varying hasn't been assigned a slot already */ 5001e04c3fSmrg assert (vue_map->varying_to_slot[varying] == -1); 5101e04c3fSmrg 5201e04c3fSmrg vue_map->varying_to_slot[varying] = slot; 5301e04c3fSmrg vue_map->slot_to_varying[slot] = varying; 5401e04c3fSmrg} 5501e04c3fSmrg 5601e04c3fSmrg/** 5701e04c3fSmrg * Compute the VUE map for a shader stage. 5801e04c3fSmrg */ 5901e04c3fSmrgvoid 601463c08dSmrgbrw_compute_vue_map(const struct intel_device_info *devinfo, 6101e04c3fSmrg struct brw_vue_map *vue_map, 6201e04c3fSmrg uint64_t slots_valid, 631463c08dSmrg bool separate, 641463c08dSmrg uint32_t pos_slots) 6501e04c3fSmrg{ 6601e04c3fSmrg /* Keep using the packed/contiguous layout on old hardware - we only need 6701e04c3fSmrg * the SSO layout when using geometry/tessellation shaders or 32 FS input 6801e04c3fSmrg * varyings, which only exist on Gen >= 6. It's also a bit more efficient. 6901e04c3fSmrg */ 701463c08dSmrg if (devinfo->ver < 6) 7101e04c3fSmrg separate = false; 7201e04c3fSmrg 7301e04c3fSmrg if (separate) { 7401e04c3fSmrg /* In SSO mode, we don't know whether the adjacent stage will 7501e04c3fSmrg * read/write gl_ClipDistance, which has a fixed slot location. 7601e04c3fSmrg * We have to assume the worst and reserve a slot for it, or else 7701e04c3fSmrg * the rest of our varyings will be off by a slot. 7801e04c3fSmrg * 7901e04c3fSmrg * Note that we don't have to worry about COL/BFC, as those built-in 8001e04c3fSmrg * variables only exist in legacy GL, which only supports VS and FS. 8101e04c3fSmrg */ 8201e04c3fSmrg slots_valid |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0); 8301e04c3fSmrg slots_valid |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1); 8401e04c3fSmrg } 8501e04c3fSmrg 8601e04c3fSmrg vue_map->slots_valid = slots_valid; 8701e04c3fSmrg vue_map->separate = separate; 8801e04c3fSmrg 8901e04c3fSmrg /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they 9001e04c3fSmrg * are stored in the first VUE slot (VARYING_SLOT_PSIZ). 9101e04c3fSmrg */ 9201e04c3fSmrg slots_valid &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT); 9301e04c3fSmrg 9401e04c3fSmrg /* Make sure that the values we store in vue_map->varying_to_slot and 9501e04c3fSmrg * vue_map->slot_to_varying won't overflow the signed chars that are used 9601e04c3fSmrg * to store them. Note that since vue_map->slot_to_varying sometimes holds 9701e04c3fSmrg * values equal to BRW_VARYING_SLOT_COUNT, we need to ensure that 9801e04c3fSmrg * BRW_VARYING_SLOT_COUNT is <= 127, not 128. 9901e04c3fSmrg */ 10001e04c3fSmrg STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127); 10101e04c3fSmrg 10201e04c3fSmrg for (int i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) { 10301e04c3fSmrg vue_map->varying_to_slot[i] = -1; 10401e04c3fSmrg vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD; 10501e04c3fSmrg } 10601e04c3fSmrg 10701e04c3fSmrg int slot = 0; 10801e04c3fSmrg 10901e04c3fSmrg /* VUE header: format depends on chip generation and whether clipping is 11001e04c3fSmrg * enabled. 11101e04c3fSmrg * 11201e04c3fSmrg * See the Sandybridge PRM, Volume 2 Part 1, section 1.5.1 (page 30), 11301e04c3fSmrg * "Vertex URB Entry (VUE) Formats" which describes the VUE header layout. 11401e04c3fSmrg */ 1151463c08dSmrg if (devinfo->ver < 6) { 11601e04c3fSmrg /* There are 8 dwords in VUE header pre-Ironlake: 11701e04c3fSmrg * dword 0-3 is indices, point width, clip flags. 11801e04c3fSmrg * dword 4-7 is ndc position 11901e04c3fSmrg * dword 8-11 is the first vertex data. 12001e04c3fSmrg * 12101e04c3fSmrg * On Ironlake the VUE header is nominally 20 dwords, but the hardware 1221463c08dSmrg * will accept the same header layout as Gfx4 [and should be a bit faster] 12301e04c3fSmrg */ 12401e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_PSIZ, slot++); 12501e04c3fSmrg assign_vue_slot(vue_map, BRW_VARYING_SLOT_NDC, slot++); 12601e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_POS, slot++); 12701e04c3fSmrg } else { 12801e04c3fSmrg /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge: 12901e04c3fSmrg * dword 0-3 of the header is indices, point width, clip flags. 13001e04c3fSmrg * dword 4-7 is the 4D space position 13101e04c3fSmrg * dword 8-15 of the vertex header is the user clip distance if 13201e04c3fSmrg * enabled. 13301e04c3fSmrg * dword 8-11 or 16-19 is the first vertex element data we fill. 13401e04c3fSmrg */ 13501e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_PSIZ, slot++); 13601e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_POS, slot++); 1371463c08dSmrg 1381463c08dSmrg /* When using Primitive Replication, multiple slots are used for storing 1391463c08dSmrg * positions for each view. 1401463c08dSmrg */ 1411463c08dSmrg assert(pos_slots >= 1); 1421463c08dSmrg if (pos_slots > 1) { 1431463c08dSmrg for (int i = 1; i < pos_slots; i++) { 1441463c08dSmrg vue_map->slot_to_varying[slot++] = VARYING_SLOT_POS; 1451463c08dSmrg } 1461463c08dSmrg } 1471463c08dSmrg 14801e04c3fSmrg if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0)) 14901e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST0, slot++); 15001e04c3fSmrg if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1)) 15101e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST1, slot++); 15201e04c3fSmrg 1531463c08dSmrg /* Vertex URB Formats table says: "Vertex Header shall be padded at the 1541463c08dSmrg * end so that the header ends on a 32-byte boundary". 1551463c08dSmrg */ 1561463c08dSmrg slot += slot % 2; 1571463c08dSmrg 15801e04c3fSmrg /* front and back colors need to be consecutive so that we can use 15901e04c3fSmrg * ATTRIBUTE_SWIZZLE_INPUTATTR_FACING to swizzle them when doing 16001e04c3fSmrg * two-sided color. 16101e04c3fSmrg */ 16201e04c3fSmrg if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL0)) 16301e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_COL0, slot++); 16401e04c3fSmrg if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC0)) 16501e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_BFC0, slot++); 16601e04c3fSmrg if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL1)) 16701e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_COL1, slot++); 16801e04c3fSmrg if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC1)) 16901e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_BFC1, slot++); 17001e04c3fSmrg } 17101e04c3fSmrg 17201e04c3fSmrg /* The hardware doesn't care about the rest of the vertex outputs, so we 17301e04c3fSmrg * can assign them however we like. For normal programs, we simply assign 17401e04c3fSmrg * them contiguously. 17501e04c3fSmrg * 17601e04c3fSmrg * For separate shader pipelines, we first assign built-in varyings 17701e04c3fSmrg * contiguous slots. This works because ARB_separate_shader_objects 17801e04c3fSmrg * requires that all shaders have matching built-in varying interface 17901e04c3fSmrg * blocks. Next, we assign generic varyings based on their location 18001e04c3fSmrg * (either explicit or linker assigned). This guarantees a fixed layout. 18101e04c3fSmrg * 18201e04c3fSmrg * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX, 18301e04c3fSmrg * since it's encoded as the clip distances by emit_clip_distances(). 18401e04c3fSmrg * However, it may be output by transform feedback, and we'd rather not 18501e04c3fSmrg * recompute state when TF changes, so we just always include it. 18601e04c3fSmrg */ 18701e04c3fSmrg uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0); 18801e04c3fSmrg while (builtins != 0) { 18901e04c3fSmrg const int varying = ffsll(builtins) - 1; 19001e04c3fSmrg if (vue_map->varying_to_slot[varying] == -1) { 19101e04c3fSmrg assign_vue_slot(vue_map, varying, slot++); 19201e04c3fSmrg } 19301e04c3fSmrg builtins &= ~BITFIELD64_BIT(varying); 19401e04c3fSmrg } 19501e04c3fSmrg 19601e04c3fSmrg const int first_generic_slot = slot; 19701e04c3fSmrg uint64_t generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0); 19801e04c3fSmrg while (generics != 0) { 19901e04c3fSmrg const int varying = ffsll(generics) - 1; 20001e04c3fSmrg if (separate) { 20101e04c3fSmrg slot = first_generic_slot + varying - VARYING_SLOT_VAR0; 20201e04c3fSmrg } 20301e04c3fSmrg assign_vue_slot(vue_map, varying, slot++); 20401e04c3fSmrg generics &= ~BITFIELD64_BIT(varying); 20501e04c3fSmrg } 20601e04c3fSmrg 20701e04c3fSmrg vue_map->num_slots = slot; 20801e04c3fSmrg vue_map->num_per_vertex_slots = 0; 20901e04c3fSmrg vue_map->num_per_patch_slots = 0; 21001e04c3fSmrg} 21101e04c3fSmrg 21201e04c3fSmrg/** 21301e04c3fSmrg * Compute the VUE map for tessellation control shader outputs and 21401e04c3fSmrg * tessellation evaluation shader inputs. 21501e04c3fSmrg */ 21601e04c3fSmrgvoid 21701e04c3fSmrgbrw_compute_tess_vue_map(struct brw_vue_map *vue_map, 21801e04c3fSmrg uint64_t vertex_slots, 21901e04c3fSmrg uint32_t patch_slots) 22001e04c3fSmrg{ 22101e04c3fSmrg /* I don't think anything actually uses this... */ 22201e04c3fSmrg vue_map->slots_valid = vertex_slots; 22301e04c3fSmrg 22401e04c3fSmrg /* separate isn't really meaningful, but make sure it's initialized */ 22501e04c3fSmrg vue_map->separate = false; 22601e04c3fSmrg 22701e04c3fSmrg vertex_slots &= ~(VARYING_BIT_TESS_LEVEL_OUTER | 22801e04c3fSmrg VARYING_BIT_TESS_LEVEL_INNER); 22901e04c3fSmrg 23001e04c3fSmrg /* Make sure that the values we store in vue_map->varying_to_slot and 23101e04c3fSmrg * vue_map->slot_to_varying won't overflow the signed chars that are used 23201e04c3fSmrg * to store them. Note that since vue_map->slot_to_varying sometimes holds 23301e04c3fSmrg * values equal to VARYING_SLOT_TESS_MAX , we need to ensure that 23401e04c3fSmrg * VARYING_SLOT_TESS_MAX is <= 127, not 128. 23501e04c3fSmrg */ 23601e04c3fSmrg STATIC_ASSERT(VARYING_SLOT_TESS_MAX <= 127); 23701e04c3fSmrg 23801e04c3fSmrg for (int i = 0; i < VARYING_SLOT_TESS_MAX ; ++i) { 23901e04c3fSmrg vue_map->varying_to_slot[i] = -1; 24001e04c3fSmrg vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD; 24101e04c3fSmrg } 24201e04c3fSmrg 24301e04c3fSmrg int slot = 0; 24401e04c3fSmrg 24501e04c3fSmrg /* The first 8 DWords are reserved for the "Patch Header". 24601e04c3fSmrg * 24701e04c3fSmrg * VARYING_SLOT_TESS_LEVEL_OUTER / INNER live here, but the exact layout 24801e04c3fSmrg * depends on the domain type. They might not be in slots 0 and 1 as 24901e04c3fSmrg * described here, but pretending they're separate allows us to uniquely 25001e04c3fSmrg * identify them by distinct slot locations. 25101e04c3fSmrg */ 25201e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_INNER, slot++); 25301e04c3fSmrg assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_OUTER, slot++); 25401e04c3fSmrg 25501e04c3fSmrg /* first assign per-patch varyings */ 25601e04c3fSmrg while (patch_slots != 0) { 25701e04c3fSmrg const int varying = ffsll(patch_slots) - 1; 25801e04c3fSmrg if (vue_map->varying_to_slot[varying + VARYING_SLOT_PATCH0] == -1) { 25901e04c3fSmrg assign_vue_slot(vue_map, varying + VARYING_SLOT_PATCH0, slot++); 26001e04c3fSmrg } 26101e04c3fSmrg patch_slots &= ~BITFIELD64_BIT(varying); 26201e04c3fSmrg } 26301e04c3fSmrg 26401e04c3fSmrg /* apparently, including the patch header... */ 26501e04c3fSmrg vue_map->num_per_patch_slots = slot; 26601e04c3fSmrg 26701e04c3fSmrg /* then assign per-vertex varyings for each vertex in our patch */ 26801e04c3fSmrg while (vertex_slots != 0) { 26901e04c3fSmrg const int varying = ffsll(vertex_slots) - 1; 27001e04c3fSmrg if (vue_map->varying_to_slot[varying] == -1) { 27101e04c3fSmrg assign_vue_slot(vue_map, varying, slot++); 27201e04c3fSmrg } 27301e04c3fSmrg vertex_slots &= ~BITFIELD64_BIT(varying); 27401e04c3fSmrg } 27501e04c3fSmrg 27601e04c3fSmrg vue_map->num_per_vertex_slots = slot - vue_map->num_per_patch_slots; 27701e04c3fSmrg vue_map->num_slots = slot; 27801e04c3fSmrg} 27901e04c3fSmrg 28001e04c3fSmrgstatic const char * 2811463c08dSmrgvarying_name(brw_varying_slot slot, gl_shader_stage stage) 28201e04c3fSmrg{ 28301e04c3fSmrg assume(slot < BRW_VARYING_SLOT_COUNT); 28401e04c3fSmrg 28501e04c3fSmrg if (slot < VARYING_SLOT_MAX) 2861463c08dSmrg return gl_varying_slot_name_for_stage((gl_varying_slot)slot, stage); 28701e04c3fSmrg 28801e04c3fSmrg static const char *brw_names[] = { 28901e04c3fSmrg [BRW_VARYING_SLOT_NDC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_NDC", 29001e04c3fSmrg [BRW_VARYING_SLOT_PAD - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PAD", 29101e04c3fSmrg [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC", 29201e04c3fSmrg }; 29301e04c3fSmrg 29401e04c3fSmrg return brw_names[slot - VARYING_SLOT_MAX]; 29501e04c3fSmrg} 29601e04c3fSmrg 29701e04c3fSmrgvoid 2981463c08dSmrgbrw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map, 2991463c08dSmrg gl_shader_stage stage) 30001e04c3fSmrg{ 30101e04c3fSmrg if (vue_map->num_per_vertex_slots > 0 || vue_map->num_per_patch_slots > 0) { 30201e04c3fSmrg fprintf(fp, "PUE map (%d slots, %d/patch, %d/vertex, %s)\n", 30301e04c3fSmrg vue_map->num_slots, 30401e04c3fSmrg vue_map->num_per_patch_slots, 30501e04c3fSmrg vue_map->num_per_vertex_slots, 30601e04c3fSmrg vue_map->separate ? "SSO" : "non-SSO"); 30701e04c3fSmrg for (int i = 0; i < vue_map->num_slots; i++) { 30801e04c3fSmrg if (vue_map->slot_to_varying[i] >= VARYING_SLOT_PATCH0) { 30901e04c3fSmrg fprintf(fp, " [%d] VARYING_SLOT_PATCH%d\n", i, 31001e04c3fSmrg vue_map->slot_to_varying[i] - VARYING_SLOT_PATCH0); 31101e04c3fSmrg } else { 31201e04c3fSmrg fprintf(fp, " [%d] %s\n", i, 3131463c08dSmrg varying_name(vue_map->slot_to_varying[i], stage)); 31401e04c3fSmrg } 31501e04c3fSmrg } 31601e04c3fSmrg } else { 31701e04c3fSmrg fprintf(fp, "VUE map (%d slots, %s)\n", 31801e04c3fSmrg vue_map->num_slots, vue_map->separate ? "SSO" : "non-SSO"); 31901e04c3fSmrg for (int i = 0; i < vue_map->num_slots; i++) { 32001e04c3fSmrg fprintf(fp, " [%d] %s\n", i, 3211463c08dSmrg varying_name(vue_map->slot_to_varying[i], stage)); 32201e04c3fSmrg } 32301e04c3fSmrg } 32401e04c3fSmrg fprintf(fp, "\n"); 32501e04c3fSmrg} 326