1b8e80941Smrg/* 2b8e80941Smrg * © Copyright 2017-2018 Alyssa Rosenzweig 3b8e80941Smrg * © Copyright 2017-2018 Connor Abbott 4b8e80941Smrg * © Copyright 2017-2018 Lyude Paul 5b8e80941Smrg * 6b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 7b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 8b8e80941Smrg * to deal in the Software without restriction, including without limitation 9b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 11b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 12b8e80941Smrg * 13b8e80941Smrg * The above copyright notice and this permission notice (including the next 14b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 15b8e80941Smrg * Software. 16b8e80941Smrg * 17b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22b8e80941Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23b8e80941Smrg * SOFTWARE. 
24b8e80941Smrg * 25b8e80941Smrg */ 26b8e80941Smrg 27b8e80941Smrg#ifndef __PANFROST_JOB_H__ 28b8e80941Smrg#define __PANFROST_JOB_H__ 29b8e80941Smrg 30b8e80941Smrg#include <stdint.h> 31b8e80941Smrg#include <panfrost-misc.h> 32b8e80941Smrg 33b8e80941Smrg#define MALI_SHORT_PTR_BITS (sizeof(uintptr_t)*8) 34b8e80941Smrg 35b8e80941Smrg#define MALI_FBD_HIERARCHY_WEIGHTS 8 36b8e80941Smrg 37b8e80941Smrg#define MALI_PAYLOAD_SIZE 256 38b8e80941Smrg 39b8e80941Smrgtypedef u32 mali_jd_core_req; 40b8e80941Smrg 41b8e80941Smrgenum mali_job_type { 42b8e80941Smrg JOB_NOT_STARTED = 0, 43b8e80941Smrg JOB_TYPE_NULL = 1, 44b8e80941Smrg JOB_TYPE_SET_VALUE = 2, 45b8e80941Smrg JOB_TYPE_CACHE_FLUSH = 3, 46b8e80941Smrg JOB_TYPE_COMPUTE = 4, 47b8e80941Smrg JOB_TYPE_VERTEX = 5, 48b8e80941Smrg JOB_TYPE_GEOMETRY = 6, 49b8e80941Smrg JOB_TYPE_TILER = 7, 50b8e80941Smrg JOB_TYPE_FUSED = 8, 51b8e80941Smrg JOB_TYPE_FRAGMENT = 9, 52b8e80941Smrg}; 53b8e80941Smrg 54b8e80941Smrgenum mali_draw_mode { 55b8e80941Smrg MALI_DRAW_NONE = 0x0, 56b8e80941Smrg MALI_POINTS = 0x1, 57b8e80941Smrg MALI_LINES = 0x2, 58b8e80941Smrg MALI_LINE_STRIP = 0x4, 59b8e80941Smrg MALI_LINE_LOOP = 0x6, 60b8e80941Smrg MALI_TRIANGLES = 0x8, 61b8e80941Smrg MALI_TRIANGLE_STRIP = 0xA, 62b8e80941Smrg MALI_TRIANGLE_FAN = 0xC, 63b8e80941Smrg MALI_POLYGON = 0xD, 64b8e80941Smrg MALI_QUADS = 0xE, 65b8e80941Smrg MALI_QUAD_STRIP = 0xF, 66b8e80941Smrg 67b8e80941Smrg /* All other modes invalid */ 68b8e80941Smrg}; 69b8e80941Smrg 70b8e80941Smrg/* Applies to tiler_gl_enables */ 71b8e80941Smrg 72b8e80941Smrg 73b8e80941Smrg#define MALI_OCCLUSION_QUERY (1 << 3) 74b8e80941Smrg#define MALI_OCCLUSION_PRECISE (1 << 4) 75b8e80941Smrg 76b8e80941Smrg#define MALI_FRONT_FACE(v) (v << 5) 77b8e80941Smrg#define MALI_CCW (0) 78b8e80941Smrg#define MALI_CW (1) 79b8e80941Smrg 80b8e80941Smrg#define MALI_CULL_FACE_FRONT (1 << 6) 81b8e80941Smrg#define MALI_CULL_FACE_BACK (1 << 7) 82b8e80941Smrg 83b8e80941Smrg/* TODO: Might this actually be a finer bitfield? 
*/ 84b8e80941Smrg#define MALI_DEPTH_STENCIL_ENABLE 0x6400 85b8e80941Smrg 86b8e80941Smrg#define DS_ENABLE(field) \ 87b8e80941Smrg (field == MALI_DEPTH_STENCIL_ENABLE) \ 88b8e80941Smrg ? "MALI_DEPTH_STENCIL_ENABLE" \ 89b8e80941Smrg : (field == 0) ? "0" \ 90b8e80941Smrg : "0 /* XXX: Unknown, check hexdump */" 91b8e80941Smrg 92b8e80941Smrg/* Used in stencil and depth tests */ 93b8e80941Smrg 94b8e80941Smrgenum mali_func { 95b8e80941Smrg MALI_FUNC_NEVER = 0, 96b8e80941Smrg MALI_FUNC_LESS = 1, 97b8e80941Smrg MALI_FUNC_EQUAL = 2, 98b8e80941Smrg MALI_FUNC_LEQUAL = 3, 99b8e80941Smrg MALI_FUNC_GREATER = 4, 100b8e80941Smrg MALI_FUNC_NOTEQUAL = 5, 101b8e80941Smrg MALI_FUNC_GEQUAL = 6, 102b8e80941Smrg MALI_FUNC_ALWAYS = 7 103b8e80941Smrg}; 104b8e80941Smrg 105b8e80941Smrg/* Same OpenGL, but mixed up. Why? Because forget me, that's why! */ 106b8e80941Smrg 107b8e80941Smrgenum mali_alt_func { 108b8e80941Smrg MALI_ALT_FUNC_NEVER = 0, 109b8e80941Smrg MALI_ALT_FUNC_GREATER = 1, 110b8e80941Smrg MALI_ALT_FUNC_EQUAL = 2, 111b8e80941Smrg MALI_ALT_FUNC_GEQUAL = 3, 112b8e80941Smrg MALI_ALT_FUNC_LESS = 4, 113b8e80941Smrg MALI_ALT_FUNC_NOTEQUAL = 5, 114b8e80941Smrg MALI_ALT_FUNC_LEQUAL = 6, 115b8e80941Smrg MALI_ALT_FUNC_ALWAYS = 7 116b8e80941Smrg}; 117b8e80941Smrg 118b8e80941Smrg/* Flags apply to unknown2_3? 
*/ 119b8e80941Smrg 120b8e80941Smrg#define MALI_HAS_MSAA (1 << 0) 121b8e80941Smrg#define MALI_CAN_DISCARD (1 << 5) 122b8e80941Smrg 123b8e80941Smrg/* Applies on SFBD systems, specifying that programmable blending is in use */ 124b8e80941Smrg#define MALI_HAS_BLEND_SHADER (1 << 6) 125b8e80941Smrg 126b8e80941Smrg/* func is mali_func */ 127b8e80941Smrg#define MALI_DEPTH_FUNC(func) (func << 8) 128b8e80941Smrg#define MALI_GET_DEPTH_FUNC(flags) ((flags >> 8) & 0x7) 129b8e80941Smrg#define MALI_DEPTH_FUNC_MASK MALI_DEPTH_FUNC(0x7) 130b8e80941Smrg 131b8e80941Smrg#define MALI_DEPTH_TEST (1 << 11) 132b8e80941Smrg 133b8e80941Smrg/* Next flags to unknown2_4 */ 134b8e80941Smrg#define MALI_STENCIL_TEST (1 << 0) 135b8e80941Smrg 136b8e80941Smrg/* What?! */ 137b8e80941Smrg#define MALI_SAMPLE_ALPHA_TO_COVERAGE_NO_BLEND_SHADER (1 << 1) 138b8e80941Smrg 139b8e80941Smrg#define MALI_NO_DITHER (1 << 9) 140b8e80941Smrg#define MALI_DEPTH_RANGE_A (1 << 12) 141b8e80941Smrg#define MALI_DEPTH_RANGE_B (1 << 13) 142b8e80941Smrg#define MALI_NO_MSAA (1 << 14) 143b8e80941Smrg 144b8e80941Smrg/* Stencil test state is all encoded in a single u32, just with a lot of 145b8e80941Smrg * enums... 
*/ 146b8e80941Smrg 147b8e80941Smrgenum mali_stencil_op { 148b8e80941Smrg MALI_STENCIL_KEEP = 0, 149b8e80941Smrg MALI_STENCIL_REPLACE = 1, 150b8e80941Smrg MALI_STENCIL_ZERO = 2, 151b8e80941Smrg MALI_STENCIL_INVERT = 3, 152b8e80941Smrg MALI_STENCIL_INCR_WRAP = 4, 153b8e80941Smrg MALI_STENCIL_DECR_WRAP = 5, 154b8e80941Smrg MALI_STENCIL_INCR = 6, 155b8e80941Smrg MALI_STENCIL_DECR = 7 156b8e80941Smrg}; 157b8e80941Smrg 158b8e80941Smrgstruct mali_stencil_test { 159b8e80941Smrg unsigned ref : 8; 160b8e80941Smrg unsigned mask : 8; 161b8e80941Smrg enum mali_func func : 3; 162b8e80941Smrg enum mali_stencil_op sfail : 3; 163b8e80941Smrg enum mali_stencil_op dpfail : 3; 164b8e80941Smrg enum mali_stencil_op dppass : 3; 165b8e80941Smrg unsigned zero : 4; 166b8e80941Smrg} __attribute__((packed)); 167b8e80941Smrg 168b8e80941Smrg/* Blending is a mess, since anything fancy triggers a blend shader, and 169b8e80941Smrg * -those- are not understood whatsover yet */ 170b8e80941Smrg 171b8e80941Smrg#define MALI_MASK_R (1 << 0) 172b8e80941Smrg#define MALI_MASK_G (1 << 1) 173b8e80941Smrg#define MALI_MASK_B (1 << 2) 174b8e80941Smrg#define MALI_MASK_A (1 << 3) 175b8e80941Smrg 176b8e80941Smrgenum mali_nondominant_mode { 177b8e80941Smrg MALI_BLEND_NON_MIRROR = 0, 178b8e80941Smrg MALI_BLEND_NON_ZERO = 1 179b8e80941Smrg}; 180b8e80941Smrg 181b8e80941Smrgenum mali_dominant_blend { 182b8e80941Smrg MALI_BLEND_DOM_SOURCE = 0, 183b8e80941Smrg MALI_BLEND_DOM_DESTINATION = 1 184b8e80941Smrg}; 185b8e80941Smrg 186b8e80941Smrgenum mali_dominant_factor { 187b8e80941Smrg MALI_DOMINANT_UNK0 = 0, 188b8e80941Smrg MALI_DOMINANT_ZERO = 1, 189b8e80941Smrg MALI_DOMINANT_SRC_COLOR = 2, 190b8e80941Smrg MALI_DOMINANT_DST_COLOR = 3, 191b8e80941Smrg MALI_DOMINANT_UNK4 = 4, 192b8e80941Smrg MALI_DOMINANT_SRC_ALPHA = 5, 193b8e80941Smrg MALI_DOMINANT_DST_ALPHA = 6, 194b8e80941Smrg MALI_DOMINANT_CONSTANT = 7, 195b8e80941Smrg}; 196b8e80941Smrg 197b8e80941Smrgenum mali_blend_modifier { 198b8e80941Smrg MALI_BLEND_MOD_UNK0 = 0, 
199b8e80941Smrg MALI_BLEND_MOD_NORMAL = 1, 200b8e80941Smrg MALI_BLEND_MOD_SOURCE_ONE = 2, 201b8e80941Smrg MALI_BLEND_MOD_DEST_ONE = 3, 202b8e80941Smrg}; 203b8e80941Smrg 204b8e80941Smrgstruct mali_blend_mode { 205b8e80941Smrg enum mali_blend_modifier clip_modifier : 2; 206b8e80941Smrg unsigned unused_0 : 1; 207b8e80941Smrg unsigned negate_source : 1; 208b8e80941Smrg 209b8e80941Smrg enum mali_dominant_blend dominant : 1; 210b8e80941Smrg 211b8e80941Smrg enum mali_nondominant_mode nondominant_mode : 1; 212b8e80941Smrg 213b8e80941Smrg unsigned unused_1 : 1; 214b8e80941Smrg 215b8e80941Smrg unsigned negate_dest : 1; 216b8e80941Smrg 217b8e80941Smrg enum mali_dominant_factor dominant_factor : 3; 218b8e80941Smrg unsigned complement_dominant : 1; 219b8e80941Smrg} __attribute__((packed)); 220b8e80941Smrg 221b8e80941Smrgstruct mali_blend_equation { 222b8e80941Smrg /* Of type mali_blend_mode */ 223b8e80941Smrg unsigned rgb_mode : 12; 224b8e80941Smrg unsigned alpha_mode : 12; 225b8e80941Smrg 226b8e80941Smrg unsigned zero1 : 4; 227b8e80941Smrg 228b8e80941Smrg /* Corresponds to MALI_MASK_* above and glColorMask arguments */ 229b8e80941Smrg 230b8e80941Smrg unsigned color_mask : 4; 231b8e80941Smrg 232b8e80941Smrg /* Attached constant for CONSTANT_ALPHA, etc */ 233b8e80941Smrg 234b8e80941Smrg#ifndef BIFROST 235b8e80941Smrg float constant; 236b8e80941Smrg#endif 237b8e80941Smrg} __attribute__((packed)); 238b8e80941Smrg 239b8e80941Smrg/* Used with channel swizzling */ 240b8e80941Smrgenum mali_channel { 241b8e80941Smrg MALI_CHANNEL_RED = 0, 242b8e80941Smrg MALI_CHANNEL_GREEN = 1, 243b8e80941Smrg MALI_CHANNEL_BLUE = 2, 244b8e80941Smrg MALI_CHANNEL_ALPHA = 3, 245b8e80941Smrg MALI_CHANNEL_ZERO = 4, 246b8e80941Smrg MALI_CHANNEL_ONE = 5, 247b8e80941Smrg MALI_CHANNEL_RESERVED_0 = 6, 248b8e80941Smrg MALI_CHANNEL_RESERVED_1 = 7, 249b8e80941Smrg}; 250b8e80941Smrg 251b8e80941Smrgstruct mali_channel_swizzle { 252b8e80941Smrg enum mali_channel r : 3; 253b8e80941Smrg enum mali_channel g : 3; 
254b8e80941Smrg enum mali_channel b : 3; 255b8e80941Smrg enum mali_channel a : 3; 256b8e80941Smrg} __attribute__((packed)); 257b8e80941Smrg 258b8e80941Smrg/* Compressed per-pixel formats. Each of these formats expands to one to four 259b8e80941Smrg * floating-point or integer numbers, as defined by the OpenGL specification. 260b8e80941Smrg * There are various places in OpenGL where the user can specify a compressed 261b8e80941Smrg * format in memory, which all use the same 8-bit enum in the various 262b8e80941Smrg * descriptors, although different hardware units support different formats. 263b8e80941Smrg */ 264b8e80941Smrg 265b8e80941Smrg/* The top 3 bits specify how the bits of each component are interpreted. */ 266b8e80941Smrg 267b8e80941Smrg/* e.g. R11F_G11F_B10F */ 268b8e80941Smrg#define MALI_FORMAT_SPECIAL (2 << 5) 269b8e80941Smrg 270b8e80941Smrg/* signed normalized, e.g. RGBA8_SNORM */ 271b8e80941Smrg#define MALI_FORMAT_SNORM (3 << 5) 272b8e80941Smrg 273b8e80941Smrg/* e.g. RGBA8UI */ 274b8e80941Smrg#define MALI_FORMAT_UINT (4 << 5) 275b8e80941Smrg 276b8e80941Smrg/* e.g. RGBA8 and RGBA32F */ 277b8e80941Smrg#define MALI_FORMAT_UNORM (5 << 5) 278b8e80941Smrg 279b8e80941Smrg/* e.g. RGBA8I and RGBA16F */ 280b8e80941Smrg#define MALI_FORMAT_SINT (6 << 5) 281b8e80941Smrg 282b8e80941Smrg/* These formats seem to largely duplicate the others. They're used at least 283b8e80941Smrg * for Bifrost framebuffer output. 284b8e80941Smrg */ 285b8e80941Smrg#define MALI_FORMAT_SPECIAL2 (7 << 5) 286b8e80941Smrg 287b8e80941Smrg/* If the high 3 bits are 3 to 6 these two bits say how many components 288b8e80941Smrg * there are. 289b8e80941Smrg */ 290b8e80941Smrg#define MALI_NR_CHANNELS(n) ((n - 1) << 3) 291b8e80941Smrg 292b8e80941Smrg/* If the high 3 bits are 3 to 6, then the low 3 bits say how big each 293b8e80941Smrg * component is, except the special MALI_CHANNEL_FLOAT which overrides what the 294b8e80941Smrg * bits mean. 
295b8e80941Smrg */ 296b8e80941Smrg 297b8e80941Smrg#define MALI_CHANNEL_4 2 298b8e80941Smrg 299b8e80941Smrg#define MALI_CHANNEL_8 3 300b8e80941Smrg 301b8e80941Smrg#define MALI_CHANNEL_16 4 302b8e80941Smrg 303b8e80941Smrg#define MALI_CHANNEL_32 5 304b8e80941Smrg 305b8e80941Smrg/* For MALI_FORMAT_SINT it means a half-float (e.g. RG16F). For 306b8e80941Smrg * MALI_FORMAT_UNORM, it means a 32-bit float. 307b8e80941Smrg */ 308b8e80941Smrg#define MALI_CHANNEL_FLOAT 7 309b8e80941Smrg 310b8e80941Smrgenum mali_format { 311b8e80941Smrg MALI_RGB565 = MALI_FORMAT_SPECIAL | 0x0, 312b8e80941Smrg MALI_RGB5_A1_UNORM = MALI_FORMAT_SPECIAL | 0x2, 313b8e80941Smrg MALI_RGB10_A2_UNORM = MALI_FORMAT_SPECIAL | 0x3, 314b8e80941Smrg MALI_RGB10_A2_SNORM = MALI_FORMAT_SPECIAL | 0x5, 315b8e80941Smrg MALI_RGB10_A2UI = MALI_FORMAT_SPECIAL | 0x7, 316b8e80941Smrg MALI_RGB10_A2I = MALI_FORMAT_SPECIAL | 0x9, 317b8e80941Smrg 318b8e80941Smrg /* YUV formats */ 319b8e80941Smrg MALI_NV12 = MALI_FORMAT_SPECIAL | 0xc, 320b8e80941Smrg 321b8e80941Smrg MALI_Z32_UNORM = MALI_FORMAT_SPECIAL | 0xD, 322b8e80941Smrg MALI_R32_FIXED = MALI_FORMAT_SPECIAL | 0x11, 323b8e80941Smrg MALI_RG32_FIXED = MALI_FORMAT_SPECIAL | 0x12, 324b8e80941Smrg MALI_RGB32_FIXED = MALI_FORMAT_SPECIAL | 0x13, 325b8e80941Smrg MALI_RGBA32_FIXED = MALI_FORMAT_SPECIAL | 0x14, 326b8e80941Smrg MALI_R11F_G11F_B10F = MALI_FORMAT_SPECIAL | 0x19, 327b8e80941Smrg /* Only used for varyings, to indicate the transformed gl_Position */ 328b8e80941Smrg MALI_VARYING_POS = MALI_FORMAT_SPECIAL | 0x1e, 329b8e80941Smrg /* Only used for varyings, to indicate that the write should be 330b8e80941Smrg * discarded. 
331b8e80941Smrg */ 332b8e80941Smrg MALI_VARYING_DISCARD = MALI_FORMAT_SPECIAL | 0x1f, 333b8e80941Smrg 334b8e80941Smrg MALI_R8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, 335b8e80941Smrg MALI_R16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, 336b8e80941Smrg MALI_R32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, 337b8e80941Smrg MALI_RG8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, 338b8e80941Smrg MALI_RG16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, 339b8e80941Smrg MALI_RG32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, 340b8e80941Smrg MALI_RGB8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8, 341b8e80941Smrg MALI_RGB16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, 342b8e80941Smrg MALI_RGB32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32, 343b8e80941Smrg MALI_RGBA8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, 344b8e80941Smrg MALI_RGBA16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, 345b8e80941Smrg MALI_RGBA32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, 346b8e80941Smrg 347b8e80941Smrg MALI_R8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, 348b8e80941Smrg MALI_R16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, 349b8e80941Smrg MALI_R32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, 350b8e80941Smrg MALI_RG8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, 351b8e80941Smrg MALI_RG16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, 352b8e80941Smrg MALI_RG32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, 353b8e80941Smrg MALI_RGB8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8, 354b8e80941Smrg MALI_RGB16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, 355b8e80941Smrg MALI_RGB32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) 
| MALI_CHANNEL_32, 356b8e80941Smrg MALI_RGBA8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, 357b8e80941Smrg MALI_RGBA16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, 358b8e80941Smrg MALI_RGBA32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, 359b8e80941Smrg 360b8e80941Smrg MALI_R8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, 361b8e80941Smrg MALI_R16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, 362b8e80941Smrg MALI_R32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, 363b8e80941Smrg MALI_R32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT, 364b8e80941Smrg MALI_RG8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, 365b8e80941Smrg MALI_RG16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, 366b8e80941Smrg MALI_RG32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, 367b8e80941Smrg MALI_RG32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT, 368b8e80941Smrg MALI_RGB8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8, 369b8e80941Smrg MALI_RGB16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, 370b8e80941Smrg MALI_RGB32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32, 371b8e80941Smrg MALI_RGB32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT, 372b8e80941Smrg MALI_RGBA4_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_4, 373b8e80941Smrg MALI_RGBA8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, 374b8e80941Smrg MALI_RGBA16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, 375b8e80941Smrg MALI_RGBA32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, 376b8e80941Smrg MALI_RGBA32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT, 377b8e80941Smrg 378b8e80941Smrg MALI_R8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, 379b8e80941Smrg 
MALI_R16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, 380b8e80941Smrg MALI_R32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, 381b8e80941Smrg MALI_R16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT, 382b8e80941Smrg MALI_RG8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, 383b8e80941Smrg MALI_RG16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, 384b8e80941Smrg MALI_RG32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, 385b8e80941Smrg MALI_RG16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT, 386b8e80941Smrg MALI_RGB8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8, 387b8e80941Smrg MALI_RGB16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, 388b8e80941Smrg MALI_RGB32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32, 389b8e80941Smrg MALI_RGB16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT, 390b8e80941Smrg MALI_RGBA8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, 391b8e80941Smrg MALI_RGBA16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, 392b8e80941Smrg MALI_RGBA32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, 393b8e80941Smrg MALI_RGBA16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT, 394b8e80941Smrg 395b8e80941Smrg MALI_RGBA4 = MALI_FORMAT_SPECIAL2 | 0x8, 396b8e80941Smrg MALI_RGBA8_2 = MALI_FORMAT_SPECIAL2 | 0xd, 397b8e80941Smrg MALI_RGB10_A2_2 = MALI_FORMAT_SPECIAL2 | 0xe, 398b8e80941Smrg}; 399b8e80941Smrg 400b8e80941Smrg 401b8e80941Smrg/* Alpha coverage is encoded as 4-bits (from a clampf), with inversion 402b8e80941Smrg * literally performing a bitwise invert. This function produces slightly wrong 403b8e80941Smrg * results and I'm not sure why; some rounding issue I suppose... 
*/ 404b8e80941Smrg 405b8e80941Smrg#define MALI_ALPHA_COVERAGE(clampf) ((uint16_t) (int) (clampf * 15.0f)) 406b8e80941Smrg#define MALI_GET_ALPHA_COVERAGE(nibble) ((float) nibble / 15.0f) 407b8e80941Smrg 408b8e80941Smrg/* Applies to unknown1 */ 409b8e80941Smrg#define MALI_NO_ALPHA_TO_COVERAGE (1 << 10) 410b8e80941Smrg 411b8e80941Smrg/* Flags denoting the fragment shader's use of tilebuffer readback. If the 412b8e80941Smrg * shader might read any part of the tilebuffer, set MALI_READS_TILEBUFFER. If 413b8e80941Smrg * it might read depth/stencil in particular, also set MALI_READS_ZS */ 414b8e80941Smrg 415b8e80941Smrg#define MALI_READS_ZS (1 << 12) 416b8e80941Smrg#define MALI_READS_TILEBUFFER (1 << 16) 417b8e80941Smrg 418b8e80941Smrgstruct mali_blend_meta { 419b8e80941Smrg#ifndef BIFROST 420b8e80941Smrg /* Base value of 0x200. 421b8e80941Smrg * OR with 0x1 for blending (anything other than REPLACE). 422b8e80941Smrg * OR with 0x2 for programmable blending 423b8e80941Smrg */ 424b8e80941Smrg 425b8e80941Smrg u64 unk1; 426b8e80941Smrg 427b8e80941Smrg union { 428b8e80941Smrg struct mali_blend_equation blend_equation_1; 429b8e80941Smrg mali_ptr blend_shader; 430b8e80941Smrg }; 431b8e80941Smrg 432b8e80941Smrg u64 zero2; 433b8e80941Smrg struct mali_blend_equation blend_equation_2; 434b8e80941Smrg#else 435b8e80941Smrg u32 unk1; // = 0x200 436b8e80941Smrg struct mali_blend_equation blend_equation; 437b8e80941Smrg /* 438b8e80941Smrg * - 0x19 normally 439b8e80941Smrg * - 0x3 when this slot is unused (everything else is 0 except the index) 440b8e80941Smrg * - 0x11 when this is the fourth slot (and it's used) 441b8e80941Smrg+ * - 0 when there is a blend shader 442b8e80941Smrg */ 443b8e80941Smrg u16 unk2; 444b8e80941Smrg /* increments from 0 to 3 */ 445b8e80941Smrg u16 index; 446b8e80941Smrg 447b8e80941Smrg union { 448b8e80941Smrg struct { 449b8e80941Smrg /* So far, I've only seen: 450b8e80941Smrg * - R001 for 1-component formats 451b8e80941Smrg * - RG01 for 2-component formats 
452b8e80941Smrg * - RGB1 for 3-component formats 453b8e80941Smrg * - RGBA for 4-component formats 454b8e80941Smrg */ 455b8e80941Smrg u32 swizzle : 12; 456b8e80941Smrg enum mali_format format : 8; 457b8e80941Smrg 458b8e80941Smrg /* Type of the shader output variable. Note, this can 459b8e80941Smrg * be different from the format. 460b8e80941Smrg * 461b8e80941Smrg * 0: f16 (mediump float) 462b8e80941Smrg * 1: f32 (highp float) 463b8e80941Smrg * 2: i32 (highp int) 464b8e80941Smrg * 3: u32 (highp uint) 465b8e80941Smrg * 4: i16 (mediump int) 466b8e80941Smrg * 5: u16 (mediump uint) 467b8e80941Smrg */ 468b8e80941Smrg u32 shader_type : 3; 469b8e80941Smrg u32 zero : 9; 470b8e80941Smrg }; 471b8e80941Smrg 472b8e80941Smrg /* Only the low 32 bits of the blend shader are stored, the 473b8e80941Smrg * high 32 bits are implicitly the same as the original shader. 474b8e80941Smrg * According to the kernel driver, the program counter for 475b8e80941Smrg * shaders is actually only 24 bits, so shaders cannot cross 476b8e80941Smrg * the 2^24-byte boundary, and neither can the blend shader. 477b8e80941Smrg * The blob handles this by allocating a 2^24 byte pool for 478b8e80941Smrg * shaders, and making sure that any blend shaders are stored 479b8e80941Smrg * in the same pool as the original shader. The kernel will 480b8e80941Smrg * make sure this allocation is aligned to 2^24 bytes. 
481b8e80941Smrg */ 482b8e80941Smrg u32 blend_shader; 483b8e80941Smrg }; 484b8e80941Smrg#endif 485b8e80941Smrg} __attribute__((packed)); 486b8e80941Smrg 487b8e80941Smrgstruct mali_shader_meta { 488b8e80941Smrg mali_ptr shader; 489b8e80941Smrg u16 texture_count; 490b8e80941Smrg u16 sampler_count; 491b8e80941Smrg u16 attribute_count; 492b8e80941Smrg u16 varying_count; 493b8e80941Smrg 494b8e80941Smrg union { 495b8e80941Smrg struct { 496b8e80941Smrg u32 uniform_buffer_count : 4; 497b8e80941Smrg u32 unk1 : 28; // = 0x800000 for vertex, 0x958020 for tiler 498b8e80941Smrg } bifrost1; 499b8e80941Smrg struct { 500b8e80941Smrg /* 0x200 except MALI_NO_ALPHA_TO_COVERAGE. Mysterious 1 501b8e80941Smrg * other times. Who knows really? */ 502b8e80941Smrg u16 unknown1; 503b8e80941Smrg 504b8e80941Smrg /* Whole number of uniform registers used, times two; 505b8e80941Smrg * whole number of work registers used (no scale). 506b8e80941Smrg */ 507b8e80941Smrg unsigned work_count : 5; 508b8e80941Smrg unsigned uniform_count : 5; 509b8e80941Smrg unsigned unknown2 : 6; 510b8e80941Smrg } midgard1; 511b8e80941Smrg }; 512b8e80941Smrg 513b8e80941Smrg /* On bifrost: Exactly the same as glPolygonOffset() for both. 514b8e80941Smrg * On midgard: Depth factor is exactly as passed to glPolygonOffset. 
515b8e80941Smrg * Depth units is equal to the value passed to glDeptOhffset + 1.0f 516b8e80941Smrg * (use MALI_NEGATIVE) 517b8e80941Smrg */ 518b8e80941Smrg float depth_units; 519b8e80941Smrg float depth_factor; 520b8e80941Smrg 521b8e80941Smrg u32 unknown2_2; 522b8e80941Smrg 523b8e80941Smrg u16 alpha_coverage; 524b8e80941Smrg u16 unknown2_3; 525b8e80941Smrg 526b8e80941Smrg u8 stencil_mask_front; 527b8e80941Smrg u8 stencil_mask_back; 528b8e80941Smrg u16 unknown2_4; 529b8e80941Smrg 530b8e80941Smrg struct mali_stencil_test stencil_front; 531b8e80941Smrg struct mali_stencil_test stencil_back; 532b8e80941Smrg 533b8e80941Smrg union { 534b8e80941Smrg struct { 535b8e80941Smrg u32 unk3 : 7; 536b8e80941Smrg /* On Bifrost, some system values are preloaded in 537b8e80941Smrg * registers R55-R62 by the thread dispatcher prior to 538b8e80941Smrg * the start of shader execution. This is a bitfield 539b8e80941Smrg * with one entry for each register saying which 540b8e80941Smrg * registers need to be preloaded. Right now, the known 541b8e80941Smrg * values are: 542b8e80941Smrg * 543b8e80941Smrg * Vertex/compute: 544b8e80941Smrg * - R55 : gl_LocalInvocationID.xy 545b8e80941Smrg * - R56 : gl_LocalInvocationID.z + unknown in high 16 bits 546b8e80941Smrg * - R57 : gl_WorkGroupID.x 547b8e80941Smrg * - R58 : gl_WorkGroupID.y 548b8e80941Smrg * - R59 : gl_WorkGroupID.z 549b8e80941Smrg * - R60 : gl_GlobalInvocationID.x 550b8e80941Smrg * - R61 : gl_GlobalInvocationID.y/gl_VertexID (without base) 551b8e80941Smrg * - R62 : gl_GlobalInvocationID.z/gl_InstanceID (without base) 552b8e80941Smrg * 553b8e80941Smrg * Fragment: 554b8e80941Smrg * - R55 : unknown, never seen (but the bit for this is 555b8e80941Smrg * always set?) 
556b8e80941Smrg * - R56 : unknown (bit always unset) 557b8e80941Smrg * - R57 : gl_PrimitiveID 558b8e80941Smrg * - R58 : gl_FrontFacing in low bit, potentially other stuff 559b8e80941Smrg * - R59 : u16 fragment coordinates (used to compute 560b8e80941Smrg * gl_FragCoord.xy, together with sample positions) 561b8e80941Smrg * - R60 : gl_SampleMask (used in epilog, so pretty 562b8e80941Smrg * much always used, but the bit is always 0 -- is 563b8e80941Smrg * this just always pushed?) 564b8e80941Smrg * - R61 : gl_SampleMaskIn and gl_SampleID, used by 565b8e80941Smrg * varying interpolation. 566b8e80941Smrg * - R62 : unknown (bit always unset). 567b8e80941Smrg */ 568b8e80941Smrg u32 preload_regs : 8; 569b8e80941Smrg /* In units of 8 bytes or 64 bits, since the 570b8e80941Smrg * uniform/const port loads 64 bits at a time. 571b8e80941Smrg */ 572b8e80941Smrg u32 uniform_count : 7; 573b8e80941Smrg u32 unk4 : 10; // = 2 574b8e80941Smrg } bifrost2; 575b8e80941Smrg struct { 576b8e80941Smrg u32 unknown2_7; 577b8e80941Smrg } midgard2; 578b8e80941Smrg }; 579b8e80941Smrg 580b8e80941Smrg /* zero on bifrost */ 581b8e80941Smrg u32 unknown2_8; 582b8e80941Smrg 583b8e80941Smrg /* Blending information for the older non-MRT Midgard HW. Check for 584b8e80941Smrg * MALI_HAS_BLEND_SHADER to decide how to interpret. 585b8e80941Smrg */ 586b8e80941Smrg 587b8e80941Smrg union { 588b8e80941Smrg mali_ptr blend_shader; 589b8e80941Smrg struct mali_blend_equation blend_equation; 590b8e80941Smrg }; 591b8e80941Smrg 592b8e80941Smrg /* There can be up to 4 blend_meta's. None of them are required for 593b8e80941Smrg * vertex shaders or the non-MRT case for Midgard (so the blob doesn't 594b8e80941Smrg * allocate any space). 
595b8e80941Smrg */ 596b8e80941Smrg struct mali_blend_meta blend_meta[]; 597b8e80941Smrg 598b8e80941Smrg} __attribute__((packed)); 599b8e80941Smrg 600b8e80941Smrg/* This only concerns hardware jobs */ 601b8e80941Smrg 602b8e80941Smrg/* Possible values for job_descriptor_size */ 603b8e80941Smrg 604b8e80941Smrg#define MALI_JOB_32 0 605b8e80941Smrg#define MALI_JOB_64 1 606b8e80941Smrg 607b8e80941Smrgstruct mali_job_descriptor_header { 608b8e80941Smrg u32 exception_status; 609b8e80941Smrg u32 first_incomplete_task; 610b8e80941Smrg u64 fault_pointer; 611b8e80941Smrg u8 job_descriptor_size : 1; 612b8e80941Smrg enum mali_job_type job_type : 7; 613b8e80941Smrg u8 job_barrier : 1; 614b8e80941Smrg u8 unknown_flags : 7; 615b8e80941Smrg u16 job_index; 616b8e80941Smrg u16 job_dependency_index_1; 617b8e80941Smrg u16 job_dependency_index_2; 618b8e80941Smrg 619b8e80941Smrg union { 620b8e80941Smrg u64 next_job_64; 621b8e80941Smrg u32 next_job_32; 622b8e80941Smrg }; 623b8e80941Smrg} __attribute__((packed)); 624b8e80941Smrg 625b8e80941Smrgstruct mali_payload_set_value { 626b8e80941Smrg u64 out; 627b8e80941Smrg u64 unknown; 628b8e80941Smrg} __attribute__((packed)); 629b8e80941Smrg 630b8e80941Smrg/* Special attributes have a fixed index */ 631b8e80941Smrg#define MALI_SPECIAL_ATTRIBUTE_BASE 16 632b8e80941Smrg#define MALI_VERTEX_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 0) 633b8e80941Smrg#define MALI_INSTANCE_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 1) 634b8e80941Smrg 635b8e80941Smrg/* 636b8e80941Smrg * Mali Attributes 637b8e80941Smrg * 638b8e80941Smrg * This structure lets the attribute unit compute the address of an attribute 639b8e80941Smrg * given the vertex and instance ID. Unfortunately, the way this works is 640b8e80941Smrg * rather complicated when instancing is enabled. 641b8e80941Smrg * 642b8e80941Smrg * To explain this, first we need to explain how compute and vertex threads are 643b8e80941Smrg * dispatched. This is a guess (although a pretty firm guess!) 
 * since the
 * details are mostly hidden from the driver, except for attribute instancing.
 * When a quad is dispatched, it receives a single, linear index. However, we
 * need to translate that index into a (vertex id, instance id) pair, or a
 * (local id x, local id y, local id z) triple for compute shaders (although
 * vertex shaders and compute shaders are handled almost identically).
 * Focusing on vertex shaders, one option would be to do:
 *
 * vertex_id = linear_id % num_vertices
 * instance_id = linear_id / num_vertices
 *
 * but this involves a costly division and modulus by an arbitrary number.
 * Instead, we could pad num_vertices. We dispatch padded_num_vertices *
 * num_instances threads instead of num_vertices * num_instances, which results
 * in some "extra" threads with vertex_id >= num_vertices, which we have to
 * discard. The more we pad num_vertices, the more "wasted" threads we
 * dispatch, but the division is potentially easier.
 *
 * One straightforward choice is to pad num_vertices to the next power of two,
 * which means that the division and modulus are just simple bit shifts and
 * masking. But the actual algorithm is a bit more complicated. The thread
 * dispatcher has special support for dividing by 3, 5, 7, and 9, in addition
 * to dividing by a power of two. This is possibly using the technique
 * described in patent US20170010862A1. As a result, padded_num_vertices can be
 * 1, 3, 5, 7, or 9 times a power of two. This results in fewer wasted threads,
 * since we need less padding.
 *
 * padded_num_vertices is picked by the hardware.
 * The driver just specifies the
 * actual number of vertices. At least for Mali G71, the first few cases are
 * given by:
 *
 * num_vertices | padded_num_vertices
 * 3            | 4
 * 4-7          | 8
 * 8-11         | 12 (3 * 4)
 * 12-15        | 16
 * 16-19        | 20 (5 * 4)
 *
 * Note that padded_num_vertices is a multiple of four (presumably because
 * threads are dispatched in groups of 4). Also, padded_num_vertices is always
 * at least one more than num_vertices, which seems like a quirk of the
 * hardware. For larger num_vertices, the hardware uses the following
 * algorithm: using the binary representation of num_vertices, we look at the
 * most significant set bit as well as the following 3 bits. Let n be the
 * number of bits after those 4 bits. Then we set padded_num_vertices according
 * to the following table:
 *
 * high bits | padded_num_vertices
 * 1000      | 9 * 2^n
 * 1001      | 5 * 2^(n+1)
 * 101x      | 3 * 2^(n+2)
 * 110x      | 7 * 2^(n+1)
 * 111x      | 2^(n+4)
 *
 * For example, if num_vertices = 70 is passed to glDraw(), its binary
 * representation is 1000110, so n = 3 and the high bits are 1000, and
 * therefore padded_num_vertices = 9 * 2^3 = 72.
 *
 * The attribute unit works in terms of the original linear_id. If
 * num_instances = 1, then they are the same, and everything is simple.
 * However, with instancing things get more complicated. There are four
 * possible modes, two of them we can group together:
 *
 * 1. Use the linear_id directly.
 * Only used when there is no instancing.
 *
 * 2. Use the linear_id modulo a constant. This is used for per-vertex
 * attributes with instancing enabled by making the constant equal
 * padded_num_vertices. Because the modulus is always padded_num_vertices, this
 * mode only supports a modulus that is a power of 2 times 1, 3, 5, 7, or 9.
 * The shift field specifies the power of two, while the extra_flags field
 * specifies the odd number. If shift = n and extra_flags = m, then the modulus
 * is (2m + 1) * 2^n. As an example, if num_vertices = 70, then as computed
 * above, padded_num_vertices = 9 * 2^3, so we should set extra_flags = 4 and
 * shift = 3. Note that we must exactly follow the hardware algorithm used to
 * get padded_num_vertices in order to correctly implement per-vertex
 * attributes.
 *
 * 3. Divide the linear_id by a constant. In order to correctly implement
 * instance divisors, we have to divide linear_id by padded_num_vertices times
 * the user-specified divisor. So first we compute padded_num_vertices, again
 * following the exact same algorithm that the hardware uses, then multiply it
 * by the GL-level divisor to get the hardware-level divisor. This case is
 * further divided into two more cases. If the hardware-level divisor is a
 * power of two, then we just need to shift. The shift amount is specified by
 * the shift field, so that the hardware-level divisor is just 2^shift.
 *
 * If it isn't a power of two, then we have to divide by an arbitrary integer.
 * For that, we use the well-known technique of multiplying by an approximation
 * of the inverse.
 * The driver must compute the magic multiplier and shift
 * amount, and then the hardware does the multiplication and shift. The
 * hardware and driver also use the "round-down" optimization as described in
 * http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf.
 * The hardware further assumes the multiplier is between 2^31 and 2^32, so the
 * high bit is implicitly set to 1 even though it is set to 0 by the driver --
 * presumably this simplifies the hardware multiplier a little. The hardware
 * first multiplies linear_id by the multiplier and takes the high 32 bits,
 * then applies the round-down correction if extra_flags = 1, then finally
 * shifts right by the shift field.
 *
 * There are some differences between ridiculousfish's algorithm and the Mali
 * hardware algorithm, which means that the reference code from ridiculousfish
 * doesn't always produce the right constants. Mali does not use the pre-shift
 * optimization, since that would make a hardware implementation slower (it
 * would have to always do the pre-shift, multiply, and post-shift operations).
 * It also forces the multiplier to be at least 2^31, which means that the
 * exponent is entirely fixed, so there is no trial-and-error. Altogether,
 * given the divisor d, the algorithm the driver must follow is:
 *
 * 1. Set shift = floor(log2(d)).
 * 2. Compute m = ceil(2^(shift + 32) / d) and e = 2^(shift + 32) % d.
 * 3. If e <= 2^shift, then we need to use the round-down algorithm. Set
 * magic_divisor = m - 1 and extra_flags = 1.
 * 4. Otherwise, set magic_divisor = m and extra_flags = 0.
756b8e80941Smrg */ 757b8e80941Smrg 758b8e80941Smrgenum mali_attr_mode { 759b8e80941Smrg MALI_ATTR_UNUSED = 0, 760b8e80941Smrg MALI_ATTR_LINEAR = 1, 761b8e80941Smrg MALI_ATTR_POT_DIVIDE = 2, 762b8e80941Smrg MALI_ATTR_MODULO = 3, 763b8e80941Smrg MALI_ATTR_NPOT_DIVIDE = 4, 764b8e80941Smrg}; 765b8e80941Smrg 766b8e80941Smrg/* This magic "pseudo-address" is used as `elements` to implement 767b8e80941Smrg * gl_PointCoord. When read from a fragment shader, it generates a point 768b8e80941Smrg * coordinate per the OpenGL ES 2.0 specification. Flipped coordinate spaces 769b8e80941Smrg * require an affine transformation in the shader. */ 770b8e80941Smrg 771b8e80941Smrg#define MALI_VARYING_POINT_COORD (0x60) 772b8e80941Smrg 773b8e80941Smrgunion mali_attr { 774b8e80941Smrg /* This is used for actual attributes. */ 775b8e80941Smrg struct { 776b8e80941Smrg /* The bottom 3 bits are the mode */ 777b8e80941Smrg mali_ptr elements : 64 - 8; 778b8e80941Smrg u32 shift : 5; 779b8e80941Smrg u32 extra_flags : 3; 780b8e80941Smrg u32 stride; 781b8e80941Smrg u32 size; 782b8e80941Smrg }; 783b8e80941Smrg /* The entry after an NPOT_DIVIDE entry has this format. It stores 784b8e80941Smrg * extra information that wouldn't fit in a normal entry. 785b8e80941Smrg */ 786b8e80941Smrg struct { 787b8e80941Smrg u32 unk; /* = 0x20 */ 788b8e80941Smrg u32 magic_divisor; 789b8e80941Smrg u32 zero; 790b8e80941Smrg /* This is the original, GL-level divisor. 
*/ 791b8e80941Smrg u32 divisor; 792b8e80941Smrg }; 793b8e80941Smrg} __attribute__((packed)); 794b8e80941Smrg 795b8e80941Smrgstruct mali_attr_meta { 796b8e80941Smrg /* Vertex buffer index */ 797b8e80941Smrg u8 index; 798b8e80941Smrg 799b8e80941Smrg unsigned unknown1 : 2; 800b8e80941Smrg unsigned swizzle : 12; 801b8e80941Smrg enum mali_format format : 8; 802b8e80941Smrg 803b8e80941Smrg /* Always observed to be zero at the moment */ 804b8e80941Smrg unsigned unknown3 : 2; 805b8e80941Smrg 806b8e80941Smrg /* When packing multiple attributes in a buffer, offset addresses by this value */ 807b8e80941Smrg uint32_t src_offset; 808b8e80941Smrg} __attribute__((packed)); 809b8e80941Smrg 810b8e80941Smrgenum mali_fbd_type { 811b8e80941Smrg MALI_SFBD = 0, 812b8e80941Smrg MALI_MFBD = 1, 813b8e80941Smrg}; 814b8e80941Smrg 815b8e80941Smrg#define FBD_TYPE (1) 816b8e80941Smrg#define FBD_MASK (~0x3f) 817b8e80941Smrg 818b8e80941Smrgstruct mali_uniform_buffer_meta { 819b8e80941Smrg /* This is actually the size minus 1 (MALI_POSITIVE), in units of 16 820b8e80941Smrg * bytes. This gives a maximum of 2^14 bytes, which just so happens to 821b8e80941Smrg * be the GL minimum-maximum for GL_MAX_UNIFORM_BLOCK_SIZE. 822b8e80941Smrg */ 823b8e80941Smrg u64 size : 10; 824b8e80941Smrg 825b8e80941Smrg /* This is missing the bottom 2 bits and top 8 bits. The top 8 bits 826b8e80941Smrg * should be 0 for userspace pointers, according to 827b8e80941Smrg * https://lwn.net/Articles/718895/. By reusing these bits, we can make 828b8e80941Smrg * each entry in the table only 64 bits. 829b8e80941Smrg */ 830b8e80941Smrg mali_ptr ptr : 64 - 10; 831b8e80941Smrg}; 832b8e80941Smrg 833b8e80941Smrg/* On Bifrost, these fields are the same between the vertex and tiler payloads. 834b8e80941Smrg * They also seem to be the same between Bifrost and Midgard. They're shared in 835b8e80941Smrg * fused payloads. 
836b8e80941Smrg */ 837b8e80941Smrg 838b8e80941Smrg/* Applies to unknown_draw */ 839b8e80941Smrg 840b8e80941Smrg#define MALI_DRAW_INDEXED_UINT8 (0x10) 841b8e80941Smrg#define MALI_DRAW_INDEXED_UINT16 (0x20) 842b8e80941Smrg#define MALI_DRAW_INDEXED_UINT32 (0x30) 843b8e80941Smrg#define MALI_DRAW_VARYING_SIZE (0x100) 844b8e80941Smrg#define MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX (0x10000) 845b8e80941Smrg 846b8e80941Smrgstruct mali_vertex_tiler_prefix { 847b8e80941Smrg /* This is a dynamic bitfield containing the following things in this order: 848b8e80941Smrg * 849b8e80941Smrg * - gl_WorkGroupSize.x 850b8e80941Smrg * - gl_WorkGroupSize.y 851b8e80941Smrg * - gl_WorkGroupSize.z 852b8e80941Smrg * - gl_NumWorkGroups.x 853b8e80941Smrg * - gl_NumWorkGroups.y 854b8e80941Smrg * - gl_NumWorkGroups.z 855b8e80941Smrg * 856b8e80941Smrg * The number of bits allocated for each number is based on the *_shift 857b8e80941Smrg * fields below. For example, workgroups_y_shift gives the bit that 858b8e80941Smrg * gl_NumWorkGroups.y starts at, and workgroups_z_shift gives the bit 859b8e80941Smrg * that gl_NumWorkGroups.z starts at (and therefore one after the bit 860b8e80941Smrg * that gl_NumWorkGroups.y ends at). The actual value for each gl_* 861b8e80941Smrg * value is one more than the stored value, since if any of the values 862b8e80941Smrg * are zero, then there would be no invocations (and hence no job). If 863b8e80941Smrg * there were 0 bits allocated to a given field, then it must be zero, 864b8e80941Smrg * and hence the real value is one. 865b8e80941Smrg * 866b8e80941Smrg * Vertex jobs reuse the same job dispatch mechanism as compute jobs, 867b8e80941Smrg * effectively doing glDispatchCompute(1, vertex_count, instance_count) 868b8e80941Smrg * where vertex count is the number of vertices. 
869b8e80941Smrg */ 870b8e80941Smrg u32 invocation_count; 871b8e80941Smrg 872b8e80941Smrg u32 size_y_shift : 5; 873b8e80941Smrg u32 size_z_shift : 5; 874b8e80941Smrg u32 workgroups_x_shift : 6; 875b8e80941Smrg u32 workgroups_y_shift : 6; 876b8e80941Smrg u32 workgroups_z_shift : 6; 877b8e80941Smrg /* This is max(workgroups_x_shift, 2) in all the cases I've seen. */ 878b8e80941Smrg u32 workgroups_x_shift_2 : 4; 879b8e80941Smrg 880b8e80941Smrg u32 draw_mode : 4; 881b8e80941Smrg u32 unknown_draw : 22; 882b8e80941Smrg 883b8e80941Smrg /* This is the the same as workgroups_x_shift_2 in compute shaders, but 884b8e80941Smrg * always 5 for vertex jobs and 6 for tiler jobs. I suspect this has 885b8e80941Smrg * something to do with how many quads get put in the same execution 886b8e80941Smrg * engine, which is a balance (you don't want to starve the engine, but 887b8e80941Smrg * you also want to distribute work evenly). 888b8e80941Smrg */ 889b8e80941Smrg u32 workgroups_x_shift_3 : 6; 890b8e80941Smrg 891b8e80941Smrg 892b8e80941Smrg /* Negative of draw_start for TILER jobs from what I've seen */ 893b8e80941Smrg int32_t negative_start; 894b8e80941Smrg u32 zero1; 895b8e80941Smrg 896b8e80941Smrg /* Like many other strictly nonzero quantities, index_count is 897b8e80941Smrg * subtracted by one. For an indexed cube, this is equal to 35 = 6 898b8e80941Smrg * faces * 2 triangles/per face * 3 vertices/per triangle - 1. That is, 899b8e80941Smrg * for an indexed draw, index_count is the number of actual vertices 900b8e80941Smrg * rendered whereas invocation_count is the number of unique vertices 901b8e80941Smrg * rendered (the number of times the vertex shader must be invoked). 902b8e80941Smrg * For non-indexed draws, this is just equal to invocation_count. */ 903b8e80941Smrg 904b8e80941Smrg u32 index_count; 905b8e80941Smrg 906b8e80941Smrg /* No hidden structure; literally just a pointer to an array of uint 907b8e80941Smrg * indices (width depends on flags). 
Thanks, guys, for not making my 908b8e80941Smrg * life insane for once! NULL for non-indexed draws. */ 909b8e80941Smrg 910b8e80941Smrg uintptr_t indices; 911b8e80941Smrg} __attribute__((packed)); 912b8e80941Smrg 913b8e80941Smrg/* Point size / line width can either be specified as a 32-bit float (for 914b8e80941Smrg * constant size) or as a [machine word size]-bit GPU pointer (for varying size). If a pointer 915b8e80941Smrg * is selected, by setting the appropriate MALI_DRAW_VARYING_SIZE bit in the tiler 916b8e80941Smrg * payload, the contents of varying_pointer will be intepreted as an array of 917b8e80941Smrg * fp16 sizes, one for each vertex. gl_PointSize is therefore implemented by 918b8e80941Smrg * creating a special MALI_R16F varying writing to varying_pointer. */ 919b8e80941Smrg 920b8e80941Smrgunion midgard_primitive_size { 921b8e80941Smrg float constant; 922b8e80941Smrg uintptr_t pointer; 923b8e80941Smrg}; 924b8e80941Smrg 925b8e80941Smrgstruct bifrost_vertex_only { 926b8e80941Smrg u32 unk2; /* =0x2 */ 927b8e80941Smrg 928b8e80941Smrg u32 zero0; 929b8e80941Smrg 930b8e80941Smrg u64 zero1; 931b8e80941Smrg} __attribute__((packed)); 932b8e80941Smrg 933b8e80941Smrgstruct bifrost_tiler_heap_meta { 934b8e80941Smrg u32 zero; 935b8e80941Smrg u32 heap_size; 936b8e80941Smrg /* note: these are just guesses! */ 937b8e80941Smrg mali_ptr tiler_heap_start; 938b8e80941Smrg mali_ptr tiler_heap_free; 939b8e80941Smrg mali_ptr tiler_heap_end; 940b8e80941Smrg 941b8e80941Smrg /* hierarchy weights? but they're still 0 after the job has run... */ 942b8e80941Smrg u32 zeros[12]; 943b8e80941Smrg} __attribute__((packed)); 944b8e80941Smrg 945b8e80941Smrgstruct bifrost_tiler_meta { 946b8e80941Smrg u64 zero0; 947b8e80941Smrg u32 unk; // = 0xf0 948b8e80941Smrg u16 width; 949b8e80941Smrg u16 height; 950b8e80941Smrg u64 zero1; 951b8e80941Smrg mali_ptr tiler_heap_meta; 952b8e80941Smrg /* TODO what is this used for? 
*/ 953b8e80941Smrg u64 zeros[20]; 954b8e80941Smrg} __attribute__((packed)); 955b8e80941Smrg 956b8e80941Smrgstruct bifrost_tiler_only { 957b8e80941Smrg /* 0x20 */ 958b8e80941Smrg union midgard_primitive_size primitive_size; 959b8e80941Smrg 960b8e80941Smrg mali_ptr tiler_meta; 961b8e80941Smrg 962b8e80941Smrg u64 zero1, zero2, zero3, zero4, zero5, zero6; 963b8e80941Smrg 964b8e80941Smrg u32 gl_enables; 965b8e80941Smrg u32 zero7; 966b8e80941Smrg u64 zero8; 967b8e80941Smrg} __attribute__((packed)); 968b8e80941Smrg 969b8e80941Smrgstruct bifrost_scratchpad { 970b8e80941Smrg u32 zero; 971b8e80941Smrg u32 flags; // = 0x1f 972b8e80941Smrg /* This is a pointer to a CPU-inaccessible buffer, 16 pages, allocated 973b8e80941Smrg * during startup. It seems to serve the same purpose as the 974b8e80941Smrg * gpu_scratchpad in the SFBD for Midgard, although it's slightly 975b8e80941Smrg * larger. 976b8e80941Smrg */ 977b8e80941Smrg mali_ptr gpu_scratchpad; 978b8e80941Smrg} __attribute__((packed)); 979b8e80941Smrg 980b8e80941Smrgstruct mali_vertex_tiler_postfix { 981b8e80941Smrg /* Zero for vertex jobs. Pointer to the position (gl_Position) varying 982b8e80941Smrg * output from the vertex shader for tiler jobs. 983b8e80941Smrg */ 984b8e80941Smrg 985b8e80941Smrg uintptr_t position_varying; 986b8e80941Smrg 987b8e80941Smrg /* An array of mali_uniform_buffer_meta's. The size is given by the 988b8e80941Smrg * shader_meta. 989b8e80941Smrg */ 990b8e80941Smrg uintptr_t uniform_buffers; 991b8e80941Smrg 992b8e80941Smrg /* This is a pointer to an array of pointers to the texture 993b8e80941Smrg * descriptors, number of pointers bounded by number of textures. The 994b8e80941Smrg * indirection is needed to accomodate varying numbers and sizes of 995b8e80941Smrg * texture descriptors */ 996b8e80941Smrg uintptr_t texture_trampoline; 997b8e80941Smrg 998b8e80941Smrg /* For OpenGL, from what I've seen, this is intimately connected to 999b8e80941Smrg * texture_meta. 
cwabbott says this is not the case under Vulkan, hence 1000b8e80941Smrg * why this field is seperate (Midgard is Vulkan capable). Pointer to 1001b8e80941Smrg * array of sampler descriptors (which are uniform in size) */ 1002b8e80941Smrg uintptr_t sampler_descriptor; 1003b8e80941Smrg 1004b8e80941Smrg uintptr_t uniforms; 1005b8e80941Smrg u8 flags : 4; 1006b8e80941Smrg uintptr_t _shader_upper : MALI_SHORT_PTR_BITS - 4; /* struct shader_meta */ 1007b8e80941Smrg uintptr_t attributes; /* struct attribute_buffer[] */ 1008b8e80941Smrg uintptr_t attribute_meta; /* attribute_meta[] */ 1009b8e80941Smrg uintptr_t varyings; /* struct attr */ 1010b8e80941Smrg uintptr_t varying_meta; /* pointer */ 1011b8e80941Smrg uintptr_t viewport; 1012b8e80941Smrg uintptr_t occlusion_counter; /* A single bit as far as I can tell */ 1013b8e80941Smrg 1014b8e80941Smrg /* Note: on Bifrost, this isn't actually the FBD. It points to 1015b8e80941Smrg * bifrost_scratchpad instead. However, it does point to the same thing 1016b8e80941Smrg * in vertex and tiler jobs. 
1017b8e80941Smrg */ 1018b8e80941Smrg mali_ptr framebuffer; 1019b8e80941Smrg 1020b8e80941Smrg#ifdef __LP64__ 1021b8e80941Smrg#ifdef BIFROST 1022b8e80941Smrg /* most likely padding to make this a multiple of 64 bytes */ 1023b8e80941Smrg u64 zero7; 1024b8e80941Smrg#endif 1025b8e80941Smrg#endif 1026b8e80941Smrg} __attribute__((packed)); 1027b8e80941Smrg 1028b8e80941Smrgstruct midgard_payload_vertex_tiler { 1029b8e80941Smrg#ifndef __LP64__ 1030b8e80941Smrg union midgard_primitive_size primitive_size; 1031b8e80941Smrg#endif 1032b8e80941Smrg 1033b8e80941Smrg struct mali_vertex_tiler_prefix prefix; 1034b8e80941Smrg 1035b8e80941Smrg#ifndef __LP64__ 1036b8e80941Smrg u32 zero3; 1037b8e80941Smrg#endif 1038b8e80941Smrg 1039b8e80941Smrg u32 gl_enables; // 0x5 1040b8e80941Smrg 1041b8e80941Smrg /* Offset for first vertex in buffer */ 1042b8e80941Smrg u32 draw_start; 1043b8e80941Smrg 1044b8e80941Smrg uintptr_t zero5; 1045b8e80941Smrg 1046b8e80941Smrg struct mali_vertex_tiler_postfix postfix; 1047b8e80941Smrg 1048b8e80941Smrg#ifdef __LP64__ 1049b8e80941Smrg union midgard_primitive_size primitive_size; 1050b8e80941Smrg#endif 1051b8e80941Smrg} __attribute__((packed)); 1052b8e80941Smrg 1053b8e80941Smrgstruct bifrost_payload_vertex { 1054b8e80941Smrg struct mali_vertex_tiler_prefix prefix; 1055b8e80941Smrg struct bifrost_vertex_only vertex; 1056b8e80941Smrg struct mali_vertex_tiler_postfix postfix; 1057b8e80941Smrg} __attribute__((packed)); 1058b8e80941Smrg 1059b8e80941Smrgstruct bifrost_payload_tiler { 1060b8e80941Smrg struct mali_vertex_tiler_prefix prefix; 1061b8e80941Smrg struct bifrost_tiler_only tiler; 1062b8e80941Smrg struct mali_vertex_tiler_postfix postfix; 1063b8e80941Smrg} __attribute__((packed)); 1064b8e80941Smrg 1065b8e80941Smrgstruct bifrost_payload_fused { 1066b8e80941Smrg struct mali_vertex_tiler_prefix prefix; 1067b8e80941Smrg struct bifrost_tiler_only tiler; 1068b8e80941Smrg struct mali_vertex_tiler_postfix tiler_postfix; 1069b8e80941Smrg struct bifrost_vertex_only 
vertex; 1070b8e80941Smrg struct mali_vertex_tiler_postfix vertex_postfix; 1071b8e80941Smrg} __attribute__((packed)); 1072b8e80941Smrg 1073b8e80941Smrg/* Pointed to from texture_trampoline, mostly unknown still, haven't 1074b8e80941Smrg * managed to replay successfully */ 1075b8e80941Smrg 1076b8e80941Smrg/* Purposeful off-by-one in width, height fields. For example, a (64, 64) 1077b8e80941Smrg * texture is stored as (63, 63) in these fields. This adjusts for that. 1078b8e80941Smrg * There's an identical pattern in the framebuffer descriptor. Even vertex 1079b8e80941Smrg * count fields work this way, hence the generic name -- integral fields that 1080b8e80941Smrg * are strictly positive generally need this adjustment. */ 1081b8e80941Smrg 1082b8e80941Smrg#define MALI_POSITIVE(dim) (dim - 1) 1083b8e80941Smrg 1084b8e80941Smrg/* Opposite of MALI_POSITIVE, found in the depth_units field */ 1085b8e80941Smrg 1086b8e80941Smrg#define MALI_NEGATIVE(dim) (dim + 1) 1087b8e80941Smrg 1088b8e80941Smrg/* Used with wrapping. Incomplete (this is a 4-bit field...) 
*/ 1089b8e80941Smrg 1090b8e80941Smrgenum mali_wrap_mode { 1091b8e80941Smrg MALI_WRAP_REPEAT = 0x8, 1092b8e80941Smrg MALI_WRAP_CLAMP_TO_EDGE = 0x9, 1093b8e80941Smrg MALI_WRAP_CLAMP_TO_BORDER = 0xB, 1094b8e80941Smrg MALI_WRAP_MIRRORED_REPEAT = 0xC 1095b8e80941Smrg}; 1096b8e80941Smrg 1097b8e80941Smrg/* 8192x8192 */ 1098b8e80941Smrg#define MAX_MIP_LEVELS (13) 1099b8e80941Smrg 1100b8e80941Smrg/* Cubemap bloats everything up */ 1101b8e80941Smrg#define MAX_FACES (6) 1102b8e80941Smrg 1103b8e80941Smrg/* Corresponds to the type passed to glTexImage2D and so forth */ 1104b8e80941Smrg 1105b8e80941Smrgstruct mali_texture_format { 1106b8e80941Smrg unsigned swizzle : 12; 1107b8e80941Smrg enum mali_format format : 8; 1108b8e80941Smrg 1109b8e80941Smrg unsigned usage1 : 3; 1110b8e80941Smrg unsigned is_not_cubemap : 1; 1111b8e80941Smrg unsigned usage2 : 8; 1112b8e80941Smrg} __attribute__((packed)); 1113b8e80941Smrg 1114b8e80941Smrgstruct mali_texture_descriptor { 1115b8e80941Smrg uint16_t width; 1116b8e80941Smrg uint16_t height; 1117b8e80941Smrg uint16_t depth; 1118b8e80941Smrg 1119b8e80941Smrg uint16_t unknown1; 1120b8e80941Smrg 1121b8e80941Smrg struct mali_texture_format format; 1122b8e80941Smrg 1123b8e80941Smrg uint16_t unknown3; 1124b8e80941Smrg 1125b8e80941Smrg /* One for non-mipmapped, zero for mipmapped */ 1126b8e80941Smrg uint8_t unknown3A; 1127b8e80941Smrg 1128b8e80941Smrg /* Zero for non-mipmapped, (number of levels - 1) for mipmapped */ 1129b8e80941Smrg uint8_t nr_mipmap_levels; 1130b8e80941Smrg 1131b8e80941Smrg /* Swizzling is a single 32-bit word, broken up here for convenience. 
1132b8e80941Smrg * Here, swizzling refers to the ES 3.0 texture parameters for channel 1133b8e80941Smrg * level swizzling, not the internal pixel-level swizzling which is 1134b8e80941Smrg * below OpenGL's reach */ 1135b8e80941Smrg 1136b8e80941Smrg unsigned swizzle : 12; 1137b8e80941Smrg unsigned swizzle_zero : 20; 1138b8e80941Smrg 1139b8e80941Smrg uint32_t unknown5; 1140b8e80941Smrg uint32_t unknown6; 1141b8e80941Smrg uint32_t unknown7; 1142b8e80941Smrg 1143b8e80941Smrg mali_ptr swizzled_bitmaps[MAX_MIP_LEVELS * MAX_FACES]; 1144b8e80941Smrg} __attribute__((packed)); 1145b8e80941Smrg 1146b8e80941Smrg/* Used as part of filter_mode */ 1147b8e80941Smrg 1148b8e80941Smrg#define MALI_LINEAR 0 1149b8e80941Smrg#define MALI_NEAREST 1 1150b8e80941Smrg#define MALI_MIP_LINEAR (0x18) 1151b8e80941Smrg 1152b8e80941Smrg/* Used to construct low bits of filter_mode */ 1153b8e80941Smrg 1154b8e80941Smrg#define MALI_TEX_MAG(mode) (((mode) & 1) << 0) 1155b8e80941Smrg#define MALI_TEX_MIN(mode) (((mode) & 1) << 1) 1156b8e80941Smrg 1157b8e80941Smrg#define MALI_TEX_MAG_MASK (1) 1158b8e80941Smrg#define MALI_TEX_MIN_MASK (2) 1159b8e80941Smrg 1160b8e80941Smrg#define MALI_FILTER_NAME(filter) (filter ? "MALI_NEAREST" : "MALI_LINEAR") 1161b8e80941Smrg 1162b8e80941Smrg/* Used for lod encoding. Thanks @urjaman for pointing out these routines can 1163b8e80941Smrg * be cleaned up a lot. */ 1164b8e80941Smrg 1165b8e80941Smrg#define DECODE_FIXED_16(x) ((float) (x / 256.0)) 1166b8e80941Smrg 1167b8e80941Smrgstatic inline uint16_t 1168b8e80941SmrgFIXED_16(float x) 1169b8e80941Smrg{ 1170b8e80941Smrg /* Clamp inputs, accounting for float error */ 1171b8e80941Smrg float max_lod = (32.0 - (1.0 / 512.0)); 1172b8e80941Smrg 1173b8e80941Smrg x = ((x > max_lod) ? max_lod : ((x < 0.0) ? 
0.0 : x)); 1174b8e80941Smrg 1175b8e80941Smrg return (int) (x * 256.0); 1176b8e80941Smrg} 1177b8e80941Smrg 1178b8e80941Smrgstruct mali_sampler_descriptor { 1179b8e80941Smrg uint32_t filter_mode; 1180b8e80941Smrg 1181b8e80941Smrg /* Fixed point. Upper 8-bits is before the decimal point, although it 1182b8e80941Smrg * caps [0-31]. Lower 8-bits is after the decimal point: int(round(x * 1183b8e80941Smrg * 256)) */ 1184b8e80941Smrg 1185b8e80941Smrg uint16_t min_lod; 1186b8e80941Smrg uint16_t max_lod; 1187b8e80941Smrg 1188b8e80941Smrg /* All one word in reality, but packed a bit */ 1189b8e80941Smrg 1190b8e80941Smrg enum mali_wrap_mode wrap_s : 4; 1191b8e80941Smrg enum mali_wrap_mode wrap_t : 4; 1192b8e80941Smrg enum mali_wrap_mode wrap_r : 4; 1193b8e80941Smrg enum mali_alt_func compare_func : 3; 1194b8e80941Smrg 1195b8e80941Smrg /* A single set bit of unknown, ha! */ 1196b8e80941Smrg unsigned unknown2 : 1; 1197b8e80941Smrg 1198b8e80941Smrg unsigned zero : 16; 1199b8e80941Smrg 1200b8e80941Smrg uint32_t zero2; 1201b8e80941Smrg float border_color[4]; 1202b8e80941Smrg} __attribute__((packed)); 1203b8e80941Smrg 1204b8e80941Smrg/* TODO: What are the floats? Apparently always { -inf, -inf, inf, inf }, 1205b8e80941Smrg * unless the scissor test is enabled. 1206b8e80941Smrg * 1207b8e80941Smrg * viewport0/viewport1 form the arguments to glViewport. viewport1 is modified 1208b8e80941Smrg * by MALI_POSITIVE; viewport0 is as-is. 
1209b8e80941Smrg */ 1210b8e80941Smrg 1211b8e80941Smrgstruct mali_viewport { 1212b8e80941Smrg /* XY clipping planes */ 1213b8e80941Smrg float clip_minx; 1214b8e80941Smrg float clip_miny; 1215b8e80941Smrg float clip_maxx; 1216b8e80941Smrg float clip_maxy; 1217b8e80941Smrg 1218b8e80941Smrg /* Depth clipping planes */ 1219b8e80941Smrg float clip_minz; 1220b8e80941Smrg float clip_maxz; 1221b8e80941Smrg 1222b8e80941Smrg u16 viewport0[2]; 1223b8e80941Smrg u16 viewport1[2]; 1224b8e80941Smrg} __attribute__((packed)); 1225b8e80941Smrg 1226b8e80941Smrg/* From presentations, 16x16 tiles externally. Use shift for fast computation 1227b8e80941Smrg * of tile numbers. */ 1228b8e80941Smrg 1229b8e80941Smrg#define MALI_TILE_SHIFT 4 1230b8e80941Smrg#define MALI_TILE_LENGTH (1 << MALI_TILE_SHIFT) 1231b8e80941Smrg 1232b8e80941Smrg/* Tile coordinates are stored as a compact u32, as only 12 bits are needed to 1233b8e80941Smrg * each component. Notice that this provides a theoretical upper bound of (1 << 1234b8e80941Smrg * 12) = 4096 tiles in each direction, addressing a maximum framebuffer of size 1235b8e80941Smrg * 65536x65536. Multiplying that together, times another four given that Mali 1236b8e80941Smrg * framebuffers are 32-bit ARGB8888, means that this upper bound would take 16 1237b8e80941Smrg * gigabytes of RAM just to store the uncompressed framebuffer itself, let 1238b8e80941Smrg * alone rendering in real-time to such a buffer. 
1239b8e80941Smrg * 1240b8e80941Smrg * Nice job, guys.*/ 1241b8e80941Smrg 1242b8e80941Smrg/* From mali_kbase_10969_workaround.c */ 1243b8e80941Smrg#define MALI_X_COORD_MASK 0x00000FFF 1244b8e80941Smrg#define MALI_Y_COORD_MASK 0x0FFF0000 1245b8e80941Smrg 1246b8e80941Smrg/* Extract parts of a tile coordinate */ 1247b8e80941Smrg 1248b8e80941Smrg#define MALI_TILE_COORD_X(coord) ((coord) & MALI_X_COORD_MASK) 1249b8e80941Smrg#define MALI_TILE_COORD_Y(coord) (((coord) & MALI_Y_COORD_MASK) >> 16) 1250b8e80941Smrg#define MALI_TILE_COORD_FLAGS(coord) ((coord) & ~(MALI_X_COORD_MASK | MALI_Y_COORD_MASK)) 1251b8e80941Smrg 1252b8e80941Smrg/* No known flags yet, but just in case...? */ 1253b8e80941Smrg 1254b8e80941Smrg#define MALI_TILE_NO_FLAG (0) 1255b8e80941Smrg 1256b8e80941Smrg/* Helpers to generate tile coordinates based on the boundary coordinates in 1257b8e80941Smrg * screen space. So, with the bounds (0, 0) to (128, 128) for the screen, these 1258b8e80941Smrg * functions would convert it to the bounding tiles (0, 0) to (7, 7). 1259b8e80941Smrg * Intentional "off-by-one"; finding the tile number is a form of fencepost 1260b8e80941Smrg * problem. */ 1261b8e80941Smrg 1262b8e80941Smrg#define MALI_MAKE_TILE_COORDS(X, Y) ((X) | ((Y) << 16)) 1263b8e80941Smrg#define MALI_BOUND_TO_TILE(B, bias) ((B - bias) >> MALI_TILE_SHIFT) 1264b8e80941Smrg#define MALI_COORDINATE_TO_TILE(W, H, bias) MALI_MAKE_TILE_COORDS(MALI_BOUND_TO_TILE(W, bias), MALI_BOUND_TO_TILE(H, bias)) 1265b8e80941Smrg#define MALI_COORDINATE_TO_TILE_MIN(W, H) MALI_COORDINATE_TO_TILE(W, H, 0) 1266b8e80941Smrg#define MALI_COORDINATE_TO_TILE_MAX(W, H) MALI_COORDINATE_TO_TILE(W, H, 1) 1267b8e80941Smrg 1268b8e80941Smrgstruct mali_payload_fragment { 1269b8e80941Smrg u32 min_tile_coord; 1270b8e80941Smrg u32 max_tile_coord; 1271b8e80941Smrg mali_ptr framebuffer; 1272b8e80941Smrg} __attribute__((packed)); 1273b8e80941Smrg 1274b8e80941Smrg/* (Single?) 
Framebuffer Descriptor */ 1275b8e80941Smrg 1276b8e80941Smrg/* Flags apply to format. With just MSAA_A and MSAA_B, the framebuffer is 1277b8e80941Smrg * configured for 4x. With MSAA_8, it is configured for 8x. */ 1278b8e80941Smrg 1279b8e80941Smrg#define MALI_FRAMEBUFFER_MSAA_8 (1 << 3) 1280b8e80941Smrg#define MALI_FRAMEBUFFER_MSAA_A (1 << 4) 1281b8e80941Smrg#define MALI_FRAMEBUFFER_MSAA_B (1 << 23) 1282b8e80941Smrg 1283b8e80941Smrg/* Fast/slow based on whether all three buffers are cleared at once */ 1284b8e80941Smrg 1285b8e80941Smrg#define MALI_CLEAR_FAST (1 << 18) 1286b8e80941Smrg#define MALI_CLEAR_SLOW (1 << 28) 1287b8e80941Smrg#define MALI_CLEAR_SLOW_STENCIL (1 << 31) 1288b8e80941Smrg 1289b8e80941Smrgstruct mali_single_framebuffer { 1290b8e80941Smrg u32 unknown1; 1291b8e80941Smrg u32 unknown2; 1292b8e80941Smrg u64 unknown_address_0; 1293b8e80941Smrg u64 zero1; 1294b8e80941Smrg u64 zero0; 1295b8e80941Smrg 1296b8e80941Smrg /* Exact format is ironically not known, since EGL is finnicky with the 1297b8e80941Smrg * blob. MSAA, colourspace, etc are configured here. */ 1298b8e80941Smrg 1299b8e80941Smrg u32 format; 1300b8e80941Smrg 1301b8e80941Smrg u32 clear_flags; 1302b8e80941Smrg u32 zero2; 1303b8e80941Smrg 1304b8e80941Smrg /* Purposeful off-by-one in these fields should be accounted for by the 1305b8e80941Smrg * MALI_DIMENSION macro */ 1306b8e80941Smrg 1307b8e80941Smrg u16 width; 1308b8e80941Smrg u16 height; 1309b8e80941Smrg 1310b8e80941Smrg u32 zero3[8]; 1311b8e80941Smrg 1312b8e80941Smrg /* By default, the framebuffer is upside down from OpenGL's 1313b8e80941Smrg * perspective. Set framebuffer to the end and negate the stride to 1314b8e80941Smrg * flip in the Y direction */ 1315b8e80941Smrg 1316b8e80941Smrg mali_ptr framebuffer; 1317b8e80941Smrg int32_t stride; 1318b8e80941Smrg 1319b8e80941Smrg u32 zero4; 1320b8e80941Smrg 1321b8e80941Smrg /* Depth and stencil buffers are interleaved, it appears, as they are 1322b8e80941Smrg * set to the same address in captures. 
Both fields set to zero if the 1323b8e80941Smrg * buffer is not being cleared. Depending on GL_ENABLE magic, you might 1324b8e80941Smrg * get a zero enable despite the buffer being present; that still is 1325b8e80941Smrg * disabled. */ 1326b8e80941Smrg 1327b8e80941Smrg mali_ptr depth_buffer; // not SAME_VA 1328b8e80941Smrg u64 depth_buffer_enable; 1329b8e80941Smrg 1330b8e80941Smrg mali_ptr stencil_buffer; // not SAME_VA 1331b8e80941Smrg u64 stencil_buffer_enable; 1332b8e80941Smrg 1333b8e80941Smrg u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware 1334b8e80941Smrg u32 clear_color_2; // always equal, but unclear function? 1335b8e80941Smrg u32 clear_color_3; // always equal, but unclear function? 1336b8e80941Smrg u32 clear_color_4; // always equal, but unclear function? 1337b8e80941Smrg 1338b8e80941Smrg /* Set to zero if not cleared */ 1339b8e80941Smrg 1340b8e80941Smrg float clear_depth_1; // float32, ditto 1341b8e80941Smrg float clear_depth_2; // float32, ditto 1342b8e80941Smrg float clear_depth_3; // float32, ditto 1343b8e80941Smrg float clear_depth_4; // float32, ditto 1344b8e80941Smrg 1345b8e80941Smrg u32 clear_stencil; // Exactly as it appears in OpenGL 1346b8e80941Smrg 1347b8e80941Smrg u32 zero6[7]; 1348b8e80941Smrg 1349b8e80941Smrg /* Very weird format, see generation code in trans_builder.c */ 1350b8e80941Smrg u32 resolution_check; 1351b8e80941Smrg 1352b8e80941Smrg u32 tiler_flags; 1353b8e80941Smrg 1354b8e80941Smrg u64 unknown_address_1; /* Pointing towards... a zero buffer? 
*/ 1355b8e80941Smrg u64 unknown_address_2; 1356b8e80941Smrg 1357b8e80941Smrg /* See mali_kbase_replay.c */ 1358b8e80941Smrg u64 tiler_heap_free; 1359b8e80941Smrg u64 tiler_heap_end; 1360b8e80941Smrg 1361b8e80941Smrg /* More below this, maybe */ 1362b8e80941Smrg} __attribute__((packed)); 1363b8e80941Smrg 1364b8e80941Smrg/* Format bits for the render target flags */ 1365b8e80941Smrg 1366b8e80941Smrg#define MALI_MFBD_FORMAT_AFBC (1 << 5) 1367b8e80941Smrg#define MALI_MFBD_FORMAT_MSAA (1 << 7) 1368b8e80941Smrg 1369b8e80941Smrgstruct mali_rt_format { 1370b8e80941Smrg unsigned unk1 : 32; 1371b8e80941Smrg unsigned unk2 : 3; 1372b8e80941Smrg 1373b8e80941Smrg unsigned nr_channels : 2; /* MALI_POSITIVE */ 1374b8e80941Smrg 1375b8e80941Smrg unsigned flags : 11; 1376b8e80941Smrg 1377b8e80941Smrg unsigned swizzle : 12; 1378b8e80941Smrg 1379b8e80941Smrg unsigned unk4 : 4; 1380b8e80941Smrg} __attribute__((packed)); 1381b8e80941Smrg 1382b8e80941Smrgstruct bifrost_render_target { 1383b8e80941Smrg struct mali_rt_format format; 1384b8e80941Smrg 1385b8e80941Smrg u64 zero1; 1386b8e80941Smrg 1387b8e80941Smrg union { 1388b8e80941Smrg struct { 1389b8e80941Smrg /* Stuff related to ARM Framebuffer Compression. When AFBC is enabled, 1390b8e80941Smrg * there is an extra metadata buffer that contains 16 bytes per tile. 1391b8e80941Smrg * The framebuffer needs to be the same size as before, since we don't 1392b8e80941Smrg * know ahead of time how much space it will take up. The 1393b8e80941Smrg * framebuffer_stride is set to 0, since the data isn't stored linearly 1394b8e80941Smrg * anymore. 
1395b8e80941Smrg */ 1396b8e80941Smrg 1397b8e80941Smrg mali_ptr metadata; 1398b8e80941Smrg u32 stride; // stride in units of tiles 1399b8e80941Smrg u32 unk; // = 0x20000 1400b8e80941Smrg } afbc; 1401b8e80941Smrg 1402b8e80941Smrg struct { 1403b8e80941Smrg /* Heck if I know */ 1404b8e80941Smrg u64 unk; 1405b8e80941Smrg mali_ptr pointer; 1406b8e80941Smrg } chunknown; 1407b8e80941Smrg }; 1408b8e80941Smrg 1409b8e80941Smrg mali_ptr framebuffer; 1410b8e80941Smrg 1411b8e80941Smrg u32 zero2 : 4; 1412b8e80941Smrg u32 framebuffer_stride : 28; // in units of bytes 1413b8e80941Smrg u32 zero3; 1414b8e80941Smrg 1415b8e80941Smrg u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware 1416b8e80941Smrg u32 clear_color_2; // always equal, but unclear function? 1417b8e80941Smrg u32 clear_color_3; // always equal, but unclear function? 1418b8e80941Smrg u32 clear_color_4; // always equal, but unclear function? 1419b8e80941Smrg} __attribute__((packed)); 1420b8e80941Smrg 1421b8e80941Smrg/* An optional part of bifrost_framebuffer. It comes between the main structure 1422b8e80941Smrg * and the array of render targets. It must be included if any of these are 1423b8e80941Smrg * enabled: 1424b8e80941Smrg * 1425b8e80941Smrg * - Transaction Elimination 1426b8e80941Smrg * - Depth/stencil 1427b8e80941Smrg * - TODO: Anything else? 
1428b8e80941Smrg */ 1429b8e80941Smrg 1430b8e80941Smrg/* Flags field: note, these are guesses */ 1431b8e80941Smrg 1432b8e80941Smrg#define MALI_EXTRA_PRESENT (0x400) 1433b8e80941Smrg#define MALI_EXTRA_AFBC (0x20) 1434b8e80941Smrg#define MALI_EXTRA_AFBC_ZS (0x10) 1435b8e80941Smrg#define MALI_EXTRA_ZS (0x4) 1436b8e80941Smrg 1437b8e80941Smrgstruct bifrost_fb_extra { 1438b8e80941Smrg mali_ptr checksum; 1439b8e80941Smrg /* Each tile has an 8 byte checksum, so the stride is "width in tiles * 8" */ 1440b8e80941Smrg u32 checksum_stride; 1441b8e80941Smrg 1442b8e80941Smrg u32 flags; 1443b8e80941Smrg 1444b8e80941Smrg union { 1445b8e80941Smrg /* Note: AFBC is only allowed for 24/8 combined depth/stencil. */ 1446b8e80941Smrg struct { 1447b8e80941Smrg mali_ptr depth_stencil_afbc_metadata; 1448b8e80941Smrg u32 depth_stencil_afbc_stride; // in units of tiles 1449b8e80941Smrg u32 zero1; 1450b8e80941Smrg 1451b8e80941Smrg mali_ptr depth_stencil; 1452b8e80941Smrg 1453b8e80941Smrg u64 padding; 1454b8e80941Smrg } ds_afbc; 1455b8e80941Smrg 1456b8e80941Smrg struct { 1457b8e80941Smrg /* Depth becomes depth/stencil in case of combined D/S */ 1458b8e80941Smrg mali_ptr depth; 1459b8e80941Smrg u32 depth_stride_zero : 4; 1460b8e80941Smrg u32 depth_stride : 28; 1461b8e80941Smrg u32 zero1; 1462b8e80941Smrg 1463b8e80941Smrg mali_ptr stencil; 1464b8e80941Smrg u32 stencil_stride_zero : 4; 1465b8e80941Smrg u32 stencil_stride : 28; 1466b8e80941Smrg u32 zero2; 1467b8e80941Smrg } ds_linear; 1468b8e80941Smrg }; 1469b8e80941Smrg 1470b8e80941Smrg 1471b8e80941Smrg u64 zero3, zero4; 1472b8e80941Smrg} __attribute__((packed)); 1473b8e80941Smrg 1474b8e80941Smrg/* flags for unk3 */ 1475b8e80941Smrg 1476b8e80941Smrg/* Enables writing depth results back to main memory (rather than keeping them 1477b8e80941Smrg * on-chip in the tile buffer and then discarding) */ 1478b8e80941Smrg 1479b8e80941Smrg#define MALI_MFBD_DEPTH_WRITE (1 << 10) 1480b8e80941Smrg 1481b8e80941Smrg/* The MFBD contains the extra bifrost_fb_extra 
section */ 1482b8e80941Smrg 1483b8e80941Smrg#define MALI_MFBD_EXTRA (1 << 13) 1484b8e80941Smrg 1485b8e80941Smrgstruct bifrost_framebuffer { 1486b8e80941Smrg u32 unk0; // = 0x10 1487b8e80941Smrg 1488b8e80941Smrg u32 unknown2; // = 0x1f, same as SFBD 1489b8e80941Smrg mali_ptr scratchpad; 1490b8e80941Smrg 1491b8e80941Smrg /* 0x10 */ 1492b8e80941Smrg mali_ptr sample_locations; 1493b8e80941Smrg mali_ptr unknown1; 1494b8e80941Smrg /* 0x20 */ 1495b8e80941Smrg u16 width1, height1; 1496b8e80941Smrg u32 zero3; 1497b8e80941Smrg u16 width2, height2; 1498b8e80941Smrg u32 unk1 : 19; // = 0x01000 1499b8e80941Smrg u32 rt_count_1 : 2; // off-by-one (use MALI_POSITIVE) 1500b8e80941Smrg u32 unk2 : 3; // = 0 1501b8e80941Smrg u32 rt_count_2 : 3; // no off-by-one 1502b8e80941Smrg u32 zero4 : 5; 1503b8e80941Smrg /* 0x30 */ 1504b8e80941Smrg u32 clear_stencil : 8; 1505b8e80941Smrg u32 unk3 : 24; // = 0x100 1506b8e80941Smrg float clear_depth; 1507b8e80941Smrg mali_ptr tiler_meta; 1508b8e80941Smrg /* 0x40 */ 1509b8e80941Smrg 1510b8e80941Smrg /* Note: these are guesses! */ 1511b8e80941Smrg mali_ptr tiler_scratch_start; 1512b8e80941Smrg mali_ptr tiler_scratch_middle; 1513b8e80941Smrg 1514b8e80941Smrg /* These are not, since we see symmetry with replay jobs which name these explicitly */ 1515b8e80941Smrg mali_ptr tiler_heap_start; 1516b8e80941Smrg mali_ptr tiler_heap_end; 1517b8e80941Smrg 1518b8e80941Smrg u64 zero9, zero10, zero11, zero12; 1519b8e80941Smrg 1520b8e80941Smrg /* optional: struct bifrost_fb_extra extra */ 1521b8e80941Smrg /* struct bifrost_render_target rts[] */ 1522b8e80941Smrg} __attribute__((packed)); 1523b8e80941Smrg 1524b8e80941Smrg#endif /* __PANFROST_JOB_H__ */ 1525