/* panfrost-job.h revision b8e80941 */
/*
 * © Copyright 2017-2018 Alyssa Rosenzweig
 * © Copyright 2017-2018 Connor Abbott
 * © Copyright 2017-2018 Lyude Paul
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
24 * 25 */ 26 27#ifndef __PANFROST_JOB_H__ 28#define __PANFROST_JOB_H__ 29 30#include <stdint.h> 31#include <panfrost-misc.h> 32 33#define MALI_SHORT_PTR_BITS (sizeof(uintptr_t)*8) 34 35#define MALI_FBD_HIERARCHY_WEIGHTS 8 36 37#define MALI_PAYLOAD_SIZE 256 38 39typedef u32 mali_jd_core_req; 40 41enum mali_job_type { 42 JOB_NOT_STARTED = 0, 43 JOB_TYPE_NULL = 1, 44 JOB_TYPE_SET_VALUE = 2, 45 JOB_TYPE_CACHE_FLUSH = 3, 46 JOB_TYPE_COMPUTE = 4, 47 JOB_TYPE_VERTEX = 5, 48 JOB_TYPE_GEOMETRY = 6, 49 JOB_TYPE_TILER = 7, 50 JOB_TYPE_FUSED = 8, 51 JOB_TYPE_FRAGMENT = 9, 52}; 53 54enum mali_draw_mode { 55 MALI_DRAW_NONE = 0x0, 56 MALI_POINTS = 0x1, 57 MALI_LINES = 0x2, 58 MALI_LINE_STRIP = 0x4, 59 MALI_LINE_LOOP = 0x6, 60 MALI_TRIANGLES = 0x8, 61 MALI_TRIANGLE_STRIP = 0xA, 62 MALI_TRIANGLE_FAN = 0xC, 63 MALI_POLYGON = 0xD, 64 MALI_QUADS = 0xE, 65 MALI_QUAD_STRIP = 0xF, 66 67 /* All other modes invalid */ 68}; 69 70/* Applies to tiler_gl_enables */ 71 72 73#define MALI_OCCLUSION_QUERY (1 << 3) 74#define MALI_OCCLUSION_PRECISE (1 << 4) 75 76#define MALI_FRONT_FACE(v) (v << 5) 77#define MALI_CCW (0) 78#define MALI_CW (1) 79 80#define MALI_CULL_FACE_FRONT (1 << 6) 81#define MALI_CULL_FACE_BACK (1 << 7) 82 83/* TODO: Might this actually be a finer bitfield? */ 84#define MALI_DEPTH_STENCIL_ENABLE 0x6400 85 86#define DS_ENABLE(field) \ 87 (field == MALI_DEPTH_STENCIL_ENABLE) \ 88 ? "MALI_DEPTH_STENCIL_ENABLE" \ 89 : (field == 0) ? "0" \ 90 : "0 /* XXX: Unknown, check hexdump */" 91 92/* Used in stencil and depth tests */ 93 94enum mali_func { 95 MALI_FUNC_NEVER = 0, 96 MALI_FUNC_LESS = 1, 97 MALI_FUNC_EQUAL = 2, 98 MALI_FUNC_LEQUAL = 3, 99 MALI_FUNC_GREATER = 4, 100 MALI_FUNC_NOTEQUAL = 5, 101 MALI_FUNC_GEQUAL = 6, 102 MALI_FUNC_ALWAYS = 7 103}; 104 105/* Same OpenGL, but mixed up. Why? Because forget me, that's why! 
*/ 106 107enum mali_alt_func { 108 MALI_ALT_FUNC_NEVER = 0, 109 MALI_ALT_FUNC_GREATER = 1, 110 MALI_ALT_FUNC_EQUAL = 2, 111 MALI_ALT_FUNC_GEQUAL = 3, 112 MALI_ALT_FUNC_LESS = 4, 113 MALI_ALT_FUNC_NOTEQUAL = 5, 114 MALI_ALT_FUNC_LEQUAL = 6, 115 MALI_ALT_FUNC_ALWAYS = 7 116}; 117 118/* Flags apply to unknown2_3? */ 119 120#define MALI_HAS_MSAA (1 << 0) 121#define MALI_CAN_DISCARD (1 << 5) 122 123/* Applies on SFBD systems, specifying that programmable blending is in use */ 124#define MALI_HAS_BLEND_SHADER (1 << 6) 125 126/* func is mali_func */ 127#define MALI_DEPTH_FUNC(func) (func << 8) 128#define MALI_GET_DEPTH_FUNC(flags) ((flags >> 8) & 0x7) 129#define MALI_DEPTH_FUNC_MASK MALI_DEPTH_FUNC(0x7) 130 131#define MALI_DEPTH_TEST (1 << 11) 132 133/* Next flags to unknown2_4 */ 134#define MALI_STENCIL_TEST (1 << 0) 135 136/* What?! */ 137#define MALI_SAMPLE_ALPHA_TO_COVERAGE_NO_BLEND_SHADER (1 << 1) 138 139#define MALI_NO_DITHER (1 << 9) 140#define MALI_DEPTH_RANGE_A (1 << 12) 141#define MALI_DEPTH_RANGE_B (1 << 13) 142#define MALI_NO_MSAA (1 << 14) 143 144/* Stencil test state is all encoded in a single u32, just with a lot of 145 * enums... 
*/ 146 147enum mali_stencil_op { 148 MALI_STENCIL_KEEP = 0, 149 MALI_STENCIL_REPLACE = 1, 150 MALI_STENCIL_ZERO = 2, 151 MALI_STENCIL_INVERT = 3, 152 MALI_STENCIL_INCR_WRAP = 4, 153 MALI_STENCIL_DECR_WRAP = 5, 154 MALI_STENCIL_INCR = 6, 155 MALI_STENCIL_DECR = 7 156}; 157 158struct mali_stencil_test { 159 unsigned ref : 8; 160 unsigned mask : 8; 161 enum mali_func func : 3; 162 enum mali_stencil_op sfail : 3; 163 enum mali_stencil_op dpfail : 3; 164 enum mali_stencil_op dppass : 3; 165 unsigned zero : 4; 166} __attribute__((packed)); 167 168/* Blending is a mess, since anything fancy triggers a blend shader, and 169 * -those- are not understood whatsover yet */ 170 171#define MALI_MASK_R (1 << 0) 172#define MALI_MASK_G (1 << 1) 173#define MALI_MASK_B (1 << 2) 174#define MALI_MASK_A (1 << 3) 175 176enum mali_nondominant_mode { 177 MALI_BLEND_NON_MIRROR = 0, 178 MALI_BLEND_NON_ZERO = 1 179}; 180 181enum mali_dominant_blend { 182 MALI_BLEND_DOM_SOURCE = 0, 183 MALI_BLEND_DOM_DESTINATION = 1 184}; 185 186enum mali_dominant_factor { 187 MALI_DOMINANT_UNK0 = 0, 188 MALI_DOMINANT_ZERO = 1, 189 MALI_DOMINANT_SRC_COLOR = 2, 190 MALI_DOMINANT_DST_COLOR = 3, 191 MALI_DOMINANT_UNK4 = 4, 192 MALI_DOMINANT_SRC_ALPHA = 5, 193 MALI_DOMINANT_DST_ALPHA = 6, 194 MALI_DOMINANT_CONSTANT = 7, 195}; 196 197enum mali_blend_modifier { 198 MALI_BLEND_MOD_UNK0 = 0, 199 MALI_BLEND_MOD_NORMAL = 1, 200 MALI_BLEND_MOD_SOURCE_ONE = 2, 201 MALI_BLEND_MOD_DEST_ONE = 3, 202}; 203 204struct mali_blend_mode { 205 enum mali_blend_modifier clip_modifier : 2; 206 unsigned unused_0 : 1; 207 unsigned negate_source : 1; 208 209 enum mali_dominant_blend dominant : 1; 210 211 enum mali_nondominant_mode nondominant_mode : 1; 212 213 unsigned unused_1 : 1; 214 215 unsigned negate_dest : 1; 216 217 enum mali_dominant_factor dominant_factor : 3; 218 unsigned complement_dominant : 1; 219} __attribute__((packed)); 220 221struct mali_blend_equation { 222 /* Of type mali_blend_mode */ 223 unsigned rgb_mode : 12; 224 
unsigned alpha_mode : 12; 225 226 unsigned zero1 : 4; 227 228 /* Corresponds to MALI_MASK_* above and glColorMask arguments */ 229 230 unsigned color_mask : 4; 231 232 /* Attached constant for CONSTANT_ALPHA, etc */ 233 234#ifndef BIFROST 235 float constant; 236#endif 237} __attribute__((packed)); 238 239/* Used with channel swizzling */ 240enum mali_channel { 241 MALI_CHANNEL_RED = 0, 242 MALI_CHANNEL_GREEN = 1, 243 MALI_CHANNEL_BLUE = 2, 244 MALI_CHANNEL_ALPHA = 3, 245 MALI_CHANNEL_ZERO = 4, 246 MALI_CHANNEL_ONE = 5, 247 MALI_CHANNEL_RESERVED_0 = 6, 248 MALI_CHANNEL_RESERVED_1 = 7, 249}; 250 251struct mali_channel_swizzle { 252 enum mali_channel r : 3; 253 enum mali_channel g : 3; 254 enum mali_channel b : 3; 255 enum mali_channel a : 3; 256} __attribute__((packed)); 257 258/* Compressed per-pixel formats. Each of these formats expands to one to four 259 * floating-point or integer numbers, as defined by the OpenGL specification. 260 * There are various places in OpenGL where the user can specify a compressed 261 * format in memory, which all use the same 8-bit enum in the various 262 * descriptors, although different hardware units support different formats. 263 */ 264 265/* The top 3 bits specify how the bits of each component are interpreted. */ 266 267/* e.g. R11F_G11F_B10F */ 268#define MALI_FORMAT_SPECIAL (2 << 5) 269 270/* signed normalized, e.g. RGBA8_SNORM */ 271#define MALI_FORMAT_SNORM (3 << 5) 272 273/* e.g. RGBA8UI */ 274#define MALI_FORMAT_UINT (4 << 5) 275 276/* e.g. RGBA8 and RGBA32F */ 277#define MALI_FORMAT_UNORM (5 << 5) 278 279/* e.g. RGBA8I and RGBA16F */ 280#define MALI_FORMAT_SINT (6 << 5) 281 282/* These formats seem to largely duplicate the others. They're used at least 283 * for Bifrost framebuffer output. 284 */ 285#define MALI_FORMAT_SPECIAL2 (7 << 5) 286 287/* If the high 3 bits are 3 to 6 these two bits say how many components 288 * there are. 
289 */ 290#define MALI_NR_CHANNELS(n) ((n - 1) << 3) 291 292/* If the high 3 bits are 3 to 6, then the low 3 bits say how big each 293 * component is, except the special MALI_CHANNEL_FLOAT which overrides what the 294 * bits mean. 295 */ 296 297#define MALI_CHANNEL_4 2 298 299#define MALI_CHANNEL_8 3 300 301#define MALI_CHANNEL_16 4 302 303#define MALI_CHANNEL_32 5 304 305/* For MALI_FORMAT_SINT it means a half-float (e.g. RG16F). For 306 * MALI_FORMAT_UNORM, it means a 32-bit float. 307 */ 308#define MALI_CHANNEL_FLOAT 7 309 310enum mali_format { 311 MALI_RGB565 = MALI_FORMAT_SPECIAL | 0x0, 312 MALI_RGB5_A1_UNORM = MALI_FORMAT_SPECIAL | 0x2, 313 MALI_RGB10_A2_UNORM = MALI_FORMAT_SPECIAL | 0x3, 314 MALI_RGB10_A2_SNORM = MALI_FORMAT_SPECIAL | 0x5, 315 MALI_RGB10_A2UI = MALI_FORMAT_SPECIAL | 0x7, 316 MALI_RGB10_A2I = MALI_FORMAT_SPECIAL | 0x9, 317 318 /* YUV formats */ 319 MALI_NV12 = MALI_FORMAT_SPECIAL | 0xc, 320 321 MALI_Z32_UNORM = MALI_FORMAT_SPECIAL | 0xD, 322 MALI_R32_FIXED = MALI_FORMAT_SPECIAL | 0x11, 323 MALI_RG32_FIXED = MALI_FORMAT_SPECIAL | 0x12, 324 MALI_RGB32_FIXED = MALI_FORMAT_SPECIAL | 0x13, 325 MALI_RGBA32_FIXED = MALI_FORMAT_SPECIAL | 0x14, 326 MALI_R11F_G11F_B10F = MALI_FORMAT_SPECIAL | 0x19, 327 /* Only used for varyings, to indicate the transformed gl_Position */ 328 MALI_VARYING_POS = MALI_FORMAT_SPECIAL | 0x1e, 329 /* Only used for varyings, to indicate that the write should be 330 * discarded. 
331 */ 332 MALI_VARYING_DISCARD = MALI_FORMAT_SPECIAL | 0x1f, 333 334 MALI_R8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, 335 MALI_R16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, 336 MALI_R32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, 337 MALI_RG8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, 338 MALI_RG16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, 339 MALI_RG32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, 340 MALI_RGB8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8, 341 MALI_RGB16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, 342 MALI_RGB32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32, 343 MALI_RGBA8_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, 344 MALI_RGBA16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, 345 MALI_RGBA32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, 346 347 MALI_R8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, 348 MALI_R16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, 349 MALI_R32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, 350 MALI_RG8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, 351 MALI_RG16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, 352 MALI_RG32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, 353 MALI_RGB8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8, 354 MALI_RGB16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, 355 MALI_RGB32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32, 356 MALI_RGBA8UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, 357 MALI_RGBA16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, 358 MALI_RGBA32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, 359 360 MALI_R8_UNORM = MALI_FORMAT_UNORM | 
MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, 361 MALI_R16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, 362 MALI_R32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, 363 MALI_R32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT, 364 MALI_RG8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, 365 MALI_RG16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, 366 MALI_RG32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, 367 MALI_RG32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT, 368 MALI_RGB8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8, 369 MALI_RGB16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, 370 MALI_RGB32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32, 371 MALI_RGB32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT, 372 MALI_RGBA4_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_4, 373 MALI_RGBA8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, 374 MALI_RGBA16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, 375 MALI_RGBA32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, 376 MALI_RGBA32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT, 377 378 MALI_R8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8, 379 MALI_R16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16, 380 MALI_R32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32, 381 MALI_R16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT, 382 MALI_RG8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8, 383 MALI_RG16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16, 384 MALI_RG32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32, 385 MALI_RG16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT, 386 MALI_RGB8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | 
MALI_CHANNEL_8, 387 MALI_RGB16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16, 388 MALI_RGB32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32, 389 MALI_RGB16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT, 390 MALI_RGBA8I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8, 391 MALI_RGBA16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16, 392 MALI_RGBA32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32, 393 MALI_RGBA16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT, 394 395 MALI_RGBA4 = MALI_FORMAT_SPECIAL2 | 0x8, 396 MALI_RGBA8_2 = MALI_FORMAT_SPECIAL2 | 0xd, 397 MALI_RGB10_A2_2 = MALI_FORMAT_SPECIAL2 | 0xe, 398}; 399 400 401/* Alpha coverage is encoded as 4-bits (from a clampf), with inversion 402 * literally performing a bitwise invert. This function produces slightly wrong 403 * results and I'm not sure why; some rounding issue I suppose... */ 404 405#define MALI_ALPHA_COVERAGE(clampf) ((uint16_t) (int) (clampf * 15.0f)) 406#define MALI_GET_ALPHA_COVERAGE(nibble) ((float) nibble / 15.0f) 407 408/* Applies to unknown1 */ 409#define MALI_NO_ALPHA_TO_COVERAGE (1 << 10) 410 411/* Flags denoting the fragment shader's use of tilebuffer readback. If the 412 * shader might read any part of the tilebuffer, set MALI_READS_TILEBUFFER. If 413 * it might read depth/stencil in particular, also set MALI_READS_ZS */ 414 415#define MALI_READS_ZS (1 << 12) 416#define MALI_READS_TILEBUFFER (1 << 16) 417 418struct mali_blend_meta { 419#ifndef BIFROST 420 /* Base value of 0x200. 421 * OR with 0x1 for blending (anything other than REPLACE). 
422 * OR with 0x2 for programmable blending 423 */ 424 425 u64 unk1; 426 427 union { 428 struct mali_blend_equation blend_equation_1; 429 mali_ptr blend_shader; 430 }; 431 432 u64 zero2; 433 struct mali_blend_equation blend_equation_2; 434#else 435 u32 unk1; // = 0x200 436 struct mali_blend_equation blend_equation; 437 /* 438 * - 0x19 normally 439 * - 0x3 when this slot is unused (everything else is 0 except the index) 440 * - 0x11 when this is the fourth slot (and it's used) 441+ * - 0 when there is a blend shader 442 */ 443 u16 unk2; 444 /* increments from 0 to 3 */ 445 u16 index; 446 447 union { 448 struct { 449 /* So far, I've only seen: 450 * - R001 for 1-component formats 451 * - RG01 for 2-component formats 452 * - RGB1 for 3-component formats 453 * - RGBA for 4-component formats 454 */ 455 u32 swizzle : 12; 456 enum mali_format format : 8; 457 458 /* Type of the shader output variable. Note, this can 459 * be different from the format. 460 * 461 * 0: f16 (mediump float) 462 * 1: f32 (highp float) 463 * 2: i32 (highp int) 464 * 3: u32 (highp uint) 465 * 4: i16 (mediump int) 466 * 5: u16 (mediump uint) 467 */ 468 u32 shader_type : 3; 469 u32 zero : 9; 470 }; 471 472 /* Only the low 32 bits of the blend shader are stored, the 473 * high 32 bits are implicitly the same as the original shader. 474 * According to the kernel driver, the program counter for 475 * shaders is actually only 24 bits, so shaders cannot cross 476 * the 2^24-byte boundary, and neither can the blend shader. 477 * The blob handles this by allocating a 2^24 byte pool for 478 * shaders, and making sure that any blend shaders are stored 479 * in the same pool as the original shader. The kernel will 480 * make sure this allocation is aligned to 2^24 bytes. 
481 */ 482 u32 blend_shader; 483 }; 484#endif 485} __attribute__((packed)); 486 487struct mali_shader_meta { 488 mali_ptr shader; 489 u16 texture_count; 490 u16 sampler_count; 491 u16 attribute_count; 492 u16 varying_count; 493 494 union { 495 struct { 496 u32 uniform_buffer_count : 4; 497 u32 unk1 : 28; // = 0x800000 for vertex, 0x958020 for tiler 498 } bifrost1; 499 struct { 500 /* 0x200 except MALI_NO_ALPHA_TO_COVERAGE. Mysterious 1 501 * other times. Who knows really? */ 502 u16 unknown1; 503 504 /* Whole number of uniform registers used, times two; 505 * whole number of work registers used (no scale). 506 */ 507 unsigned work_count : 5; 508 unsigned uniform_count : 5; 509 unsigned unknown2 : 6; 510 } midgard1; 511 }; 512 513 /* On bifrost: Exactly the same as glPolygonOffset() for both. 514 * On midgard: Depth factor is exactly as passed to glPolygonOffset. 515 * Depth units is equal to the value passed to glDeptOhffset + 1.0f 516 * (use MALI_NEGATIVE) 517 */ 518 float depth_units; 519 float depth_factor; 520 521 u32 unknown2_2; 522 523 u16 alpha_coverage; 524 u16 unknown2_3; 525 526 u8 stencil_mask_front; 527 u8 stencil_mask_back; 528 u16 unknown2_4; 529 530 struct mali_stencil_test stencil_front; 531 struct mali_stencil_test stencil_back; 532 533 union { 534 struct { 535 u32 unk3 : 7; 536 /* On Bifrost, some system values are preloaded in 537 * registers R55-R62 by the thread dispatcher prior to 538 * the start of shader execution. This is a bitfield 539 * with one entry for each register saying which 540 * registers need to be preloaded. 
Right now, the known 541 * values are: 542 * 543 * Vertex/compute: 544 * - R55 : gl_LocalInvocationID.xy 545 * - R56 : gl_LocalInvocationID.z + unknown in high 16 bits 546 * - R57 : gl_WorkGroupID.x 547 * - R58 : gl_WorkGroupID.y 548 * - R59 : gl_WorkGroupID.z 549 * - R60 : gl_GlobalInvocationID.x 550 * - R61 : gl_GlobalInvocationID.y/gl_VertexID (without base) 551 * - R62 : gl_GlobalInvocationID.z/gl_InstanceID (without base) 552 * 553 * Fragment: 554 * - R55 : unknown, never seen (but the bit for this is 555 * always set?) 556 * - R56 : unknown (bit always unset) 557 * - R57 : gl_PrimitiveID 558 * - R58 : gl_FrontFacing in low bit, potentially other stuff 559 * - R59 : u16 fragment coordinates (used to compute 560 * gl_FragCoord.xy, together with sample positions) 561 * - R60 : gl_SampleMask (used in epilog, so pretty 562 * much always used, but the bit is always 0 -- is 563 * this just always pushed?) 564 * - R61 : gl_SampleMaskIn and gl_SampleID, used by 565 * varying interpolation. 566 * - R62 : unknown (bit always unset). 567 */ 568 u32 preload_regs : 8; 569 /* In units of 8 bytes or 64 bits, since the 570 * uniform/const port loads 64 bits at a time. 571 */ 572 u32 uniform_count : 7; 573 u32 unk4 : 10; // = 2 574 } bifrost2; 575 struct { 576 u32 unknown2_7; 577 } midgard2; 578 }; 579 580 /* zero on bifrost */ 581 u32 unknown2_8; 582 583 /* Blending information for the older non-MRT Midgard HW. Check for 584 * MALI_HAS_BLEND_SHADER to decide how to interpret. 585 */ 586 587 union { 588 mali_ptr blend_shader; 589 struct mali_blend_equation blend_equation; 590 }; 591 592 /* There can be up to 4 blend_meta's. None of them are required for 593 * vertex shaders or the non-MRT case for Midgard (so the blob doesn't 594 * allocate any space). 
595 */ 596 struct mali_blend_meta blend_meta[]; 597 598} __attribute__((packed)); 599 600/* This only concerns hardware jobs */ 601 602/* Possible values for job_descriptor_size */ 603 604#define MALI_JOB_32 0 605#define MALI_JOB_64 1 606 607struct mali_job_descriptor_header { 608 u32 exception_status; 609 u32 first_incomplete_task; 610 u64 fault_pointer; 611 u8 job_descriptor_size : 1; 612 enum mali_job_type job_type : 7; 613 u8 job_barrier : 1; 614 u8 unknown_flags : 7; 615 u16 job_index; 616 u16 job_dependency_index_1; 617 u16 job_dependency_index_2; 618 619 union { 620 u64 next_job_64; 621 u32 next_job_32; 622 }; 623} __attribute__((packed)); 624 625struct mali_payload_set_value { 626 u64 out; 627 u64 unknown; 628} __attribute__((packed)); 629 630/* Special attributes have a fixed index */ 631#define MALI_SPECIAL_ATTRIBUTE_BASE 16 632#define MALI_VERTEX_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 0) 633#define MALI_INSTANCE_ID (MALI_SPECIAL_ATTRIBUTE_BASE + 1) 634 635/* 636 * Mali Attributes 637 * 638 * This structure lets the attribute unit compute the address of an attribute 639 * given the vertex and instance ID. Unfortunately, the way this works is 640 * rather complicated when instancing is enabled. 641 * 642 * To explain this, first we need to explain how compute and vertex threads are 643 * dispatched. This is a guess (although a pretty firm guess!) since the 644 * details are mostly hidden from the driver, except for attribute instancing. 645 * When a quad is dispatched, it receives a single, linear index. However, we 646 * need to translate that index into a (vertex id, instance id) pair, or a 647 * (local id x, local id y, local id z) triple for compute shaders (although 648 * vertex shaders and compute shaders are handled almost identically). 
 * Focusing on vertex shaders, one option would be to do:
 *
 * vertex_id = linear_id % num_vertices
 * instance_id = linear_id / num_vertices
 *
 * but this involves a costly division and modulus by an arbitrary number.
 * Instead, we could pad num_vertices. We dispatch padded_num_vertices *
 * num_instances threads instead of num_vertices * num_instances, which results
 * in some "extra" threads with vertex_id >= num_vertices, which we have to
 * discard. The more we pad num_vertices, the more "wasted" threads we
 * dispatch, but the division is potentially easier.
 *
 * One straightforward choice is to pad num_vertices to the next power of two,
 * which means that the division and modulus are just simple bit shifts and
 * masking. But the actual algorithm is a bit more complicated. The thread
 * dispatcher has special support for dividing by 3, 5, 7, and 9, in addition
 * to dividing by a power of two. This is possibly using the technique
 * described in patent US20170010862A1. As a result, padded_num_vertices can be
 * 1, 3, 5, 7, or 9 times a power of two. This results in fewer wasted threads,
 * since we need less padding.
 *
 * padded_num_vertices is picked by the hardware. The driver just specifies the
 * actual number of vertices. At least for Mali G71, the first few cases are
 * given by:
 *
 * num_vertices | padded_num_vertices
 * 3            | 4
 * 4-7          | 8
 * 8-11         | 12 (3 * 4)
 * 12-15        | 16
 * 16-19        | 20 (5 * 4)
 *
 * Note that padded_num_vertices is a multiple of four (presumably because
 * threads are dispatched in groups of 4). Also, padded_num_vertices is always
 * at least one more than num_vertices, which seems like a quirk of the
 * hardware. For larger num_vertices, the hardware uses the following
 * algorithm: using the binary representation of num_vertices, we look at the
 * most significant set bit as well as the following 3 bits. Let n be the
 * number of bits after those 4 bits. Then we set padded_num_vertices according
 * to the following table:
 *
 * high bits | padded_num_vertices
 * 1000      | 9 * 2^n
 * 1001      | 5 * 2^(n+1)
 * 101x      | 3 * 2^(n+2)
 * 110x      | 7 * 2^(n+1)
 * 111x      | 2^(n+4)
 *
 * For example, if num_vertices = 70 is passed to glDraw(), its binary
 * representation is 1000110, so n = 3 and the high bits are 1000, and
 * therefore padded_num_vertices = 9 * 2^3 = 72.
 *
 * The attribute unit works in terms of the original linear_id. If
 * num_instances = 1, then they are the same, and everything is simple.
 * However, with instancing things get more complicated. There are four
 * possible modes, two of them we can group together:
 *
 * 1. Use the linear_id directly. Only used when there is no instancing.
 *
 * 2. Use the linear_id modulo a constant. This is used for per-vertex
 * attributes with instancing enabled by making the constant equal
 * padded_num_vertices. Because the modulus is always padded_num_vertices, this
 * mode only supports a modulus that is a power of 2 times 1, 3, 5, 7, or 9.
 * The shift field specifies the power of two, while the extra_flags field
 * specifies the odd number. If shift = n and extra_flags = m, then the modulus
 * is (2m + 1) * 2^n. As an example, if num_vertices = 70, then as computed
 * above, padded_num_vertices = 9 * 2^3, so we should set extra_flags = 4 and
 * shift = 3. Note that we must exactly follow the hardware algorithm used to
 * get padded_num_vertices in order to correctly implement per-vertex
 * attributes.
 *
 * 3. Divide the linear_id by a constant. In order to correctly implement
 * instance divisors, we have to divide linear_id by padded_num_vertices times
 * the user-specified divisor. So first we compute padded_num_vertices, again
 * following the exact same algorithm that the hardware uses, then multiply it
 * by the GL-level divisor to get the hardware-level divisor. This case is
 * further divided into two more cases. If the hardware-level divisor is a
 * power of two, then we just need to shift. The shift amount is specified by
 * the shift field, so that the hardware-level divisor is just 2^shift.
 *
 * If it isn't a power of two, then we have to divide by an arbitrary integer.
 * For that, we use the well-known technique of multiplying by an approximation
 * of the inverse. The driver must compute the magic multiplier and shift
 * amount, and then the hardware does the multiplication and shift. The
 * hardware and driver also use the "round-down" optimization as described in
 * http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf.
 * The hardware further assumes the multiplier is between 2^31 and 2^32, so the
 * high bit is implicitly set to 1 even though it is set to 0 by the driver --
 * presumably this simplifies the hardware multiplier a little. The hardware
 * first multiplies linear_id by the multiplier and takes the high 32 bits,
 * then applies the round-down correction if extra_flags = 1, then finally
 * shifts right by the shift field.
 *
 * There are some differences between ridiculousfish's algorithm and the Mali
 * hardware algorithm, which means that the reference code from ridiculousfish
 * doesn't always produce the right constants. Mali does not use the pre-shift
 * optimization, since that would make a hardware implementation slower (it
 * would have to always do the pre-shift, multiply, and post-shift operations).
 * It also forces the multiplier to be at least 2^31, which means that the
 * exponent is entirely fixed, so there is no trial-and-error.
Altogether, 749 * given the divisor d, the algorithm the driver must follow is: 750 * 751 * 1. Set shift = floor(log2(d)). 752 * 2. Compute m = ceil(2^(shift + 32) / d) and e = 2^(shift + 32) % d. 753 * 3. If e <= 2^shift, then we need to use the round-down algorithm. Set 754 * magic_divisor = m - 1 and extra_flags = 1. 755 * 4. Otherwise, set magic_divisor = m and extra_flags = 0. 756 */ 757 758enum mali_attr_mode { 759 MALI_ATTR_UNUSED = 0, 760 MALI_ATTR_LINEAR = 1, 761 MALI_ATTR_POT_DIVIDE = 2, 762 MALI_ATTR_MODULO = 3, 763 MALI_ATTR_NPOT_DIVIDE = 4, 764}; 765 766/* This magic "pseudo-address" is used as `elements` to implement 767 * gl_PointCoord. When read from a fragment shader, it generates a point 768 * coordinate per the OpenGL ES 2.0 specification. Flipped coordinate spaces 769 * require an affine transformation in the shader. */ 770 771#define MALI_VARYING_POINT_COORD (0x60) 772 773union mali_attr { 774 /* This is used for actual attributes. */ 775 struct { 776 /* The bottom 3 bits are the mode */ 777 mali_ptr elements : 64 - 8; 778 u32 shift : 5; 779 u32 extra_flags : 3; 780 u32 stride; 781 u32 size; 782 }; 783 /* The entry after an NPOT_DIVIDE entry has this format. It stores 784 * extra information that wouldn't fit in a normal entry. 785 */ 786 struct { 787 u32 unk; /* = 0x20 */ 788 u32 magic_divisor; 789 u32 zero; 790 /* This is the original, GL-level divisor. 
*/ 791 u32 divisor; 792 }; 793} __attribute__((packed)); 794 795struct mali_attr_meta { 796 /* Vertex buffer index */ 797 u8 index; 798 799 unsigned unknown1 : 2; 800 unsigned swizzle : 12; 801 enum mali_format format : 8; 802 803 /* Always observed to be zero at the moment */ 804 unsigned unknown3 : 2; 805 806 /* When packing multiple attributes in a buffer, offset addresses by this value */ 807 uint32_t src_offset; 808} __attribute__((packed)); 809 810enum mali_fbd_type { 811 MALI_SFBD = 0, 812 MALI_MFBD = 1, 813}; 814 815#define FBD_TYPE (1) 816#define FBD_MASK (~0x3f) 817 818struct mali_uniform_buffer_meta { 819 /* This is actually the size minus 1 (MALI_POSITIVE), in units of 16 820 * bytes. This gives a maximum of 2^14 bytes, which just so happens to 821 * be the GL minimum-maximum for GL_MAX_UNIFORM_BLOCK_SIZE. 822 */ 823 u64 size : 10; 824 825 /* This is missing the bottom 2 bits and top 8 bits. The top 8 bits 826 * should be 0 for userspace pointers, according to 827 * https://lwn.net/Articles/718895/. By reusing these bits, we can make 828 * each entry in the table only 64 bits. 829 */ 830 mali_ptr ptr : 64 - 10; 831}; 832 833/* On Bifrost, these fields are the same between the vertex and tiler payloads. 834 * They also seem to be the same between Bifrost and Midgard. They're shared in 835 * fused payloads. 
836 */ 837 838/* Applies to unknown_draw */ 839 840#define MALI_DRAW_INDEXED_UINT8 (0x10) 841#define MALI_DRAW_INDEXED_UINT16 (0x20) 842#define MALI_DRAW_INDEXED_UINT32 (0x30) 843#define MALI_DRAW_VARYING_SIZE (0x100) 844#define MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX (0x10000) 845 846struct mali_vertex_tiler_prefix { 847 /* This is a dynamic bitfield containing the following things in this order: 848 * 849 * - gl_WorkGroupSize.x 850 * - gl_WorkGroupSize.y 851 * - gl_WorkGroupSize.z 852 * - gl_NumWorkGroups.x 853 * - gl_NumWorkGroups.y 854 * - gl_NumWorkGroups.z 855 * 856 * The number of bits allocated for each number is based on the *_shift 857 * fields below. For example, workgroups_y_shift gives the bit that 858 * gl_NumWorkGroups.y starts at, and workgroups_z_shift gives the bit 859 * that gl_NumWorkGroups.z starts at (and therefore one after the bit 860 * that gl_NumWorkGroups.y ends at). The actual value for each gl_* 861 * value is one more than the stored value, since if any of the values 862 * are zero, then there would be no invocations (and hence no job). If 863 * there were 0 bits allocated to a given field, then it must be zero, 864 * and hence the real value is one. 865 * 866 * Vertex jobs reuse the same job dispatch mechanism as compute jobs, 867 * effectively doing glDispatchCompute(1, vertex_count, instance_count) 868 * where vertex count is the number of vertices. 869 */ 870 u32 invocation_count; 871 872 u32 size_y_shift : 5; 873 u32 size_z_shift : 5; 874 u32 workgroups_x_shift : 6; 875 u32 workgroups_y_shift : 6; 876 u32 workgroups_z_shift : 6; 877 /* This is max(workgroups_x_shift, 2) in all the cases I've seen. */ 878 u32 workgroups_x_shift_2 : 4; 879 880 u32 draw_mode : 4; 881 u32 unknown_draw : 22; 882 883 /* This is the the same as workgroups_x_shift_2 in compute shaders, but 884 * always 5 for vertex jobs and 6 for tiler jobs. 
I suspect this has 885 * something to do with how many quads get put in the same execution 886 * engine, which is a balance (you don't want to starve the engine, but 887 * you also want to distribute work evenly). 888 */ 889 u32 workgroups_x_shift_3 : 6; 890 891 892 /* Negative of draw_start for TILER jobs from what I've seen */ 893 int32_t negative_start; 894 u32 zero1; 895 896 /* Like many other strictly nonzero quantities, index_count is 897 * subtracted by one. For an indexed cube, this is equal to 35 = 6 898 * faces * 2 triangles/per face * 3 vertices/per triangle - 1. That is, 899 * for an indexed draw, index_count is the number of actual vertices 900 * rendered whereas invocation_count is the number of unique vertices 901 * rendered (the number of times the vertex shader must be invoked). 902 * For non-indexed draws, this is just equal to invocation_count. */ 903 904 u32 index_count; 905 906 /* No hidden structure; literally just a pointer to an array of uint 907 * indices (width depends on flags). Thanks, guys, for not making my 908 * life insane for once! NULL for non-indexed draws. */ 909 910 uintptr_t indices; 911} __attribute__((packed)); 912 913/* Point size / line width can either be specified as a 32-bit float (for 914 * constant size) or as a [machine word size]-bit GPU pointer (for varying size). If a pointer 915 * is selected, by setting the appropriate MALI_DRAW_VARYING_SIZE bit in the tiler 916 * payload, the contents of varying_pointer will be intepreted as an array of 917 * fp16 sizes, one for each vertex. gl_PointSize is therefore implemented by 918 * creating a special MALI_R16F varying writing to varying_pointer. */ 919 920union midgard_primitive_size { 921 float constant; 922 uintptr_t pointer; 923}; 924 925struct bifrost_vertex_only { 926 u32 unk2; /* =0x2 */ 927 928 u32 zero0; 929 930 u64 zero1; 931} __attribute__((packed)); 932 933struct bifrost_tiler_heap_meta { 934 u32 zero; 935 u32 heap_size; 936 /* note: these are just guesses! 
*/ 937 mali_ptr tiler_heap_start; 938 mali_ptr tiler_heap_free; 939 mali_ptr tiler_heap_end; 940 941 /* hierarchy weights? but they're still 0 after the job has run... */ 942 u32 zeros[12]; 943} __attribute__((packed)); 944 945struct bifrost_tiler_meta { 946 u64 zero0; 947 u32 unk; // = 0xf0 948 u16 width; 949 u16 height; 950 u64 zero1; 951 mali_ptr tiler_heap_meta; 952 /* TODO what is this used for? */ 953 u64 zeros[20]; 954} __attribute__((packed)); 955 956struct bifrost_tiler_only { 957 /* 0x20 */ 958 union midgard_primitive_size primitive_size; 959 960 mali_ptr tiler_meta; 961 962 u64 zero1, zero2, zero3, zero4, zero5, zero6; 963 964 u32 gl_enables; 965 u32 zero7; 966 u64 zero8; 967} __attribute__((packed)); 968 969struct bifrost_scratchpad { 970 u32 zero; 971 u32 flags; // = 0x1f 972 /* This is a pointer to a CPU-inaccessible buffer, 16 pages, allocated 973 * during startup. It seems to serve the same purpose as the 974 * gpu_scratchpad in the SFBD for Midgard, although it's slightly 975 * larger. 976 */ 977 mali_ptr gpu_scratchpad; 978} __attribute__((packed)); 979 980struct mali_vertex_tiler_postfix { 981 /* Zero for vertex jobs. Pointer to the position (gl_Position) varying 982 * output from the vertex shader for tiler jobs. 983 */ 984 985 uintptr_t position_varying; 986 987 /* An array of mali_uniform_buffer_meta's. The size is given by the 988 * shader_meta. 989 */ 990 uintptr_t uniform_buffers; 991 992 /* This is a pointer to an array of pointers to the texture 993 * descriptors, number of pointers bounded by number of textures. The 994 * indirection is needed to accomodate varying numbers and sizes of 995 * texture descriptors */ 996 uintptr_t texture_trampoline; 997 998 /* For OpenGL, from what I've seen, this is intimately connected to 999 * texture_meta. cwabbott says this is not the case under Vulkan, hence 1000 * why this field is seperate (Midgard is Vulkan capable). 
Pointer to 1001 * array of sampler descriptors (which are uniform in size) */ 1002 uintptr_t sampler_descriptor; 1003 1004 uintptr_t uniforms; 1005 u8 flags : 4; 1006 uintptr_t _shader_upper : MALI_SHORT_PTR_BITS - 4; /* struct shader_meta */ 1007 uintptr_t attributes; /* struct attribute_buffer[] */ 1008 uintptr_t attribute_meta; /* attribute_meta[] */ 1009 uintptr_t varyings; /* struct attr */ 1010 uintptr_t varying_meta; /* pointer */ 1011 uintptr_t viewport; 1012 uintptr_t occlusion_counter; /* A single bit as far as I can tell */ 1013 1014 /* Note: on Bifrost, this isn't actually the FBD. It points to 1015 * bifrost_scratchpad instead. However, it does point to the same thing 1016 * in vertex and tiler jobs. 1017 */ 1018 mali_ptr framebuffer; 1019 1020#ifdef __LP64__ 1021#ifdef BIFROST 1022 /* most likely padding to make this a multiple of 64 bytes */ 1023 u64 zero7; 1024#endif 1025#endif 1026} __attribute__((packed)); 1027 1028struct midgard_payload_vertex_tiler { 1029#ifndef __LP64__ 1030 union midgard_primitive_size primitive_size; 1031#endif 1032 1033 struct mali_vertex_tiler_prefix prefix; 1034 1035#ifndef __LP64__ 1036 u32 zero3; 1037#endif 1038 1039 u32 gl_enables; // 0x5 1040 1041 /* Offset for first vertex in buffer */ 1042 u32 draw_start; 1043 1044 uintptr_t zero5; 1045 1046 struct mali_vertex_tiler_postfix postfix; 1047 1048#ifdef __LP64__ 1049 union midgard_primitive_size primitive_size; 1050#endif 1051} __attribute__((packed)); 1052 1053struct bifrost_payload_vertex { 1054 struct mali_vertex_tiler_prefix prefix; 1055 struct bifrost_vertex_only vertex; 1056 struct mali_vertex_tiler_postfix postfix; 1057} __attribute__((packed)); 1058 1059struct bifrost_payload_tiler { 1060 struct mali_vertex_tiler_prefix prefix; 1061 struct bifrost_tiler_only tiler; 1062 struct mali_vertex_tiler_postfix postfix; 1063} __attribute__((packed)); 1064 1065struct bifrost_payload_fused { 1066 struct mali_vertex_tiler_prefix prefix; 1067 struct bifrost_tiler_only tiler; 
1068 struct mali_vertex_tiler_postfix tiler_postfix; 1069 struct bifrost_vertex_only vertex; 1070 struct mali_vertex_tiler_postfix vertex_postfix; 1071} __attribute__((packed)); 1072 1073/* Pointed to from texture_trampoline, mostly unknown still, haven't 1074 * managed to replay successfully */ 1075 1076/* Purposeful off-by-one in width, height fields. For example, a (64, 64) 1077 * texture is stored as (63, 63) in these fields. This adjusts for that. 1078 * There's an identical pattern in the framebuffer descriptor. Even vertex 1079 * count fields work this way, hence the generic name -- integral fields that 1080 * are strictly positive generally need this adjustment. */ 1081 1082#define MALI_POSITIVE(dim) (dim - 1) 1083 1084/* Opposite of MALI_POSITIVE, found in the depth_units field */ 1085 1086#define MALI_NEGATIVE(dim) (dim + 1) 1087 1088/* Used with wrapping. Incomplete (this is a 4-bit field...) */ 1089 1090enum mali_wrap_mode { 1091 MALI_WRAP_REPEAT = 0x8, 1092 MALI_WRAP_CLAMP_TO_EDGE = 0x9, 1093 MALI_WRAP_CLAMP_TO_BORDER = 0xB, 1094 MALI_WRAP_MIRRORED_REPEAT = 0xC 1095}; 1096 1097/* 8192x8192 */ 1098#define MAX_MIP_LEVELS (13) 1099 1100/* Cubemap bloats everything up */ 1101#define MAX_FACES (6) 1102 1103/* Corresponds to the type passed to glTexImage2D and so forth */ 1104 1105struct mali_texture_format { 1106 unsigned swizzle : 12; 1107 enum mali_format format : 8; 1108 1109 unsigned usage1 : 3; 1110 unsigned is_not_cubemap : 1; 1111 unsigned usage2 : 8; 1112} __attribute__((packed)); 1113 1114struct mali_texture_descriptor { 1115 uint16_t width; 1116 uint16_t height; 1117 uint16_t depth; 1118 1119 uint16_t unknown1; 1120 1121 struct mali_texture_format format; 1122 1123 uint16_t unknown3; 1124 1125 /* One for non-mipmapped, zero for mipmapped */ 1126 uint8_t unknown3A; 1127 1128 /* Zero for non-mipmapped, (number of levels - 1) for mipmapped */ 1129 uint8_t nr_mipmap_levels; 1130 1131 /* Swizzling is a single 32-bit word, broken up here for 
convenience. 1132 * Here, swizzling refers to the ES 3.0 texture parameters for channel 1133 * level swizzling, not the internal pixel-level swizzling which is 1134 * below OpenGL's reach */ 1135 1136 unsigned swizzle : 12; 1137 unsigned swizzle_zero : 20; 1138 1139 uint32_t unknown5; 1140 uint32_t unknown6; 1141 uint32_t unknown7; 1142 1143 mali_ptr swizzled_bitmaps[MAX_MIP_LEVELS * MAX_FACES]; 1144} __attribute__((packed)); 1145 1146/* Used as part of filter_mode */ 1147 1148#define MALI_LINEAR 0 1149#define MALI_NEAREST 1 1150#define MALI_MIP_LINEAR (0x18) 1151 1152/* Used to construct low bits of filter_mode */ 1153 1154#define MALI_TEX_MAG(mode) (((mode) & 1) << 0) 1155#define MALI_TEX_MIN(mode) (((mode) & 1) << 1) 1156 1157#define MALI_TEX_MAG_MASK (1) 1158#define MALI_TEX_MIN_MASK (2) 1159 1160#define MALI_FILTER_NAME(filter) (filter ? "MALI_NEAREST" : "MALI_LINEAR") 1161 1162/* Used for lod encoding. Thanks @urjaman for pointing out these routines can 1163 * be cleaned up a lot. */ 1164 1165#define DECODE_FIXED_16(x) ((float) (x / 256.0)) 1166 1167static inline uint16_t 1168FIXED_16(float x) 1169{ 1170 /* Clamp inputs, accounting for float error */ 1171 float max_lod = (32.0 - (1.0 / 512.0)); 1172 1173 x = ((x > max_lod) ? max_lod : ((x < 0.0) ? 0.0 : x)); 1174 1175 return (int) (x * 256.0); 1176} 1177 1178struct mali_sampler_descriptor { 1179 uint32_t filter_mode; 1180 1181 /* Fixed point. Upper 8-bits is before the decimal point, although it 1182 * caps [0-31]. Lower 8-bits is after the decimal point: int(round(x * 1183 * 256)) */ 1184 1185 uint16_t min_lod; 1186 uint16_t max_lod; 1187 1188 /* All one word in reality, but packed a bit */ 1189 1190 enum mali_wrap_mode wrap_s : 4; 1191 enum mali_wrap_mode wrap_t : 4; 1192 enum mali_wrap_mode wrap_r : 4; 1193 enum mali_alt_func compare_func : 3; 1194 1195 /* A single set bit of unknown, ha! 
*/ 1196 unsigned unknown2 : 1; 1197 1198 unsigned zero : 16; 1199 1200 uint32_t zero2; 1201 float border_color[4]; 1202} __attribute__((packed)); 1203 1204/* TODO: What are the floats? Apparently always { -inf, -inf, inf, inf }, 1205 * unless the scissor test is enabled. 1206 * 1207 * viewport0/viewport1 form the arguments to glViewport. viewport1 is modified 1208 * by MALI_POSITIVE; viewport0 is as-is. 1209 */ 1210 1211struct mali_viewport { 1212 /* XY clipping planes */ 1213 float clip_minx; 1214 float clip_miny; 1215 float clip_maxx; 1216 float clip_maxy; 1217 1218 /* Depth clipping planes */ 1219 float clip_minz; 1220 float clip_maxz; 1221 1222 u16 viewport0[2]; 1223 u16 viewport1[2]; 1224} __attribute__((packed)); 1225 1226/* From presentations, 16x16 tiles externally. Use shift for fast computation 1227 * of tile numbers. */ 1228 1229#define MALI_TILE_SHIFT 4 1230#define MALI_TILE_LENGTH (1 << MALI_TILE_SHIFT) 1231 1232/* Tile coordinates are stored as a compact u32, as only 12 bits are needed to 1233 * each component. Notice that this provides a theoretical upper bound of (1 << 1234 * 12) = 4096 tiles in each direction, addressing a maximum framebuffer of size 1235 * 65536x65536. Multiplying that together, times another four given that Mali 1236 * framebuffers are 32-bit ARGB8888, means that this upper bound would take 16 1237 * gigabytes of RAM just to store the uncompressed framebuffer itself, let 1238 * alone rendering in real-time to such a buffer. 1239 * 1240 * Nice job, guys.*/ 1241 1242/* From mali_kbase_10969_workaround.c */ 1243#define MALI_X_COORD_MASK 0x00000FFF 1244#define MALI_Y_COORD_MASK 0x0FFF0000 1245 1246/* Extract parts of a tile coordinate */ 1247 1248#define MALI_TILE_COORD_X(coord) ((coord) & MALI_X_COORD_MASK) 1249#define MALI_TILE_COORD_Y(coord) (((coord) & MALI_Y_COORD_MASK) >> 16) 1250#define MALI_TILE_COORD_FLAGS(coord) ((coord) & ~(MALI_X_COORD_MASK | MALI_Y_COORD_MASK)) 1251 1252/* No known flags yet, but just in case...? 
*/ 1253 1254#define MALI_TILE_NO_FLAG (0) 1255 1256/* Helpers to generate tile coordinates based on the boundary coordinates in 1257 * screen space. So, with the bounds (0, 0) to (128, 128) for the screen, these 1258 * functions would convert it to the bounding tiles (0, 0) to (7, 7). 1259 * Intentional "off-by-one"; finding the tile number is a form of fencepost 1260 * problem. */ 1261 1262#define MALI_MAKE_TILE_COORDS(X, Y) ((X) | ((Y) << 16)) 1263#define MALI_BOUND_TO_TILE(B, bias) ((B - bias) >> MALI_TILE_SHIFT) 1264#define MALI_COORDINATE_TO_TILE(W, H, bias) MALI_MAKE_TILE_COORDS(MALI_BOUND_TO_TILE(W, bias), MALI_BOUND_TO_TILE(H, bias)) 1265#define MALI_COORDINATE_TO_TILE_MIN(W, H) MALI_COORDINATE_TO_TILE(W, H, 0) 1266#define MALI_COORDINATE_TO_TILE_MAX(W, H) MALI_COORDINATE_TO_TILE(W, H, 1) 1267 1268struct mali_payload_fragment { 1269 u32 min_tile_coord; 1270 u32 max_tile_coord; 1271 mali_ptr framebuffer; 1272} __attribute__((packed)); 1273 1274/* (Single?) Framebuffer Descriptor */ 1275 1276/* Flags apply to format. With just MSAA_A and MSAA_B, the framebuffer is 1277 * configured for 4x. With MSAA_8, it is configured for 8x. */ 1278 1279#define MALI_FRAMEBUFFER_MSAA_8 (1 << 3) 1280#define MALI_FRAMEBUFFER_MSAA_A (1 << 4) 1281#define MALI_FRAMEBUFFER_MSAA_B (1 << 23) 1282 1283/* Fast/slow based on whether all three buffers are cleared at once */ 1284 1285#define MALI_CLEAR_FAST (1 << 18) 1286#define MALI_CLEAR_SLOW (1 << 28) 1287#define MALI_CLEAR_SLOW_STENCIL (1 << 31) 1288 1289struct mali_single_framebuffer { 1290 u32 unknown1; 1291 u32 unknown2; 1292 u64 unknown_address_0; 1293 u64 zero1; 1294 u64 zero0; 1295 1296 /* Exact format is ironically not known, since EGL is finnicky with the 1297 * blob. MSAA, colourspace, etc are configured here. 
*/ 1298 1299 u32 format; 1300 1301 u32 clear_flags; 1302 u32 zero2; 1303 1304 /* Purposeful off-by-one in these fields should be accounted for by the 1305 * MALI_DIMENSION macro */ 1306 1307 u16 width; 1308 u16 height; 1309 1310 u32 zero3[8]; 1311 1312 /* By default, the framebuffer is upside down from OpenGL's 1313 * perspective. Set framebuffer to the end and negate the stride to 1314 * flip in the Y direction */ 1315 1316 mali_ptr framebuffer; 1317 int32_t stride; 1318 1319 u32 zero4; 1320 1321 /* Depth and stencil buffers are interleaved, it appears, as they are 1322 * set to the same address in captures. Both fields set to zero if the 1323 * buffer is not being cleared. Depending on GL_ENABLE magic, you might 1324 * get a zero enable despite the buffer being present; that still is 1325 * disabled. */ 1326 1327 mali_ptr depth_buffer; // not SAME_VA 1328 u64 depth_buffer_enable; 1329 1330 mali_ptr stencil_buffer; // not SAME_VA 1331 u64 stencil_buffer_enable; 1332 1333 u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware 1334 u32 clear_color_2; // always equal, but unclear function? 1335 u32 clear_color_3; // always equal, but unclear function? 1336 u32 clear_color_4; // always equal, but unclear function? 1337 1338 /* Set to zero if not cleared */ 1339 1340 float clear_depth_1; // float32, ditto 1341 float clear_depth_2; // float32, ditto 1342 float clear_depth_3; // float32, ditto 1343 float clear_depth_4; // float32, ditto 1344 1345 u32 clear_stencil; // Exactly as it appears in OpenGL 1346 1347 u32 zero6[7]; 1348 1349 /* Very weird format, see generation code in trans_builder.c */ 1350 u32 resolution_check; 1351 1352 u32 tiler_flags; 1353 1354 u64 unknown_address_1; /* Pointing towards... a zero buffer? 
*/ 1355 u64 unknown_address_2; 1356 1357 /* See mali_kbase_replay.c */ 1358 u64 tiler_heap_free; 1359 u64 tiler_heap_end; 1360 1361 /* More below this, maybe */ 1362} __attribute__((packed)); 1363 1364/* Format bits for the render target flags */ 1365 1366#define MALI_MFBD_FORMAT_AFBC (1 << 5) 1367#define MALI_MFBD_FORMAT_MSAA (1 << 7) 1368 1369struct mali_rt_format { 1370 unsigned unk1 : 32; 1371 unsigned unk2 : 3; 1372 1373 unsigned nr_channels : 2; /* MALI_POSITIVE */ 1374 1375 unsigned flags : 11; 1376 1377 unsigned swizzle : 12; 1378 1379 unsigned unk4 : 4; 1380} __attribute__((packed)); 1381 1382struct bifrost_render_target { 1383 struct mali_rt_format format; 1384 1385 u64 zero1; 1386 1387 union { 1388 struct { 1389 /* Stuff related to ARM Framebuffer Compression. When AFBC is enabled, 1390 * there is an extra metadata buffer that contains 16 bytes per tile. 1391 * The framebuffer needs to be the same size as before, since we don't 1392 * know ahead of time how much space it will take up. The 1393 * framebuffer_stride is set to 0, since the data isn't stored linearly 1394 * anymore. 1395 */ 1396 1397 mali_ptr metadata; 1398 u32 stride; // stride in units of tiles 1399 u32 unk; // = 0x20000 1400 } afbc; 1401 1402 struct { 1403 /* Heck if I know */ 1404 u64 unk; 1405 mali_ptr pointer; 1406 } chunknown; 1407 }; 1408 1409 mali_ptr framebuffer; 1410 1411 u32 zero2 : 4; 1412 u32 framebuffer_stride : 28; // in units of bytes 1413 u32 zero3; 1414 1415 u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware 1416 u32 clear_color_2; // always equal, but unclear function? 1417 u32 clear_color_3; // always equal, but unclear function? 1418 u32 clear_color_4; // always equal, but unclear function? 1419} __attribute__((packed)); 1420 1421/* An optional part of bifrost_framebuffer. It comes between the main structure 1422 * and the array of render targets. 
It must be included if any of these are 1423 * enabled: 1424 * 1425 * - Transaction Elimination 1426 * - Depth/stencil 1427 * - TODO: Anything else? 1428 */ 1429 1430/* Flags field: note, these are guesses */ 1431 1432#define MALI_EXTRA_PRESENT (0x400) 1433#define MALI_EXTRA_AFBC (0x20) 1434#define MALI_EXTRA_AFBC_ZS (0x10) 1435#define MALI_EXTRA_ZS (0x4) 1436 1437struct bifrost_fb_extra { 1438 mali_ptr checksum; 1439 /* Each tile has an 8 byte checksum, so the stride is "width in tiles * 8" */ 1440 u32 checksum_stride; 1441 1442 u32 flags; 1443 1444 union { 1445 /* Note: AFBC is only allowed for 24/8 combined depth/stencil. */ 1446 struct { 1447 mali_ptr depth_stencil_afbc_metadata; 1448 u32 depth_stencil_afbc_stride; // in units of tiles 1449 u32 zero1; 1450 1451 mali_ptr depth_stencil; 1452 1453 u64 padding; 1454 } ds_afbc; 1455 1456 struct { 1457 /* Depth becomes depth/stencil in case of combined D/S */ 1458 mali_ptr depth; 1459 u32 depth_stride_zero : 4; 1460 u32 depth_stride : 28; 1461 u32 zero1; 1462 1463 mali_ptr stencil; 1464 u32 stencil_stride_zero : 4; 1465 u32 stencil_stride : 28; 1466 u32 zero2; 1467 } ds_linear; 1468 }; 1469 1470 1471 u64 zero3, zero4; 1472} __attribute__((packed)); 1473 1474/* flags for unk3 */ 1475 1476/* Enables writing depth results back to main memory (rather than keeping them 1477 * on-chip in the tile buffer and then discarding) */ 1478 1479#define MALI_MFBD_DEPTH_WRITE (1 << 10) 1480 1481/* The MFBD contains the extra bifrost_fb_extra section */ 1482 1483#define MALI_MFBD_EXTRA (1 << 13) 1484 1485struct bifrost_framebuffer { 1486 u32 unk0; // = 0x10 1487 1488 u32 unknown2; // = 0x1f, same as SFBD 1489 mali_ptr scratchpad; 1490 1491 /* 0x10 */ 1492 mali_ptr sample_locations; 1493 mali_ptr unknown1; 1494 /* 0x20 */ 1495 u16 width1, height1; 1496 u32 zero3; 1497 u16 width2, height2; 1498 u32 unk1 : 19; // = 0x01000 1499 u32 rt_count_1 : 2; // off-by-one (use MALI_POSITIVE) 1500 u32 unk2 : 3; // = 0 1501 u32 rt_count_2 : 3; // no 
off-by-one 1502 u32 zero4 : 5; 1503 /* 0x30 */ 1504 u32 clear_stencil : 8; 1505 u32 unk3 : 24; // = 0x100 1506 float clear_depth; 1507 mali_ptr tiler_meta; 1508 /* 0x40 */ 1509 1510 /* Note: these are guesses! */ 1511 mali_ptr tiler_scratch_start; 1512 mali_ptr tiler_scratch_middle; 1513 1514 /* These are not, since we see symmetry with replay jobs which name these explicitly */ 1515 mali_ptr tiler_heap_start; 1516 mali_ptr tiler_heap_end; 1517 1518 u64 zero9, zero10, zero11, zero12; 1519 1520 /* optional: struct bifrost_fb_extra extra */ 1521 /* struct bifrost_render_target rts[] */ 1522} __attribute__((packed)); 1523 1524#endif /* __PANFROST_JOB_H__ */ 1525