1b8e80941Smrg/*
2b8e80941Smrg * © Copyright 2017-2018 Alyssa Rosenzweig
3b8e80941Smrg * © Copyright 2017-2018 Connor Abbott
4b8e80941Smrg * © Copyright 2017-2018 Lyude Paul
5b8e80941Smrg *
6b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a
7b8e80941Smrg * copy of this software and associated documentation files (the "Software"),
8b8e80941Smrg * to deal in the Software without restriction, including without limitation
9b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the
11b8e80941Smrg * Software is furnished to do so, subject to the following conditions:
12b8e80941Smrg *
13b8e80941Smrg * The above copyright notice and this permission notice (including the next
14b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the
15b8e80941Smrg * Software.
16b8e80941Smrg *
17b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22b8e80941Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23b8e80941Smrg * SOFTWARE.
24b8e80941Smrg *
25b8e80941Smrg */
26b8e80941Smrg
27b8e80941Smrg#ifndef __PANFROST_JOB_H__
28b8e80941Smrg#define __PANFROST_JOB_H__
29b8e80941Smrg
30b8e80941Smrg#include <stdint.h>
31b8e80941Smrg#include <panfrost-misc.h>
32b8e80941Smrg
/* Number of bits in a host pointer-sized integer (uintptr_t), assuming
 * 8-bit bytes. */
#define MALI_SHORT_PTR_BITS (8 * sizeof(uintptr_t))

/* NOTE(review): presumably the number of hierarchy weights carried by a
 * framebuffer descriptor -- confirm against the users of this constant. */
#define MALI_FBD_HIERARCHY_WEIGHTS 8

/* NOTE(review): presumably the job payload size in bytes -- confirm. */
#define MALI_PAYLOAD_SIZE 256
38b8e80941Smrg
/* 32-bit alias. NOTE(review): the name suggests job-descriptor core
 * requirement flags, and u32 is presumably supplied by panfrost-misc.h --
 * confirm both. */
typedef u32 mali_jd_core_req;
40b8e80941Smrg
/* Job type codes. These are stored in the 7-bit job_type field of
 * struct mali_job_descriptor_header, so every value must stay below 128. */
enum mali_job_type {
        JOB_NOT_STARTED      = 0,
        JOB_TYPE_NULL        = 1,
        JOB_TYPE_SET_VALUE   = 2,
        JOB_TYPE_CACHE_FLUSH = 3,
        JOB_TYPE_COMPUTE     = 4,
        JOB_TYPE_VERTEX      = 5,
        JOB_TYPE_GEOMETRY    = 6,
        JOB_TYPE_TILER       = 7,
        JOB_TYPE_FUSED       = 8,
        JOB_TYPE_FRAGMENT    = 9,
};
53b8e80941Smrg
/* Primitive topology codes. Any value not listed here is invalid. */
enum mali_draw_mode {
        MALI_DRAW_NONE      = 0x0,

        /* Points and lines */
        MALI_POINTS         = 0x1,
        MALI_LINES          = 0x2,
        MALI_LINE_STRIP     = 0x4,
        MALI_LINE_LOOP      = 0x6,

        /* Triangles */
        MALI_TRIANGLES      = 0x8,
        MALI_TRIANGLE_STRIP = 0xa,
        MALI_TRIANGLE_FAN   = 0xc,

        /* Polygons and quads */
        MALI_POLYGON        = 0xd,
        MALI_QUADS          = 0xe,
        MALI_QUAD_STRIP     = 0xf,
};
69b8e80941Smrg
/* Flag bits that apply to tiler_gl_enables */

#define MALI_OCCLUSION_QUERY    (1 << 3)
#define MALI_OCCLUSION_PRECISE  (1 << 4)

/* Front-face winding, placed at bit 5. Pass MALI_CCW or MALI_CW. The
 * argument is parenthesized so that a compound expression is shifted as a
 * whole rather than being split by operator precedence. */
#define MALI_FRONT_FACE(v)      ((v) << 5)
#define MALI_CCW (0)
#define MALI_CW  (1)

#define MALI_CULL_FACE_FRONT    (1 << 6)
#define MALI_CULL_FACE_BACK     (1 << 7)
82b8e80941Smrg
/* TODO: Might this actually be a finer bitfield? */
#define MALI_DEPTH_STENCIL_ENABLE 0x6400

/* Yields a string literal naming a depth/stencil enable field:
 * "MALI_DEPTH_STENCIL_ENABLE" when it equals that constant, "0" when it is
 * zero, and a "0" annotated with an XXX marker for anything else.
 *
 * Both the argument and the whole expansion are parenthesized so the macro
 * behaves as a single expression with any argument. Note that field may be
 * evaluated twice, so it must not have side effects. */
#define DS_ENABLE(field) \
	(((field) == MALI_DEPTH_STENCIL_ENABLE) \
	? "MALI_DEPTH_STENCIL_ENABLE" \
	: ((field) == 0) ? "0" \
	: "0 /* XXX: Unknown, check hexdump */")
91b8e80941Smrg
/* Comparison function used by the stencil and depth tests. Values fit in
 * three bits. */
enum mali_func {
        MALI_FUNC_NEVER    = 0x0,
        MALI_FUNC_LESS     = 0x1,
        MALI_FUNC_EQUAL    = 0x2,
        MALI_FUNC_LEQUAL   = 0x3,
        MALI_FUNC_GREATER  = 0x4,
        MALI_FUNC_NOTEQUAL = 0x5,
        MALI_FUNC_GEQUAL   = 0x6,
        MALI_FUNC_ALWAYS   = 0x7,
};
104b8e80941Smrg
/* The same set of comparison functions as mali_func (and OpenGL), but with
 * a different encoding: LESS and GREATER trade places, as do LEQUAL and
 * GEQUAL. The reason for the second encoding is not known. */
enum mali_alt_func {
        MALI_ALT_FUNC_NEVER    = 0x0,
        MALI_ALT_FUNC_GREATER  = 0x1,
        MALI_ALT_FUNC_EQUAL    = 0x2,
        MALI_ALT_FUNC_GEQUAL   = 0x3,
        MALI_ALT_FUNC_LESS     = 0x4,
        MALI_ALT_FUNC_NOTEQUAL = 0x5,
        MALI_ALT_FUNC_LEQUAL   = 0x6,
        MALI_ALT_FUNC_ALWAYS   = 0x7,
};
117b8e80941Smrg
/* Flags apply to unknown2_3? */

#define MALI_HAS_MSAA		(1 << 0)
#define MALI_CAN_DISCARD 	(1 << 5)

/* Applies on SFBD systems, specifying that programmable blending is in use */
#define MALI_HAS_BLEND_SHADER 	(1 << 6)

/* Depth comparison function, a 3-bit enum mali_func stored in bits 8..10.
 *
 * MALI_DEPTH_FUNC(func)      packs a mali_func into position.
 * MALI_GET_DEPTH_FUNC(flags) extracts it from a flags word.
 * MALI_DEPTH_FUNC_MASK       covers the three bits of the field.
 *
 * Arguments are parenthesized so that expressions such as (a | b) are
 * shifted as a whole instead of being split by operator precedence. */
#define MALI_DEPTH_FUNC(func)	   ((func) << 8)
#define MALI_GET_DEPTH_FUNC(flags) (((flags) >> 8) & 0x7)
#define MALI_DEPTH_FUNC_MASK	   MALI_DEPTH_FUNC(0x7)

#define MALI_DEPTH_TEST		(1 << 11)
132b8e80941Smrg
/* The following flags apply to unknown2_4 */
#define MALI_STENCIL_TEST       (1 << 0)

/* The purpose of this bit is not understood; the name records when it has
 * been observed. */
#define MALI_SAMPLE_ALPHA_TO_COVERAGE_NO_BLEND_SHADER (1 << 1)

#define MALI_NO_DITHER          (1 << 9)
#define MALI_DEPTH_RANGE_A      (1 << 12)
#define MALI_DEPTH_RANGE_B      (1 << 13)
#define MALI_NO_MSAA            (1 << 14)
143b8e80941Smrg
/* The whole stencil test state is packed into one u32 built from several
 * small enums. This one is the action taken on a stencil or depth outcome;
 * values fit in three bits. */

enum mali_stencil_op {
        MALI_STENCIL_KEEP      = 0x0,
        MALI_STENCIL_REPLACE   = 0x1,
        MALI_STENCIL_ZERO      = 0x2,
        MALI_STENCIL_INVERT    = 0x3,
        MALI_STENCIL_INCR_WRAP = 0x4,
        MALI_STENCIL_DECR_WRAP = 0x5,
        MALI_STENCIL_INCR      = 0x6,
        MALI_STENCIL_DECR      = 0x7,
};
157b8e80941Smrg
158b8e80941Smrgstruct mali_stencil_test {
159b8e80941Smrg        unsigned ref  			: 8;
160b8e80941Smrg        unsigned mask 			: 8;
161b8e80941Smrg        enum mali_func func 		: 3;
162b8e80941Smrg        enum mali_stencil_op sfail 	: 3;
163b8e80941Smrg        enum mali_stencil_op dpfail 	: 3;
164b8e80941Smrg        enum mali_stencil_op dppass 	: 3;
165b8e80941Smrg        unsigned zero			: 4;
166b8e80941Smrg} __attribute__((packed));
167b8e80941Smrg
/* Blending is a mess, since anything fancy triggers a blend shader, and
 * those are not understood whatsoever yet. */

/* Per-channel colour write mask bits, one per component. */
#define MALI_MASK_R     (1 << 0)
#define MALI_MASK_G     (1 << 1)
#define MALI_MASK_B     (1 << 2)
#define MALI_MASK_A     (1 << 3)
175b8e80941Smrg
/* One-bit selector stored in mali_blend_mode.nondominant_mode. */
enum mali_nondominant_mode {
        MALI_BLEND_NON_MIRROR = 0x0,
        MALI_BLEND_NON_ZERO   = 0x1,
};
180b8e80941Smrg
/* One-bit selector stored in mali_blend_mode.dominant: source or
 * destination. */
enum mali_dominant_blend {
        MALI_BLEND_DOM_SOURCE      = 0x0,
        MALI_BLEND_DOM_DESTINATION = 0x1,
};
185b8e80941Smrg
/* Three-bit factor stored in mali_blend_mode.dominant_factor. The UNK
 * entries are encodings whose meaning has not been identified. */
enum mali_dominant_factor {
        MALI_DOMINANT_UNK0      = 0x0,
        MALI_DOMINANT_ZERO      = 0x1,
        MALI_DOMINANT_SRC_COLOR = 0x2,
        MALI_DOMINANT_DST_COLOR = 0x3,
        MALI_DOMINANT_UNK4      = 0x4,
        MALI_DOMINANT_SRC_ALPHA = 0x5,
        MALI_DOMINANT_DST_ALPHA = 0x6,
        MALI_DOMINANT_CONSTANT  = 0x7,
};
196b8e80941Smrg
/* Two-bit modifier stored in mali_blend_mode.clip_modifier. UNK0 is an
 * encoding whose meaning has not been identified. */
enum mali_blend_modifier {
        MALI_BLEND_MOD_UNK0       = 0x0,
        MALI_BLEND_MOD_NORMAL     = 0x1,
        MALI_BLEND_MOD_SOURCE_ONE = 0x2,
        MALI_BLEND_MOD_DEST_ONE   = 0x3,
};
203b8e80941Smrg
204b8e80941Smrgstruct mali_blend_mode {
205b8e80941Smrg        enum mali_blend_modifier clip_modifier : 2;
206b8e80941Smrg        unsigned unused_0 : 1;
207b8e80941Smrg        unsigned negate_source : 1;
208b8e80941Smrg
209b8e80941Smrg        enum mali_dominant_blend dominant : 1;
210b8e80941Smrg
211b8e80941Smrg        enum mali_nondominant_mode nondominant_mode : 1;
212b8e80941Smrg
213b8e80941Smrg        unsigned unused_1 : 1;
214b8e80941Smrg
215b8e80941Smrg        unsigned negate_dest : 1;
216b8e80941Smrg
217b8e80941Smrg        enum mali_dominant_factor dominant_factor : 3;
218b8e80941Smrg        unsigned complement_dominant : 1;
219b8e80941Smrg} __attribute__((packed));
220b8e80941Smrg
/* Fixed-function blend equation: 32 bits of bitfields, followed by a float
 * constant unless BIFROST is defined. */
struct mali_blend_equation {
        /* Each mode holds a struct mali_blend_mode (12 bits) */
        unsigned int rgb_mode   : 12;
        unsigned int alpha_mode : 12;

        unsigned int zero1      : 4;

        /* Built from the MALI_MASK_* bits; mirrors the glColorMask
         * arguments */
        unsigned int color_mask : 4;

        /* Constant attached for CONSTANT_ALPHA and similar factors */
#ifndef BIFROST
        float constant;
#endif
} __attribute__((packed));
238b8e80941Smrg
/* Source selector for one output channel of a swizzle: one of the four
 * colour components, or a constant zero or one. Values fit in three bits;
 * the last two encodings are reserved. */
enum mali_channel {
        MALI_CHANNEL_RED        = 0x0,
        MALI_CHANNEL_GREEN      = 0x1,
        MALI_CHANNEL_BLUE       = 0x2,
        MALI_CHANNEL_ALPHA      = 0x3,
        MALI_CHANNEL_ZERO       = 0x4,
        MALI_CHANNEL_ONE        = 0x5,
        MALI_CHANNEL_RESERVED_0 = 0x6,
        MALI_CHANNEL_RESERVED_1 = 0x7,
};
250b8e80941Smrg
251b8e80941Smrgstruct mali_channel_swizzle {
252b8e80941Smrg	enum mali_channel r : 3;
253b8e80941Smrg	enum mali_channel g : 3;
254b8e80941Smrg	enum mali_channel b : 3;
255b8e80941Smrg	enum mali_channel a : 3;
256b8e80941Smrg} __attribute__((packed));
257b8e80941Smrg
/* Compressed per-pixel formats. Each of these formats expands to one to four
 * floating-point or integer numbers, as defined by the OpenGL specification.
 * There are various places in OpenGL where the user can specify a compressed
 * format in memory, which all use the same 8-bit enum in the various
 * descriptors, although different hardware units support different formats.
 *
 * Layout of the 8-bit code for the regular (non-special) classes:
 *   bits 7..5  format class (MALI_FORMAT_*)
 *   bits 4..3  number of channels minus one (MALI_NR_CHANNELS)
 *   bits 2..0  channel size (MALI_CHANNEL_*)
 */

/* The top 3 bits specify how the bits of each component are interpreted. */

/* e.g. R11F_G11F_B10F */
#define MALI_FORMAT_SPECIAL (2 << 5)

/* signed normalized, e.g. RGBA8_SNORM */
#define MALI_FORMAT_SNORM (3 << 5)

/* e.g. RGBA8UI */
#define MALI_FORMAT_UINT (4 << 5)

/* e.g. RGBA8 and RGBA32F */
#define MALI_FORMAT_UNORM (5 << 5)

/* e.g. RGBA8I and RGBA16F */
#define MALI_FORMAT_SINT (6 << 5)

/* These formats seem to largely duplicate the others. They're used at least
 * for Bifrost framebuffer output.
 */
#define MALI_FORMAT_SPECIAL2 (7 << 5)

/* If the high 3 bits are 3 to 6 these two bits say how many components
 * there are. n is the channel count, 1 to 4. The argument is parenthesized
 * so that a compound expression is decremented and shifted as a whole.
 */
#define MALI_NR_CHANNELS(n) (((n) - 1) << 3)

/* If the high 3 bits are 3 to 6, then the low 3 bits say how big each
 * component is, except the special MALI_CHANNEL_FLOAT which overrides what the
 * bits mean.
 */

#define MALI_CHANNEL_4 2

#define MALI_CHANNEL_8 3

#define MALI_CHANNEL_16 4

#define MALI_CHANNEL_32 5

/* For MALI_FORMAT_SINT it means a half-float (e.g. RG16F). For
 * MALI_FORMAT_UNORM, it means a 32-bit float.
 */
#define MALI_CHANNEL_FLOAT 7
309b8e80941Smrg
310b8e80941Smrgenum mali_format {
311b8e80941Smrg	MALI_RGB565         = MALI_FORMAT_SPECIAL | 0x0,
312b8e80941Smrg	MALI_RGB5_A1_UNORM  = MALI_FORMAT_SPECIAL | 0x2,
313b8e80941Smrg	MALI_RGB10_A2_UNORM = MALI_FORMAT_SPECIAL | 0x3,
314b8e80941Smrg	MALI_RGB10_A2_SNORM = MALI_FORMAT_SPECIAL | 0x5,
315b8e80941Smrg	MALI_RGB10_A2UI     = MALI_FORMAT_SPECIAL | 0x7,
316b8e80941Smrg	MALI_RGB10_A2I      = MALI_FORMAT_SPECIAL | 0x9,
317b8e80941Smrg
318b8e80941Smrg	/* YUV formats */
319b8e80941Smrg	MALI_NV12           = MALI_FORMAT_SPECIAL | 0xc,
320b8e80941Smrg
321b8e80941Smrg	MALI_Z32_UNORM      = MALI_FORMAT_SPECIAL | 0xD,
322b8e80941Smrg	MALI_R32_FIXED      = MALI_FORMAT_SPECIAL | 0x11,
323b8e80941Smrg	MALI_RG32_FIXED     = MALI_FORMAT_SPECIAL | 0x12,
324b8e80941Smrg	MALI_RGB32_FIXED    = MALI_FORMAT_SPECIAL | 0x13,
325b8e80941Smrg	MALI_RGBA32_FIXED   = MALI_FORMAT_SPECIAL | 0x14,
326b8e80941Smrg	MALI_R11F_G11F_B10F = MALI_FORMAT_SPECIAL | 0x19,
327b8e80941Smrg	/* Only used for varyings, to indicate the transformed gl_Position */
328b8e80941Smrg	MALI_VARYING_POS    = MALI_FORMAT_SPECIAL | 0x1e,
329b8e80941Smrg	/* Only used for varyings, to indicate that the write should be
330b8e80941Smrg	 * discarded.
331b8e80941Smrg	 */
332b8e80941Smrg	MALI_VARYING_DISCARD = MALI_FORMAT_SPECIAL | 0x1f,
333b8e80941Smrg
334b8e80941Smrg	MALI_R8_SNORM     = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
335b8e80941Smrg	MALI_R16_SNORM    = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
336b8e80941Smrg	MALI_R32_SNORM    = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
337b8e80941Smrg	MALI_RG8_SNORM    = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
338b8e80941Smrg	MALI_RG16_SNORM   = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
339b8e80941Smrg	MALI_RG32_SNORM   = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
340b8e80941Smrg	MALI_RGB8_SNORM   = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
341b8e80941Smrg	MALI_RGB16_SNORM  = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
342b8e80941Smrg	MALI_RGB32_SNORM  = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
343b8e80941Smrg	MALI_RGBA8_SNORM  = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
344b8e80941Smrg	MALI_RGBA16_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
345b8e80941Smrg	MALI_RGBA32_SNORM = MALI_FORMAT_SNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
346b8e80941Smrg
347b8e80941Smrg	MALI_R8UI     = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
348b8e80941Smrg	MALI_R16UI    = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
349b8e80941Smrg	MALI_R32UI    = MALI_FORMAT_UINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
350b8e80941Smrg	MALI_RG8UI    = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
351b8e80941Smrg	MALI_RG16UI   = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
352b8e80941Smrg	MALI_RG32UI   = MALI_FORMAT_UINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
353b8e80941Smrg	MALI_RGB8UI   = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
354b8e80941Smrg	MALI_RGB16UI  = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
355b8e80941Smrg	MALI_RGB32UI  = MALI_FORMAT_UINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
356b8e80941Smrg	MALI_RGBA8UI  = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
357b8e80941Smrg	MALI_RGBA16UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
358b8e80941Smrg	MALI_RGBA32UI = MALI_FORMAT_UINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
359b8e80941Smrg
360b8e80941Smrg	MALI_R8_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
361b8e80941Smrg	MALI_R16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
362b8e80941Smrg	MALI_R32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
363b8e80941Smrg	MALI_R32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT,
364b8e80941Smrg	MALI_RG8_UNORM    = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
365b8e80941Smrg	MALI_RG16_UNORM   = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
366b8e80941Smrg	MALI_RG32_UNORM   = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
367b8e80941Smrg	MALI_RG32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT,
368b8e80941Smrg	MALI_RGB8_UNORM   = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
369b8e80941Smrg	MALI_RGB16_UNORM  = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
370b8e80941Smrg	MALI_RGB32_UNORM  = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
371b8e80941Smrg	MALI_RGB32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT,
372b8e80941Smrg	MALI_RGBA4_UNORM  = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_4,
373b8e80941Smrg	MALI_RGBA8_UNORM  = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
374b8e80941Smrg	MALI_RGBA16_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
375b8e80941Smrg	MALI_RGBA32_UNORM = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
376b8e80941Smrg	MALI_RGBA32F = MALI_FORMAT_UNORM | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT,
377b8e80941Smrg
378b8e80941Smrg	MALI_R8I     = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_8,
379b8e80941Smrg	MALI_R16I    = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_16,
380b8e80941Smrg	MALI_R32I    = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_32,
381b8e80941Smrg	MALI_R16F    = MALI_FORMAT_SINT | MALI_NR_CHANNELS(1) | MALI_CHANNEL_FLOAT,
382b8e80941Smrg	MALI_RG8I    = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_8,
383b8e80941Smrg	MALI_RG16I   = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_16,
384b8e80941Smrg	MALI_RG32I   = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_32,
385b8e80941Smrg	MALI_RG16F   = MALI_FORMAT_SINT | MALI_NR_CHANNELS(2) | MALI_CHANNEL_FLOAT,
386b8e80941Smrg	MALI_RGB8I   = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_8,
387b8e80941Smrg	MALI_RGB16I  = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_16,
388b8e80941Smrg	MALI_RGB32I  = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_32,
389b8e80941Smrg	MALI_RGB16F  = MALI_FORMAT_SINT | MALI_NR_CHANNELS(3) | MALI_CHANNEL_FLOAT,
390b8e80941Smrg	MALI_RGBA8I  = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_8,
391b8e80941Smrg	MALI_RGBA16I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_16,
392b8e80941Smrg	MALI_RGBA32I = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_32,
393b8e80941Smrg	MALI_RGBA16F = MALI_FORMAT_SINT | MALI_NR_CHANNELS(4) | MALI_CHANNEL_FLOAT,
394b8e80941Smrg
395b8e80941Smrg	MALI_RGBA4      = MALI_FORMAT_SPECIAL2 | 0x8,
396b8e80941Smrg	MALI_RGBA8_2    = MALI_FORMAT_SPECIAL2 | 0xd,
397b8e80941Smrg	MALI_RGB10_A2_2 = MALI_FORMAT_SPECIAL2 | 0xe,
398b8e80941Smrg};
399b8e80941Smrg
400b8e80941Smrg
/* Alpha coverage is encoded as 4-bits (from a clampf), with inversion
 * literally performing a bitwise invert.
 *
 * MALI_ALPHA_COVERAGE(clampf)     scales a float by 15 and converts it to an
 *                                 integer nibble value (as uint16_t).
 * MALI_GET_ALPHA_COVERAGE(nibble) converts a nibble value back to a float.
 *
 * The encode direction is known to give slightly wrong results; note that
 * the (int) cast truncates toward zero instead of rounding to nearest, which
 * may be the cause. That behaviour is deliberately left unchanged here.
 *
 * Arguments are parenthesized so that compound expressions are scaled or
 * divided as a whole. */

#define MALI_ALPHA_COVERAGE(clampf) ((uint16_t) (int) ((clampf) * 15.0f))
#define MALI_GET_ALPHA_COVERAGE(nibble) ((float) (nibble) / 15.0f)
407b8e80941Smrg
/* Flag for unknown1 */
#define MALI_NO_ALPHA_TO_COVERAGE       (1 << 10)

/* Tilebuffer readback flags for the fragment shader. Set
 * MALI_READS_TILEBUFFER if the shader might read any part of the tilebuffer;
 * additionally set MALI_READS_ZS if it might read depth/stencil. */

#define MALI_READS_ZS                   (1 << 12)
#define MALI_READS_TILEBUFFER           (1 << 16)
417b8e80941Smrg
/* Blend descriptor; struct mali_shader_meta ends with a flexible array of
 * these. The layout is selected at compile time: one variant when BIFROST is
 * not defined, another when it is. Field order and sizes define the layout.
 */
struct mali_blend_meta {
#ifndef BIFROST
        /* Base value of 0x200.
         * OR with 0x1 for blending (anything other than REPLACE).
         * OR with 0x2 for programmable blending
         */

        u64 unk1;

        /* Either a fixed-function equation or a pointer to a blend shader.
         * NOTE(review): presumably selected by the 0x2 bit of unk1 described
         * above -- confirm. */
        union {
                struct mali_blend_equation blend_equation_1;
                mali_ptr blend_shader;
        };

        u64 zero2;
        struct mali_blend_equation blend_equation_2;
#else
        u32 unk1; // = 0x200
        struct mali_blend_equation blend_equation;
        /*
         * - 0x19 normally
         * - 0x3 when this slot is unused (everything else is 0 except the index)
         * - 0x11 when this is the fourth slot (and it's used)
         * - 0 when there is a blend shader
         */
        u16 unk2;
        /* increments from 0 to 3 */
        u16 index;

	union {
		/* Fixed-function case: the bitfields below add up to 32 bits
		 * (12 + 8 + 3 + 9). */
		struct {
			/* So far, I've only seen:
			 * - R001 for 1-component formats
			 * - RG01 for 2-component formats
			 * - RGB1 for 3-component formats
			 * - RGBA for 4-component formats
			 */
			u32 swizzle : 12;
			enum mali_format format : 8;

			/* Type of the shader output variable. Note, this can
			 * be different from the format.
			 *
			 * 0: f16 (mediump float)
			 * 1: f32 (highp float)
			 * 2: i32 (highp int)
			 * 3: u32 (highp uint)
			 * 4: i16 (mediump int)
			 * 5: u16 (mediump uint)
			 */
			u32 shader_type : 3;
			u32 zero : 9;
		};

		/* Only the low 32 bits of the blend shader are stored, the
		 * high 32 bits are implicitly the same as the original shader.
		 * According to the kernel driver, the program counter for
		 * shaders is actually only 24 bits, so shaders cannot cross
		 * the 2^24-byte boundary, and neither can the blend shader.
		 * The blob handles this by allocating a 2^24 byte pool for
		 * shaders, and making sure that any blend shaders are stored
		 * in the same pool as the original shader. The kernel will
		 * make sure this allocation is aligned to 2^24 bytes.
		 */
		u32 blend_shader;
	};
#endif
} __attribute__((packed));
486b8e80941Smrg
/* Shader descriptor: a pointer to the shader followed by resource counts and
 * fixed-function state, and ending in a flexible array of blend descriptors.
 * Several fields are unions with one member per architecture (bifrost* /
 * midgard*). Field order and sizes define the layout.
 */
struct mali_shader_meta {
        mali_ptr shader;
        u16 texture_count;
        u16 sampler_count;
        u16 attribute_count;
        u16 varying_count;

        /* Both members are 32 bits wide: 4 + 28 bits for bifrost1, and a u16
         * plus 5 + 5 + 6 bits for midgard1. */
        union {
                struct {
                        u32 uniform_buffer_count : 4;
                        u32 unk1 : 28; // = 0x800000 for vertex, 0x958020 for tiler
                } bifrost1;
                struct {
                        /* 0x200 except MALI_NO_ALPHA_TO_COVERAGE. Mysterious 1
                         * other times. Who knows really? */
                        u16 unknown1;

                        /* Whole number of uniform registers used, times two;
                         * whole number of work registers used (no scale).
                         */
                        unsigned work_count : 5;
                        unsigned uniform_count : 5;
                        unsigned unknown2 : 6;
                } midgard1;
        };

        /* On bifrost: Exactly the same as glPolygonOffset() for both.
         * On midgard: Depth factor is exactly as passed to glPolygonOffset.
         * Depth units is equal to the value passed to glPolygonOffset + 1.0f
         * (use MALI_NEGATIVE)
         */
        float depth_units;
        float depth_factor;

        u32 unknown2_2;

        /* alpha_coverage: see MALI_ALPHA_COVERAGE.
         * NOTE(review): the flag comments earlier in this header tentatively
         * assign MALI_HAS_MSAA .. MALI_DEPTH_TEST to unknown2_3 -- confirm. */
        u16 alpha_coverage;
        u16 unknown2_3;

        /* unknown2_4 takes the MALI_STENCIL_TEST .. MALI_NO_MSAA flags */
        u8 stencil_mask_front;
        u8 stencil_mask_back;
        u16 unknown2_4;

        struct mali_stencil_test stencil_front;
        struct mali_stencil_test stencil_back;

        /* Both members are 32 bits wide; the bifrost2 bitfields add up to
         * 7 + 8 + 7 + 10. */
        union {
                struct {
                        u32 unk3 : 7;
                        /* On Bifrost, some system values are preloaded in
                         * registers R55-R62 by the thread dispatcher prior to
                         * the start of shader execution. This is a bitfield
                         * with one entry for each register saying which
                         * registers need to be preloaded. Right now, the known
                         * values are:
                         *
                         * Vertex/compute:
                         * - R55 : gl_LocalInvocationID.xy
                         * - R56 : gl_LocalInvocationID.z + unknown in high 16 bits
                         * - R57 : gl_WorkGroupID.x
                         * - R58 : gl_WorkGroupID.y
                         * - R59 : gl_WorkGroupID.z
                         * - R60 : gl_GlobalInvocationID.x
                         * - R61 : gl_GlobalInvocationID.y/gl_VertexID (without base)
                         * - R62 : gl_GlobalInvocationID.z/gl_InstanceID (without base)
                         *
                         * Fragment:
                         * - R55 : unknown, never seen (but the bit for this is
                         *   always set?)
                         * - R56 : unknown (bit always unset)
                         * - R57 : gl_PrimitiveID
                         * - R58 : gl_FrontFacing in low bit, potentially other stuff
                         * - R59 : u16 fragment coordinates (used to compute
                         *   gl_FragCoord.xy, together with sample positions)
                         * - R60 : gl_SampleMask (used in epilog, so pretty
                         *   much always used, but the bit is always 0 -- is
                         *   this just always pushed?)
                         * - R61 : gl_SampleMaskIn and gl_SampleID, used by
                         *   varying interpolation.
                         * - R62 : unknown (bit always unset).
                         */
                        u32 preload_regs : 8;
                        /* In units of 8 bytes or 64 bits, since the
                         * uniform/const port loads 64 bits at a time.
                         */
                        u32 uniform_count : 7;
                        u32 unk4 : 10; // = 2
                } bifrost2;
                struct {
                        u32 unknown2_7;
                } midgard2;
        };

        /* zero on bifrost */
        u32 unknown2_8;

        /* Blending information for the older non-MRT Midgard HW. Check for
         * MALI_HAS_BLEND_SHADER to decide how to interpret.
         */

        union {
                mali_ptr blend_shader;
                struct mali_blend_equation blend_equation;
        };

        /* There can be up to 4 blend_meta's. None of them are required for
         * vertex shaders or the non-MRT case for Midgard (so the blob doesn't
         * allocate any space).
         */
        struct mali_blend_meta blend_meta[];

} __attribute__((packed));
599b8e80941Smrg
/* The definitions from here on concern hardware jobs only. */

/* Values for the one-bit job_descriptor_size field of
 * struct mali_job_descriptor_header. */

#define MALI_JOB_32     0
#define MALI_JOB_64     1
606b8e80941Smrg
/* Header at the start of a hardware job descriptor. Field order and sizes
 * define the layout and must not be changed.
 */
struct mali_job_descriptor_header {
        u32 exception_status;
        u32 first_incomplete_task;
        u64 fault_pointer;

        /* One byte: descriptor size (MALI_JOB_32 or MALI_JOB_64) in 1 bit,
         * then the job type in 7 bits */
        u8 job_descriptor_size : 1;
        enum mali_job_type job_type : 7;

        /* One byte: barrier bit plus 7 bits of unidentified flags */
        u8 job_barrier : 1;
        u8 unknown_flags : 7;

        u16 job_index;
        u16 job_dependency_index_1;
        u16 job_dependency_index_2;

        /* Link to the next job, in 64-bit or 32-bit form.
         * NOTE(review): presumably job_descriptor_size selects which member
         * is meaningful -- confirm. */
        union {
                u64 next_job_64;
                u32 next_job_32;
        };
} __attribute__((packed));
624b8e80941Smrg
/* Two packed 64-bit words. NOTE(review): presumably the payload of a
 * JOB_TYPE_SET_VALUE job, with out being the address written to -- confirm.
 * The meaning of the second word is not known. */
struct mali_payload_set_value {
        u64 out;
        u64 unknown;
} __attribute__((packed));
629b8e80941Smrg
/* Special attributes live at fixed indices, counted up from a common base:
 * the vertex ID at base + 0 and the instance ID at base + 1. */
#define MALI_SPECIAL_ATTRIBUTE_BASE     16
#define MALI_VERTEX_ID                  (MALI_SPECIAL_ATTRIBUTE_BASE + 0)
#define MALI_INSTANCE_ID                (MALI_SPECIAL_ATTRIBUTE_BASE + 1)
634b8e80941Smrg
635b8e80941Smrg/*
636b8e80941Smrg * Mali Attributes
637b8e80941Smrg *
638b8e80941Smrg * This structure lets the attribute unit compute the address of an attribute
639b8e80941Smrg * given the vertex and instance ID. Unfortunately, the way this works is
640b8e80941Smrg * rather complicated when instancing is enabled.
641b8e80941Smrg *
642b8e80941Smrg * To explain this, first we need to explain how compute and vertex threads are
643b8e80941Smrg * dispatched. This is a guess (although a pretty firm guess!) since the
644b8e80941Smrg * details are mostly hidden from the driver, except for attribute instancing.
645b8e80941Smrg * When a quad is dispatched, it receives a single, linear index. However, we
646b8e80941Smrg * need to translate that index into a (vertex id, instance id) pair, or a
647b8e80941Smrg * (local id x, local id y, local id z) triple for compute shaders (although
648b8e80941Smrg * vertex shaders and compute shaders are handled almost identically).
649b8e80941Smrg * Focusing on vertex shaders, one option would be to do:
650b8e80941Smrg *
651b8e80941Smrg * vertex_id = linear_id % num_vertices
652b8e80941Smrg * instance_id = linear_id / num_vertices
653b8e80941Smrg *
654b8e80941Smrg * but this involves a costly division and modulus by an arbitrary number.
655b8e80941Smrg * Instead, we could pad num_vertices. We dispatch padded_num_vertices *
656b8e80941Smrg * num_instances threads instead of num_vertices * num_instances, which results
657b8e80941Smrg * in some "extra" threads with vertex_id >= num_vertices, which we have to
658b8e80941Smrg * discard.  The more we pad num_vertices, the more "wasted" threads we
659b8e80941Smrg * dispatch, but the division is potentially easier.
660b8e80941Smrg *
661b8e80941Smrg * One straightforward choice is to pad num_vertices to the next power of two,
662b8e80941Smrg * which means that the division and modulus are just simple bit shifts and
663b8e80941Smrg * masking. But the actual algorithm is a bit more complicated. The thread
664b8e80941Smrg * dispatcher has special support for dividing by 3, 5, 7, and 9, in addition
665b8e80941Smrg * to dividing by a power of two. This is possibly using the technique
666b8e80941Smrg * described in patent US20170010862A1. As a result, padded_num_vertices can be
 * 1, 3, 5, 7, or 9 times a power of two. This results in fewer wasted threads,
 * since we need less padding.
669b8e80941Smrg *
670b8e80941Smrg * padded_num_vertices is picked by the hardware. The driver just specifies the
671b8e80941Smrg * actual number of vertices. At least for Mali G71, the first few cases are
672b8e80941Smrg * given by:
673b8e80941Smrg *
674b8e80941Smrg * num_vertices	| padded_num_vertices
675b8e80941Smrg * 3		| 4
676b8e80941Smrg * 4-7		| 8
677b8e80941Smrg * 8-11		| 12 (3 * 4)
678b8e80941Smrg * 12-15	| 16
679b8e80941Smrg * 16-19	| 20 (5 * 4)
680b8e80941Smrg *
681b8e80941Smrg * Note that padded_num_vertices is a multiple of four (presumably because
682b8e80941Smrg * threads are dispatched in groups of 4). Also, padded_num_vertices is always
683b8e80941Smrg * at least one more than num_vertices, which seems like a quirk of the
684b8e80941Smrg * hardware. For larger num_vertices, the hardware uses the following
685b8e80941Smrg * algorithm: using the binary representation of num_vertices, we look at the
686b8e80941Smrg * most significant set bit as well as the following 3 bits. Let n be the
687b8e80941Smrg * number of bits after those 4 bits. Then we set padded_num_vertices according
688b8e80941Smrg * to the following table:
689b8e80941Smrg *
690b8e80941Smrg * high bits	| padded_num_vertices
691b8e80941Smrg * 1000		| 9 * 2^n
692b8e80941Smrg * 1001		| 5 * 2^(n+1)
693b8e80941Smrg * 101x		| 3 * 2^(n+2)
694b8e80941Smrg * 110x		| 7 * 2^(n+1)
695b8e80941Smrg * 111x		| 2^(n+4)
696b8e80941Smrg *
697b8e80941Smrg * For example, if num_vertices = 70 is passed to glDraw(), its binary
698b8e80941Smrg * representation is 1000110, so n = 3 and the high bits are 1000, and
699b8e80941Smrg * therefore padded_num_vertices = 9 * 2^3 = 72.
700b8e80941Smrg *
 * The attribute unit works in terms of the original linear_id. If
 * num_instances = 1, then they are the same, and everything is simple.
703b8e80941Smrg * However, with instancing things get more complicated. There are four
704b8e80941Smrg * possible modes, two of them we can group together:
705b8e80941Smrg *
706b8e80941Smrg * 1. Use the linear_id directly. Only used when there is no instancing.
707b8e80941Smrg *
708b8e80941Smrg * 2. Use the linear_id modulo a constant. This is used for per-vertex
709b8e80941Smrg * attributes with instancing enabled by making the constant equal
710b8e80941Smrg * padded_num_vertices. Because the modulus is always padded_num_vertices, this
711b8e80941Smrg * mode only supports a modulus that is a power of 2 times 1, 3, 5, 7, or 9.
712b8e80941Smrg * The shift field specifies the power of two, while the extra_flags field
713b8e80941Smrg * specifies the odd number. If shift = n and extra_flags = m, then the modulus
714b8e80941Smrg * is (2m + 1) * 2^n. As an example, if num_vertices = 70, then as computed
715b8e80941Smrg * above, padded_num_vertices = 9 * 2^3, so we should set extra_flags = 4 and
716b8e80941Smrg * shift = 3. Note that we must exactly follow the hardware algorithm used to
717b8e80941Smrg * get padded_num_vertices in order to correctly implement per-vertex
718b8e80941Smrg * attributes.
719b8e80941Smrg *
720b8e80941Smrg * 3. Divide the linear_id by a constant. In order to correctly implement
 * instance divisors, we have to divide linear_id by padded_num_vertices times
 * the user-specified divisor. So first we compute padded_num_vertices, again
723b8e80941Smrg * following the exact same algorithm that the hardware uses, then multiply it
724b8e80941Smrg * by the GL-level divisor to get the hardware-level divisor. This case is
725b8e80941Smrg * further divided into two more cases. If the hardware-level divisor is a
726b8e80941Smrg * power of two, then we just need to shift. The shift amount is specified by
727b8e80941Smrg * the shift field, so that the hardware-level divisor is just 2^shift.
728b8e80941Smrg *
729b8e80941Smrg * If it isn't a power of two, then we have to divide by an arbitrary integer.
730b8e80941Smrg * For that, we use the well-known technique of multiplying by an approximation
731b8e80941Smrg * of the inverse. The driver must compute the magic multiplier and shift
732b8e80941Smrg * amount, and then the hardware does the multiplication and shift. The
733b8e80941Smrg * hardware and driver also use the "round-down" optimization as described in
734b8e80941Smrg * http://ridiculousfish.com/files/faster_unsigned_division_by_constants.pdf.
735b8e80941Smrg * The hardware further assumes the multiplier is between 2^31 and 2^32, so the
736b8e80941Smrg * high bit is implicitly set to 1 even though it is set to 0 by the driver --
737b8e80941Smrg * presumably this simplifies the hardware multiplier a little. The hardware
738b8e80941Smrg * first multiplies linear_id by the multiplier and takes the high 32 bits,
739b8e80941Smrg * then applies the round-down correction if extra_flags = 1, then finally
740b8e80941Smrg * shifts right by the shift field.
741b8e80941Smrg *
742b8e80941Smrg * There are some differences between ridiculousfish's algorithm and the Mali
743b8e80941Smrg * hardware algorithm, which means that the reference code from ridiculousfish
744b8e80941Smrg * doesn't always produce the right constants. Mali does not use the pre-shift
745b8e80941Smrg * optimization, since that would make a hardware implementation slower (it
746b8e80941Smrg * would have to always do the pre-shift, multiply, and post-shift operations).
 * It also forces the multiplier to be at least 2^31, which means that the
748b8e80941Smrg * exponent is entirely fixed, so there is no trial-and-error. Altogether,
749b8e80941Smrg * given the divisor d, the algorithm the driver must follow is:
750b8e80941Smrg *
751b8e80941Smrg * 1. Set shift = floor(log2(d)).
752b8e80941Smrg * 2. Compute m = ceil(2^(shift + 32) / d) and e = 2^(shift + 32) % d.
753b8e80941Smrg * 3. If e <= 2^shift, then we need to use the round-down algorithm. Set
754b8e80941Smrg * magic_divisor = m - 1 and extra_flags = 1.
755b8e80941Smrg * 4. Otherwise, set magic_divisor = m and extra_flags = 0.
756b8e80941Smrg */
757b8e80941Smrg
/* Addressing mode of an attribute buffer entry; it is stored in the bottom
 * 3 bits of mali_attr.elements. The "Mali Attributes" comment above explains
 * how the linear, modulo and divide modes translate a linear thread index
 * into an element index. An NPOT_DIVIDE entry is followed by a second entry
 * holding the magic divisor. */
enum mali_attr_mode {
        MALI_ATTR_UNUSED = 0,
        MALI_ATTR_LINEAR = 1,
        MALI_ATTR_POT_DIVIDE = 2,
        MALI_ATTR_MODULO = 3,
        MALI_ATTR_NPOT_DIVIDE = 4,
};
765b8e80941Smrg
/* Magic "pseudo-address" stored in the `elements` field to implement
 * gl_PointCoord. When read from a fragment shader, it generates a point
 * coordinate per the OpenGL ES 2.0 specification. Flipped coordinate spaces
 * require an affine transformation in the shader. */

#define MALI_VARYING_POINT_COORD (0x60)
772b8e80941Smrg
773b8e80941Smrgunion mali_attr {
774b8e80941Smrg	/* This is used for actual attributes. */
775b8e80941Smrg	struct {
776b8e80941Smrg		/* The bottom 3 bits are the mode */
777b8e80941Smrg		mali_ptr elements : 64 - 8;
778b8e80941Smrg		u32 shift : 5;
779b8e80941Smrg		u32 extra_flags : 3;
780b8e80941Smrg		u32 stride;
781b8e80941Smrg		u32 size;
782b8e80941Smrg	};
783b8e80941Smrg	/* The entry after an NPOT_DIVIDE entry has this format. It stores
784b8e80941Smrg	 * extra information that wouldn't fit in a normal entry.
785b8e80941Smrg	 */
786b8e80941Smrg	struct {
787b8e80941Smrg		u32 unk; /* = 0x20 */
788b8e80941Smrg		u32 magic_divisor;
789b8e80941Smrg		u32 zero;
790b8e80941Smrg		/* This is the original, GL-level divisor. */
791b8e80941Smrg		u32 divisor;
792b8e80941Smrg	};
793b8e80941Smrg} __attribute__((packed));
794b8e80941Smrg
795b8e80941Smrgstruct mali_attr_meta {
796b8e80941Smrg        /* Vertex buffer index */
797b8e80941Smrg        u8 index;
798b8e80941Smrg
799b8e80941Smrg        unsigned unknown1 : 2;
800b8e80941Smrg        unsigned swizzle : 12;
801b8e80941Smrg        enum mali_format format : 8;
802b8e80941Smrg
803b8e80941Smrg        /* Always observed to be zero at the moment */
804b8e80941Smrg        unsigned unknown3 : 2;
805b8e80941Smrg
806b8e80941Smrg        /* When packing multiple attributes in a buffer, offset addresses by this value */
807b8e80941Smrg        uint32_t src_offset;
808b8e80941Smrg} __attribute__((packed));
809b8e80941Smrg
/* Framebuffer descriptor flavours. The names abbreviate what is presumably
 * "single" and "multiple" framebuffer descriptor -- TODO confirm. */
enum mali_fbd_type {
        MALI_SFBD = 0,
        MALI_MFBD = 1,
};
814b8e80941Smrg
/* Bit masks for a tagged framebuffer descriptor pointer: FBD_TYPE is bit 0
 * (presumably holding an enum mali_fbd_type -- TODO confirm) and FBD_MASK
 * clears the low 6 bits.
 *
 * FBD_MASK is a negative `int` (-64), so it sign-extends when combined with
 * a 64-bit pointer and leaves the upper 32 bits intact. Do not turn it into
 * an unsigned 32-bit constant: that would zero the top half of the
 * pointer. */
#define FBD_TYPE (1)
#define FBD_MASK (~0x3f)
817b8e80941Smrg
818b8e80941Smrgstruct mali_uniform_buffer_meta {
819b8e80941Smrg        /* This is actually the size minus 1 (MALI_POSITIVE), in units of 16
820b8e80941Smrg         * bytes. This gives a maximum of 2^14 bytes, which just so happens to
821b8e80941Smrg         * be the GL minimum-maximum for GL_MAX_UNIFORM_BLOCK_SIZE.
822b8e80941Smrg         */
823b8e80941Smrg        u64 size : 10;
824b8e80941Smrg
825b8e80941Smrg        /* This is missing the bottom 2 bits and top 8 bits. The top 8 bits
826b8e80941Smrg         * should be 0 for userspace pointers, according to
827b8e80941Smrg         * https://lwn.net/Articles/718895/. By reusing these bits, we can make
828b8e80941Smrg         * each entry in the table only 64 bits.
829b8e80941Smrg         */
830b8e80941Smrg        mali_ptr ptr : 64 - 10;
831b8e80941Smrg};
832b8e80941Smrg
833b8e80941Smrg/* On Bifrost, these fields are the same between the vertex and tiler payloads.
834b8e80941Smrg * They also seem to be the same between Bifrost and Midgard. They're shared in
835b8e80941Smrg * fused payloads.
836b8e80941Smrg */
837b8e80941Smrg
/* Flags for the unknown_draw field of struct mali_vertex_tiler_prefix.
 *
 * The three INDEXED values occupy bits 4-5 (UINT32 == UINT8 | UINT16), so
 * they look like one 2-bit index-size selector rather than independent
 * flags -- TODO confirm. */

#define MALI_DRAW_INDEXED_UINT8  (0x10)
#define MALI_DRAW_INDEXED_UINT16 (0x20)
#define MALI_DRAW_INDEXED_UINT32 (0x30)
#define MALI_DRAW_VARYING_SIZE   (0x100)
#define MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX (0x10000)
845b8e80941Smrg
846b8e80941Smrgstruct mali_vertex_tiler_prefix {
847b8e80941Smrg        /* This is a dynamic bitfield containing the following things in this order:
848b8e80941Smrg         *
849b8e80941Smrg         * - gl_WorkGroupSize.x
850b8e80941Smrg         * - gl_WorkGroupSize.y
851b8e80941Smrg         * - gl_WorkGroupSize.z
852b8e80941Smrg         * - gl_NumWorkGroups.x
853b8e80941Smrg         * - gl_NumWorkGroups.y
854b8e80941Smrg         * - gl_NumWorkGroups.z
855b8e80941Smrg         *
856b8e80941Smrg         * The number of bits allocated for each number is based on the *_shift
857b8e80941Smrg         * fields below. For example, workgroups_y_shift gives the bit that
858b8e80941Smrg         * gl_NumWorkGroups.y starts at, and workgroups_z_shift gives the bit
859b8e80941Smrg         * that gl_NumWorkGroups.z starts at (and therefore one after the bit
860b8e80941Smrg         * that gl_NumWorkGroups.y ends at). The actual value for each gl_*
861b8e80941Smrg         * value is one more than the stored value, since if any of the values
862b8e80941Smrg         * are zero, then there would be no invocations (and hence no job). If
863b8e80941Smrg         * there were 0 bits allocated to a given field, then it must be zero,
864b8e80941Smrg         * and hence the real value is one.
865b8e80941Smrg         *
866b8e80941Smrg         * Vertex jobs reuse the same job dispatch mechanism as compute jobs,
867b8e80941Smrg         * effectively doing glDispatchCompute(1, vertex_count, instance_count)
868b8e80941Smrg         * where vertex count is the number of vertices.
869b8e80941Smrg         */
870b8e80941Smrg        u32 invocation_count;
871b8e80941Smrg
872b8e80941Smrg        u32 size_y_shift : 5;
873b8e80941Smrg        u32 size_z_shift : 5;
874b8e80941Smrg        u32 workgroups_x_shift : 6;
875b8e80941Smrg        u32 workgroups_y_shift : 6;
876b8e80941Smrg        u32 workgroups_z_shift : 6;
877b8e80941Smrg        /* This is max(workgroups_x_shift, 2) in all the cases I've seen. */
878b8e80941Smrg        u32 workgroups_x_shift_2 : 4;
879b8e80941Smrg
880b8e80941Smrg        u32 draw_mode : 4;
881b8e80941Smrg        u32 unknown_draw : 22;
882b8e80941Smrg
883b8e80941Smrg        /* This is the the same as workgroups_x_shift_2 in compute shaders, but
884b8e80941Smrg         * always 5 for vertex jobs and 6 for tiler jobs. I suspect this has
885b8e80941Smrg         * something to do with how many quads get put in the same execution
886b8e80941Smrg         * engine, which is a balance (you don't want to starve the engine, but
887b8e80941Smrg         * you also want to distribute work evenly).
888b8e80941Smrg         */
889b8e80941Smrg        u32 workgroups_x_shift_3 : 6;
890b8e80941Smrg
891b8e80941Smrg
892b8e80941Smrg        /* Negative of draw_start for TILER jobs from what I've seen */
893b8e80941Smrg        int32_t negative_start;
894b8e80941Smrg        u32 zero1;
895b8e80941Smrg
896b8e80941Smrg        /* Like many other strictly nonzero quantities, index_count is
897b8e80941Smrg         * subtracted by one. For an indexed cube, this is equal to 35 = 6
898b8e80941Smrg         * faces * 2 triangles/per face * 3 vertices/per triangle - 1. That is,
899b8e80941Smrg         * for an indexed draw, index_count is the number of actual vertices
900b8e80941Smrg         * rendered whereas invocation_count is the number of unique vertices
901b8e80941Smrg         * rendered (the number of times the vertex shader must be invoked).
902b8e80941Smrg         * For non-indexed draws, this is just equal to invocation_count. */
903b8e80941Smrg
904b8e80941Smrg        u32 index_count;
905b8e80941Smrg
906b8e80941Smrg        /* No hidden structure; literally just a pointer to an array of uint
907b8e80941Smrg         * indices (width depends on flags). Thanks, guys, for not making my
908b8e80941Smrg         * life insane for once! NULL for non-indexed draws. */
909b8e80941Smrg
910b8e80941Smrg        uintptr_t indices;
911b8e80941Smrg} __attribute__((packed));
912b8e80941Smrg
/* Point size / line width can either be specified as a 32-bit float (for
 * constant size) or as a [machine word size]-bit GPU pointer (for varying
 * size). If a pointer is selected, by setting the MALI_DRAW_VARYING_SIZE bit
 * in the tiler payload, the contents of `pointer` will be interpreted as an
 * array of fp16 sizes, one for each vertex. gl_PointSize is therefore
 * implemented by creating a special MALI_R16F varying writing to that
 * address. */

union midgard_primitive_size {
        float constant;
        uintptr_t pointer;
};
924b8e80941Smrg
925b8e80941Smrgstruct bifrost_vertex_only {
926b8e80941Smrg        u32 unk2; /* =0x2 */
927b8e80941Smrg
928b8e80941Smrg        u32 zero0;
929b8e80941Smrg
930b8e80941Smrg        u64 zero1;
931b8e80941Smrg} __attribute__((packed));
932b8e80941Smrg
933b8e80941Smrgstruct bifrost_tiler_heap_meta {
934b8e80941Smrg        u32 zero;
935b8e80941Smrg        u32 heap_size;
936b8e80941Smrg        /* note: these are just guesses! */
937b8e80941Smrg        mali_ptr tiler_heap_start;
938b8e80941Smrg        mali_ptr tiler_heap_free;
939b8e80941Smrg        mali_ptr tiler_heap_end;
940b8e80941Smrg
941b8e80941Smrg        /* hierarchy weights? but they're still 0 after the job has run... */
942b8e80941Smrg        u32 zeros[12];
943b8e80941Smrg} __attribute__((packed));
944b8e80941Smrg
945b8e80941Smrgstruct bifrost_tiler_meta {
946b8e80941Smrg        u64 zero0;
947b8e80941Smrg        u32 unk; // = 0xf0
948b8e80941Smrg        u16 width;
949b8e80941Smrg        u16 height;
950b8e80941Smrg        u64 zero1;
951b8e80941Smrg        mali_ptr tiler_heap_meta;
952b8e80941Smrg        /* TODO what is this used for? */
953b8e80941Smrg        u64 zeros[20];
954b8e80941Smrg} __attribute__((packed));
955b8e80941Smrg
956b8e80941Smrgstruct bifrost_tiler_only {
957b8e80941Smrg        /* 0x20 */
958b8e80941Smrg        union midgard_primitive_size primitive_size;
959b8e80941Smrg
960b8e80941Smrg        mali_ptr tiler_meta;
961b8e80941Smrg
962b8e80941Smrg        u64 zero1, zero2, zero3, zero4, zero5, zero6;
963b8e80941Smrg
964b8e80941Smrg        u32 gl_enables;
965b8e80941Smrg        u32 zero7;
966b8e80941Smrg        u64 zero8;
967b8e80941Smrg} __attribute__((packed));
968b8e80941Smrg
969b8e80941Smrgstruct bifrost_scratchpad {
970b8e80941Smrg        u32 zero;
971b8e80941Smrg        u32 flags; // = 0x1f
972b8e80941Smrg        /* This is a pointer to a CPU-inaccessible buffer, 16 pages, allocated
973b8e80941Smrg         * during startup. It seems to serve the same purpose as the
974b8e80941Smrg         * gpu_scratchpad in the SFBD for Midgard, although it's slightly
975b8e80941Smrg         * larger.
976b8e80941Smrg         */
977b8e80941Smrg        mali_ptr gpu_scratchpad;
978b8e80941Smrg} __attribute__((packed));
979b8e80941Smrg
980b8e80941Smrgstruct mali_vertex_tiler_postfix {
981b8e80941Smrg        /* Zero for vertex jobs. Pointer to the position (gl_Position) varying
982b8e80941Smrg         * output from the vertex shader for tiler jobs.
983b8e80941Smrg         */
984b8e80941Smrg
985b8e80941Smrg        uintptr_t position_varying;
986b8e80941Smrg
987b8e80941Smrg        /* An array of mali_uniform_buffer_meta's. The size is given by the
988b8e80941Smrg         * shader_meta.
989b8e80941Smrg         */
990b8e80941Smrg        uintptr_t uniform_buffers;
991b8e80941Smrg
992b8e80941Smrg        /* This is a pointer to an array of pointers to the texture
993b8e80941Smrg         * descriptors, number of pointers bounded by number of textures. The
994b8e80941Smrg         * indirection is needed to accomodate varying numbers and sizes of
995b8e80941Smrg         * texture descriptors */
996b8e80941Smrg        uintptr_t texture_trampoline;
997b8e80941Smrg
998b8e80941Smrg        /* For OpenGL, from what I've seen, this is intimately connected to
999b8e80941Smrg         * texture_meta. cwabbott says this is not the case under Vulkan, hence
1000b8e80941Smrg         * why this field is seperate (Midgard is Vulkan capable). Pointer to
1001b8e80941Smrg         * array of sampler descriptors (which are uniform in size) */
1002b8e80941Smrg        uintptr_t sampler_descriptor;
1003b8e80941Smrg
1004b8e80941Smrg        uintptr_t uniforms;
1005b8e80941Smrg        u8 flags : 4;
1006b8e80941Smrg        uintptr_t _shader_upper : MALI_SHORT_PTR_BITS - 4; /* struct shader_meta */
1007b8e80941Smrg        uintptr_t attributes; /* struct attribute_buffer[] */
1008b8e80941Smrg        uintptr_t attribute_meta; /* attribute_meta[] */
1009b8e80941Smrg        uintptr_t varyings; /* struct attr */
1010b8e80941Smrg        uintptr_t varying_meta; /* pointer */
1011b8e80941Smrg        uintptr_t viewport;
1012b8e80941Smrg        uintptr_t occlusion_counter; /* A single bit as far as I can tell */
1013b8e80941Smrg
1014b8e80941Smrg        /* Note: on Bifrost, this isn't actually the FBD. It points to
1015b8e80941Smrg         * bifrost_scratchpad instead. However, it does point to the same thing
1016b8e80941Smrg         * in vertex and tiler jobs.
1017b8e80941Smrg         */
1018b8e80941Smrg        mali_ptr framebuffer;
1019b8e80941Smrg
1020b8e80941Smrg#ifdef __LP64__
1021b8e80941Smrg#ifdef BIFROST
1022b8e80941Smrg        /* most likely padding to make this a multiple of 64 bytes */
1023b8e80941Smrg        u64 zero7;
1024b8e80941Smrg#endif
1025b8e80941Smrg#endif
1026b8e80941Smrg} __attribute__((packed));
1027b8e80941Smrg
1028b8e80941Smrgstruct midgard_payload_vertex_tiler {
1029b8e80941Smrg#ifndef __LP64__
1030b8e80941Smrg        union midgard_primitive_size primitive_size;
1031b8e80941Smrg#endif
1032b8e80941Smrg
1033b8e80941Smrg        struct mali_vertex_tiler_prefix prefix;
1034b8e80941Smrg
1035b8e80941Smrg#ifndef __LP64__
1036b8e80941Smrg        u32 zero3;
1037b8e80941Smrg#endif
1038b8e80941Smrg
1039b8e80941Smrg        u32 gl_enables; // 0x5
1040b8e80941Smrg
1041b8e80941Smrg        /* Offset for first vertex in buffer */
1042b8e80941Smrg        u32 draw_start;
1043b8e80941Smrg
1044b8e80941Smrg	uintptr_t zero5;
1045b8e80941Smrg
1046b8e80941Smrg        struct mali_vertex_tiler_postfix postfix;
1047b8e80941Smrg
1048b8e80941Smrg#ifdef __LP64__
1049b8e80941Smrg        union midgard_primitive_size primitive_size;
1050b8e80941Smrg#endif
1051b8e80941Smrg} __attribute__((packed));
1052b8e80941Smrg
1053b8e80941Smrgstruct bifrost_payload_vertex {
1054b8e80941Smrg        struct mali_vertex_tiler_prefix prefix;
1055b8e80941Smrg        struct bifrost_vertex_only vertex;
1056b8e80941Smrg        struct mali_vertex_tiler_postfix postfix;
1057b8e80941Smrg} __attribute__((packed));
1058b8e80941Smrg
1059b8e80941Smrgstruct bifrost_payload_tiler {
1060b8e80941Smrg        struct mali_vertex_tiler_prefix prefix;
1061b8e80941Smrg        struct bifrost_tiler_only tiler;
1062b8e80941Smrg        struct mali_vertex_tiler_postfix postfix;
1063b8e80941Smrg} __attribute__((packed));
1064b8e80941Smrg
1065b8e80941Smrgstruct bifrost_payload_fused {
1066b8e80941Smrg        struct mali_vertex_tiler_prefix prefix;
1067b8e80941Smrg        struct bifrost_tiler_only tiler;
1068b8e80941Smrg        struct mali_vertex_tiler_postfix tiler_postfix;
1069b8e80941Smrg        struct bifrost_vertex_only vertex;
1070b8e80941Smrg        struct mali_vertex_tiler_postfix vertex_postfix;
1071b8e80941Smrg} __attribute__((packed));
1072b8e80941Smrg
1073b8e80941Smrg/* Pointed to from texture_trampoline, mostly unknown still, haven't
1074b8e80941Smrg * managed to replay successfully */
1075b8e80941Smrg
1076b8e80941Smrg/* Purposeful off-by-one in width, height fields. For example, a (64, 64)
1077b8e80941Smrg * texture is stored as (63, 63) in these fields. This adjusts for that.
1078b8e80941Smrg * There's an identical pattern in the framebuffer descriptor. Even vertex
1079b8e80941Smrg * count fields work this way, hence the generic name -- integral fields that
1080b8e80941Smrg * are strictly positive generally need this adjustment. */
1081b8e80941Smrg
/* Encode a strictly positive quantity as (value - 1). The argument is
 * parenthesized so that operators binding more loosely than `-` (shifts,
 * bitwise ops, ?:) in the caller's expression are evaluated first. */
#define MALI_POSITIVE(dim) ((dim) - 1)
1083b8e80941Smrg
1084b8e80941Smrg/* Opposite of MALI_POSITIVE, found in the depth_units field */
1085b8e80941Smrg
/* Encode a quantity as (value + 1). The argument is parenthesized so that
 * operators binding more loosely than `+` in the caller's expression are
 * evaluated first. */
#define MALI_NEGATIVE(dim) ((dim) + 1)
1087b8e80941Smrg
/* Texture coordinate wrap modes, stored in the 4-bit wrap_s / wrap_t /
 * wrap_r fields of struct mali_sampler_descriptor. Incomplete: the field has
 * room for 16 encodings and only these four are known. */

enum mali_wrap_mode {
        MALI_WRAP_REPEAT = 0x8,
        MALI_WRAP_CLAMP_TO_EDGE = 0x9,
        MALI_WRAP_CLAMP_TO_BORDER = 0xB,
        MALI_WRAP_MIRRORED_REPEAT = 0xC
};
1096b8e80941Smrg
/* Upper bounds used to size mali_texture_descriptor.swizzled_bitmaps, which
 * holds MAX_MIP_LEVELS * MAX_FACES pointers. */

/* 8192x8192 */
#define MAX_MIP_LEVELS (13)

/* Cubemap bloats everything up */
#define MAX_FACES (6)
1102b8e80941Smrg
1103b8e80941Smrg/* Corresponds to the type passed to glTexImage2D and so forth */
1104b8e80941Smrg
1105b8e80941Smrgstruct mali_texture_format {
1106b8e80941Smrg        unsigned swizzle : 12;
1107b8e80941Smrg        enum mali_format format : 8;
1108b8e80941Smrg
1109b8e80941Smrg        unsigned usage1 : 3;
1110b8e80941Smrg        unsigned is_not_cubemap : 1;
1111b8e80941Smrg        unsigned usage2 : 8;
1112b8e80941Smrg} __attribute__((packed));
1113b8e80941Smrg
1114b8e80941Smrgstruct mali_texture_descriptor {
1115b8e80941Smrg        uint16_t width;
1116b8e80941Smrg        uint16_t height;
1117b8e80941Smrg        uint16_t depth;
1118b8e80941Smrg
1119b8e80941Smrg        uint16_t unknown1;
1120b8e80941Smrg
1121b8e80941Smrg        struct mali_texture_format format;
1122b8e80941Smrg
1123b8e80941Smrg        uint16_t unknown3;
1124b8e80941Smrg
1125b8e80941Smrg        /* One for non-mipmapped, zero for mipmapped */
1126b8e80941Smrg        uint8_t unknown3A;
1127b8e80941Smrg
1128b8e80941Smrg        /* Zero for non-mipmapped, (number of levels - 1) for mipmapped */
1129b8e80941Smrg        uint8_t nr_mipmap_levels;
1130b8e80941Smrg
1131b8e80941Smrg        /* Swizzling is a single 32-bit word, broken up here for convenience.
1132b8e80941Smrg         * Here, swizzling refers to the ES 3.0 texture parameters for channel
1133b8e80941Smrg         * level swizzling, not the internal pixel-level swizzling which is
1134b8e80941Smrg         * below OpenGL's reach */
1135b8e80941Smrg
1136b8e80941Smrg        unsigned swizzle : 12;
1137b8e80941Smrg        unsigned swizzle_zero       : 20;
1138b8e80941Smrg
1139b8e80941Smrg        uint32_t unknown5;
1140b8e80941Smrg        uint32_t unknown6;
1141b8e80941Smrg        uint32_t unknown7;
1142b8e80941Smrg
1143b8e80941Smrg        mali_ptr swizzled_bitmaps[MAX_MIP_LEVELS * MAX_FACES];
1144b8e80941Smrg} __attribute__((packed));
1145b8e80941Smrg
/* Used as part of filter_mode */

#define MALI_LINEAR 0
#define MALI_NEAREST 1
#define MALI_MIP_LINEAR (0x18)

/* Used to construct low bits of filter_mode: the magnification filter lives
 * in bit 0 and the minification filter in bit 1. Only the lowest bit of
 * `mode` is used. */

#define MALI_TEX_MAG(mode) (((mode) & 1) << 0)
#define MALI_TEX_MIN(mode) (((mode) & 1) << 1)

#define MALI_TEX_MAG_MASK (1)
#define MALI_TEX_MIN_MASK (2)

/* Printable name of a single filter bit (non-zero means MALI_NEAREST). The
 * argument is parenthesized so that an expression such as an assignment or a
 * conditional is tested as a whole. */
#define MALI_FILTER_NAME(filter) ((filter) ? "MALI_NEAREST" : "MALI_LINEAR")
1161b8e80941Smrg
/* Used for lod encoding. Thanks @urjaman for pointing out these routines can
 * be cleaned up a lot. */

/* Convert an 8.8 fixed-point value back to float. The argument is
 * parenthesized so the whole expression is divided, not only its last
 * operand. */
#define DECODE_FIXED_16(x) ((float) ((x) / 256.0))
1166b8e80941Smrg
/* Encode a level-of-detail value as unsigned 8.8 fixed point.
 *
 * The input is clamped to [0, 32 - 1/512] and then scaled by 256; the
 * fractional part of the product is truncated, not rounded, so the largest
 * value returned is 8191. NaN is encoded as 0.
 */
static inline uint16_t
FIXED_16(float x)
{
        /* Clamp inputs, accounting for float error */
        const float max_lod = (32.0 - (1.0 / 512.0));

        /* NaN fails both comparisons below and would reach the float-to-int
         * conversion unchanged, which is undefined behaviour. */
        if (x != x)
                return 0;

        if (x > max_lod)
                x = max_lod;
        else if (x < 0.0)
                x = 0.0;

        return (int) (x * 256.0);
}
1177b8e80941Smrg
1178b8e80941Smrgstruct mali_sampler_descriptor {
1179b8e80941Smrg        uint32_t filter_mode;
1180b8e80941Smrg
1181b8e80941Smrg        /* Fixed point. Upper 8-bits is before the decimal point, although it
1182b8e80941Smrg         * caps [0-31]. Lower 8-bits is after the decimal point: int(round(x *
1183b8e80941Smrg         * 256)) */
1184b8e80941Smrg
1185b8e80941Smrg        uint16_t min_lod;
1186b8e80941Smrg        uint16_t max_lod;
1187b8e80941Smrg
1188b8e80941Smrg        /* All one word in reality, but packed a bit */
1189b8e80941Smrg
1190b8e80941Smrg        enum mali_wrap_mode wrap_s : 4;
1191b8e80941Smrg        enum mali_wrap_mode wrap_t : 4;
1192b8e80941Smrg        enum mali_wrap_mode wrap_r : 4;
1193b8e80941Smrg        enum mali_alt_func compare_func : 3;
1194b8e80941Smrg
1195b8e80941Smrg        /* A single set bit of unknown, ha! */
1196b8e80941Smrg        unsigned unknown2 : 1;
1197b8e80941Smrg
1198b8e80941Smrg        unsigned zero : 16;
1199b8e80941Smrg
1200b8e80941Smrg        uint32_t zero2;
1201b8e80941Smrg        float border_color[4];
1202b8e80941Smrg} __attribute__((packed));
1203b8e80941Smrg
1204b8e80941Smrg/* TODO: What are the floats? Apparently always { -inf, -inf, inf, inf },
1205b8e80941Smrg * unless the scissor test is enabled.
1206b8e80941Smrg *
1207b8e80941Smrg * viewport0/viewport1 form the arguments to glViewport. viewport1 is modified
1208b8e80941Smrg * by MALI_POSITIVE; viewport0 is as-is.
1209b8e80941Smrg */
1210b8e80941Smrg
1211b8e80941Smrgstruct mali_viewport {
1212b8e80941Smrg        /* XY clipping planes */
1213b8e80941Smrg        float clip_minx;
1214b8e80941Smrg        float clip_miny;
1215b8e80941Smrg        float clip_maxx;
1216b8e80941Smrg        float clip_maxy;
1217b8e80941Smrg
1218b8e80941Smrg        /* Depth clipping planes */
1219b8e80941Smrg        float clip_minz;
1220b8e80941Smrg        float clip_maxz;
1221b8e80941Smrg
1222b8e80941Smrg        u16 viewport0[2];
1223b8e80941Smrg        u16 viewport1[2];
1224b8e80941Smrg} __attribute__((packed));
1225b8e80941Smrg
/* From presentations, 16x16 tiles externally. Use shift for fast computation
 * of tile numbers. */

#define MALI_TILE_SHIFT 4
#define MALI_TILE_LENGTH (1 << MALI_TILE_SHIFT)

/* Tile coordinates are stored as a compact u32, as only 12 bits are needed
 * for each component. Notice that this provides a theoretical upper bound of
 * (1 << 12) = 4096 tiles in each direction, addressing a maximum framebuffer
 * of size 65536x65536. Multiplying that together, times another four given
 * that Mali framebuffers are 32-bit ARGB8888, means that this upper bound
 * would take 16 gigabytes of RAM just to store the uncompressed framebuffer
 * itself, let alone rendering in real-time to such a buffer.
 *
 * Nice job, guys.*/

/* From mali_kbase_10969_workaround.c */
#define MALI_X_COORD_MASK 0x00000FFF
#define MALI_Y_COORD_MASK 0x0FFF0000

/* Extract parts of a tile coordinate */

#define MALI_TILE_COORD_X(coord) ((coord) & MALI_X_COORD_MASK)
#define MALI_TILE_COORD_Y(coord) (((coord) & MALI_Y_COORD_MASK) >> 16)
#define MALI_TILE_COORD_FLAGS(coord) ((coord) & ~(MALI_X_COORD_MASK | MALI_Y_COORD_MASK))

/* No known flags yet, but just in case...? */

#define MALI_TILE_NO_FLAG (0)

/* Helpers to generate tile coordinates based on the boundary coordinates in
 * screen space. So, with the bounds (0, 0) to (128, 128) for the screen, these
 * functions would convert it to the bounding tiles (0, 0) to (7, 7).
 * Intentional "off-by-one"; finding the tile number is a form of fencepost
 * problem.
 *
 * MALI_BOUND_TO_TILE parenthesizes both arguments so that compound
 * expressions (for example a conditional bound or a summed bias) are
 * evaluated before the subtraction. */

#define MALI_MAKE_TILE_COORDS(X, Y) ((X) | ((Y) << 16))
#define MALI_BOUND_TO_TILE(B, bias) (((B) - (bias)) >> MALI_TILE_SHIFT)
#define MALI_COORDINATE_TO_TILE(W, H, bias) MALI_MAKE_TILE_COORDS(MALI_BOUND_TO_TILE(W, bias), MALI_BOUND_TO_TILE(H, bias))
#define MALI_COORDINATE_TO_TILE_MIN(W, H) MALI_COORDINATE_TO_TILE(W, H, 0)
#define MALI_COORDINATE_TO_TILE_MAX(W, H) MALI_COORDINATE_TO_TILE(W, H, 1)
1267b8e80941Smrg
1268b8e80941Smrgstruct mali_payload_fragment {
1269b8e80941Smrg        u32 min_tile_coord;
1270b8e80941Smrg        u32 max_tile_coord;
1271b8e80941Smrg        mali_ptr framebuffer;
1272b8e80941Smrg} __attribute__((packed));
1273b8e80941Smrg
1274b8e80941Smrg/* (Single?) Framebuffer Descriptor */
1275b8e80941Smrg
/* MSAA flags, applied to the format. Setting only MSAA_A and MSAA_B
 * configures the framebuffer for 4x; setting MSAA_8 configures it for 8x. */

#define MALI_FRAMEBUFFER_MSAA_8 0x00000008 /* bit 3 */
#define MALI_FRAMEBUFFER_MSAA_A 0x00000010 /* bit 4 */
#define MALI_FRAMEBUFFER_MSAA_B 0x00800000 /* bit 23 */
1282b8e80941Smrg
/* Clear flags. Fast/slow is chosen based on whether all three buffers are
 * cleared at once.
 *
 * The constants are unsigned on purpose: (1 << 31) on a 32-bit int shifts
 * into the sign bit, which is undefined behaviour in C, and the resulting
 * negative value would sign-extend if it were ever widened to 64 bits. */

#define MALI_CLEAR_FAST         (1u << 18)
#define MALI_CLEAR_SLOW         (1u << 28)
#define MALI_CLEAR_SLOW_STENCIL (1u << 31)
1288b8e80941Smrg
1289b8e80941Smrgstruct mali_single_framebuffer {
1290b8e80941Smrg        u32 unknown1;
1291b8e80941Smrg        u32 unknown2;
1292b8e80941Smrg        u64 unknown_address_0;
1293b8e80941Smrg        u64 zero1;
1294b8e80941Smrg        u64 zero0;
1295b8e80941Smrg
1296b8e80941Smrg        /* Exact format is ironically not known, since EGL is finnicky with the
1297b8e80941Smrg         * blob. MSAA, colourspace, etc are configured here. */
1298b8e80941Smrg
1299b8e80941Smrg        u32 format;
1300b8e80941Smrg
1301b8e80941Smrg        u32 clear_flags;
1302b8e80941Smrg        u32 zero2;
1303b8e80941Smrg
1304b8e80941Smrg        /* Purposeful off-by-one in these fields should be accounted for by the
1305b8e80941Smrg         * MALI_DIMENSION macro */
1306b8e80941Smrg
1307b8e80941Smrg        u16 width;
1308b8e80941Smrg        u16 height;
1309b8e80941Smrg
1310b8e80941Smrg        u32 zero3[8];
1311b8e80941Smrg
1312b8e80941Smrg        /* By default, the framebuffer is upside down from OpenGL's
1313b8e80941Smrg         * perspective. Set framebuffer to the end and negate the stride to
1314b8e80941Smrg         * flip in the Y direction */
1315b8e80941Smrg
1316b8e80941Smrg        mali_ptr framebuffer;
1317b8e80941Smrg        int32_t stride;
1318b8e80941Smrg
1319b8e80941Smrg        u32 zero4;
1320b8e80941Smrg
1321b8e80941Smrg        /* Depth and stencil buffers are interleaved, it appears, as they are
1322b8e80941Smrg         * set to the same address in captures. Both fields set to zero if the
1323b8e80941Smrg         * buffer is not being cleared. Depending on GL_ENABLE magic, you might
1324b8e80941Smrg         * get a zero enable despite the buffer being present; that still is
1325b8e80941Smrg         * disabled. */
1326b8e80941Smrg
1327b8e80941Smrg        mali_ptr depth_buffer; // not SAME_VA
1328b8e80941Smrg        u64 depth_buffer_enable;
1329b8e80941Smrg
1330b8e80941Smrg        mali_ptr stencil_buffer; // not SAME_VA
1331b8e80941Smrg        u64 stencil_buffer_enable;
1332b8e80941Smrg
1333b8e80941Smrg        u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware
1334b8e80941Smrg        u32 clear_color_2; // always equal, but unclear function?
1335b8e80941Smrg        u32 clear_color_3; // always equal, but unclear function?
1336b8e80941Smrg        u32 clear_color_4; // always equal, but unclear function?
1337b8e80941Smrg
1338b8e80941Smrg        /* Set to zero if not cleared */
1339b8e80941Smrg
1340b8e80941Smrg        float clear_depth_1; // float32, ditto
1341b8e80941Smrg        float clear_depth_2; // float32, ditto
1342b8e80941Smrg        float clear_depth_3; // float32, ditto
1343b8e80941Smrg        float clear_depth_4; // float32, ditto
1344b8e80941Smrg
1345b8e80941Smrg        u32 clear_stencil; // Exactly as it appears in OpenGL
1346b8e80941Smrg
1347b8e80941Smrg        u32 zero6[7];
1348b8e80941Smrg
1349b8e80941Smrg        /* Very weird format, see generation code in trans_builder.c */
1350b8e80941Smrg        u32 resolution_check;
1351b8e80941Smrg
1352b8e80941Smrg        u32 tiler_flags;
1353b8e80941Smrg
1354b8e80941Smrg        u64 unknown_address_1; /* Pointing towards... a zero buffer? */
1355b8e80941Smrg        u64 unknown_address_2;
1356b8e80941Smrg
1357b8e80941Smrg        /* See mali_kbase_replay.c */
1358b8e80941Smrg        u64 tiler_heap_free;
1359b8e80941Smrg        u64 tiler_heap_end;
1360b8e80941Smrg
1361b8e80941Smrg        /* More below this, maybe */
1362b8e80941Smrg} __attribute__((packed));
1363b8e80941Smrg
/* Bits used in the flags of a render target format */

#define MALI_MFBD_FORMAT_AFBC 0x20 /* bit 5 */
#define MALI_MFBD_FORMAT_MSAA 0x80 /* bit 7 */
1368b8e80941Smrg
/* Render target format word. The bitfield widths are 32 + 3 + 2 + 11 + 12 + 4,
 * i.e. 64 bits in total, and the struct is packed. */
struct mali_rt_format {
        unsigned int unk1 : 32;
        unsigned int unk2 : 3;

        /* MALI_POSITIVE */
        unsigned int nr_channels : 2;

        unsigned int flags : 11;

        unsigned int swizzle : 12;

        unsigned int unk4 : 4;
} __attribute__((packed));
1381b8e80941Smrg
1382b8e80941Smrgstruct bifrost_render_target {
1383b8e80941Smrg        struct mali_rt_format format;
1384b8e80941Smrg
1385b8e80941Smrg        u64 zero1;
1386b8e80941Smrg
1387b8e80941Smrg        union {
1388b8e80941Smrg                struct {
1389b8e80941Smrg                        /* Stuff related to ARM Framebuffer Compression. When AFBC is enabled,
1390b8e80941Smrg                         * there is an extra metadata buffer that contains 16 bytes per tile.
1391b8e80941Smrg                         * The framebuffer needs to be the same size as before, since we don't
1392b8e80941Smrg                         * know ahead of time how much space it will take up. The
1393b8e80941Smrg                         * framebuffer_stride is set to 0, since the data isn't stored linearly
1394b8e80941Smrg                         * anymore.
1395b8e80941Smrg                         */
1396b8e80941Smrg
1397b8e80941Smrg                        mali_ptr metadata;
1398b8e80941Smrg                        u32 stride; // stride in units of tiles
1399b8e80941Smrg                        u32 unk; // = 0x20000
1400b8e80941Smrg                } afbc;
1401b8e80941Smrg
1402b8e80941Smrg                struct {
1403b8e80941Smrg                        /* Heck if I know */
1404b8e80941Smrg                        u64 unk;
1405b8e80941Smrg                        mali_ptr pointer;
1406b8e80941Smrg                } chunknown;
1407b8e80941Smrg        };
1408b8e80941Smrg
1409b8e80941Smrg        mali_ptr framebuffer;
1410b8e80941Smrg
1411b8e80941Smrg        u32 zero2 : 4;
1412b8e80941Smrg        u32 framebuffer_stride : 28; // in units of bytes
1413b8e80941Smrg        u32 zero3;
1414b8e80941Smrg
1415b8e80941Smrg        u32 clear_color_1; // RGBA8888 from glClear, actually used by hardware
1416b8e80941Smrg        u32 clear_color_2; // always equal, but unclear function?
1417b8e80941Smrg        u32 clear_color_3; // always equal, but unclear function?
1418b8e80941Smrg        u32 clear_color_4; // always equal, but unclear function?
1419b8e80941Smrg} __attribute__((packed));
1420b8e80941Smrg
1421b8e80941Smrg/* An optional part of bifrost_framebuffer. It comes between the main structure
1422b8e80941Smrg * and the array of render targets. It must be included if any of these are
1423b8e80941Smrg * enabled:
1424b8e80941Smrg *
1425b8e80941Smrg * - Transaction Elimination
1426b8e80941Smrg * - Depth/stencil
1427b8e80941Smrg * - TODO: Anything else?
1428b8e80941Smrg */
1429b8e80941Smrg
/* Values for the flags field. Note that all of these are guesses. */

#define MALI_EXTRA_PRESENT      (1 << 10)
#define MALI_EXTRA_AFBC         (1 << 5)
#define MALI_EXTRA_AFBC_ZS      (1 << 4)
#define MALI_EXTRA_ZS           (1 << 2)
1436b8e80941Smrg
1437b8e80941Smrgstruct bifrost_fb_extra {
1438b8e80941Smrg        mali_ptr checksum;
1439b8e80941Smrg        /* Each tile has an 8 byte checksum, so the stride is "width in tiles * 8" */
1440b8e80941Smrg        u32 checksum_stride;
1441b8e80941Smrg
1442b8e80941Smrg        u32 flags;
1443b8e80941Smrg
1444b8e80941Smrg        union {
1445b8e80941Smrg                /* Note: AFBC is only allowed for 24/8 combined depth/stencil. */
1446b8e80941Smrg                struct {
1447b8e80941Smrg                        mali_ptr depth_stencil_afbc_metadata;
1448b8e80941Smrg                        u32 depth_stencil_afbc_stride; // in units of tiles
1449b8e80941Smrg                        u32 zero1;
1450b8e80941Smrg
1451b8e80941Smrg                        mali_ptr depth_stencil;
1452b8e80941Smrg
1453b8e80941Smrg                        u64 padding;
1454b8e80941Smrg                } ds_afbc;
1455b8e80941Smrg
1456b8e80941Smrg                struct {
1457b8e80941Smrg                        /* Depth becomes depth/stencil in case of combined D/S */
1458b8e80941Smrg                        mali_ptr depth;
1459b8e80941Smrg                        u32 depth_stride_zero : 4;
1460b8e80941Smrg                        u32 depth_stride : 28;
1461b8e80941Smrg                        u32 zero1;
1462b8e80941Smrg
1463b8e80941Smrg                        mali_ptr stencil;
1464b8e80941Smrg                        u32 stencil_stride_zero : 4;
1465b8e80941Smrg                        u32 stencil_stride : 28;
1466b8e80941Smrg                        u32 zero2;
1467b8e80941Smrg                } ds_linear;
1468b8e80941Smrg        };
1469b8e80941Smrg
1470b8e80941Smrg
1471b8e80941Smrg        u64 zero3, zero4;
1472b8e80941Smrg} __attribute__((packed));
1473b8e80941Smrg
/* Flags for unk3 */

/* Write depth results back to main memory, instead of keeping them on-chip
 * in the tile buffer and then discarding them */

#define MALI_MFBD_DEPTH_WRITE 0x0400 /* bit 10 */

/* The extra bifrost_fb_extra section is present in the MFBD */

#define MALI_MFBD_EXTRA 0x2000 /* bit 13 */
1484b8e80941Smrg
1485b8e80941Smrgstruct bifrost_framebuffer {
1486b8e80941Smrg        u32 unk0; // = 0x10
1487b8e80941Smrg
1488b8e80941Smrg        u32 unknown2; // = 0x1f, same as SFBD
1489b8e80941Smrg        mali_ptr scratchpad;
1490b8e80941Smrg
1491b8e80941Smrg        /* 0x10 */
1492b8e80941Smrg        mali_ptr sample_locations;
1493b8e80941Smrg        mali_ptr unknown1;
1494b8e80941Smrg        /* 0x20 */
1495b8e80941Smrg        u16 width1, height1;
1496b8e80941Smrg        u32 zero3;
1497b8e80941Smrg        u16 width2, height2;
1498b8e80941Smrg        u32 unk1 : 19; // = 0x01000
1499b8e80941Smrg        u32 rt_count_1 : 2; // off-by-one (use MALI_POSITIVE)
1500b8e80941Smrg        u32 unk2 : 3; // = 0
1501b8e80941Smrg        u32 rt_count_2 : 3; // no off-by-one
1502b8e80941Smrg        u32 zero4 : 5;
1503b8e80941Smrg        /* 0x30 */
1504b8e80941Smrg        u32 clear_stencil : 8;
1505b8e80941Smrg        u32 unk3 : 24; // = 0x100
1506b8e80941Smrg        float clear_depth;
1507b8e80941Smrg        mali_ptr tiler_meta;
1508b8e80941Smrg        /* 0x40 */
1509b8e80941Smrg
1510b8e80941Smrg        /* Note: these are guesses! */
1511b8e80941Smrg        mali_ptr tiler_scratch_start;
1512b8e80941Smrg        mali_ptr tiler_scratch_middle;
1513b8e80941Smrg
1514b8e80941Smrg        /* These are not, since we see symmetry with replay jobs which name these explicitly */
1515b8e80941Smrg        mali_ptr tiler_heap_start;
1516b8e80941Smrg        mali_ptr tiler_heap_end;
1517b8e80941Smrg
1518b8e80941Smrg        u64 zero9, zero10, zero11, zero12;
1519b8e80941Smrg
1520b8e80941Smrg        /* optional: struct bifrost_fb_extra extra */
1521b8e80941Smrg        /* struct bifrost_render_target rts[] */
1522b8e80941Smrg} __attribute__((packed));
1523b8e80941Smrg
1524b8e80941Smrg#endif /* __PANFROST_JOB_H__ */
1525