/*
 * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
 * Copyright © 2018 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_helpers.h"
#include "util/u_format.h"
#include "util/u_viewport.h"

#include "freedreno_resource.h"
#include "freedreno_query_hw.h"

#include "fd6_emit.h"
#include "fd6_blend.h"
#include "fd6_context.h"
#include "fd6_image.h"
#include "fd6_program.h"
#include "fd6_rasterizer.h"
#include "fd6_texture.h"
#include "fd6_format.h"
#include "fd6_zsa.h"

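/* Map a shader stage to the CP_LOAD_STATE6 variant used to load its state:
 * geometry-pipeline stages go through CP_LOAD_STATE6_GEOM, fragment and
 * compute through CP_LOAD_STATE6_FRAG.
 */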
static uint32_t
shader_t_to_opcode(gl_shader_stage type)
{
	switch (type) {
	case MESA_SHADER_VERTEX:
	case MESA_SHADER_TESS_CTRL:
	case MESA_SHADER_TESS_EVAL:
	case MESA_SHADER_GEOMETRY:
		return CP_LOAD_STATE6_GEOM;
	case MESA_SHADER_FRAGMENT:
	case MESA_SHADER_COMPUTE:
	case MESA_SHADER_KERNEL:
		return CP_LOAD_STATE6_FRAG;
	default:
		unreachable("bad shader type");
	}
}

/* regid:          base const register
 * prsc or dwords: buffer containing constant values
 * sizedwords:     size of const value buffer
 */
static void
fd6_emit_const(struct fd_ringbuffer *ring, gl_shader_stage type,
		uint32_t regid, uint32_t offset, uint32_t sizedwords,
		const uint32_t *dwords, struct pipe_resource *prsc)
{
	uint32_t i, sz, align_sz;
	enum a6xx_state_src src;

	debug_assert((regid % 4) == 0);

	if (prsc) {
		sz = 0;
		src = SS6_INDIRECT;
	} else {
		sz = sizedwords;
		src = SS6_DIRECT;
	}

	align_sz = align(sz, 4);

	OUT_PKT7(ring, shader_t_to_opcode(type), 3 + align_sz);
	OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid/4) |
			CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
			CP_LOAD_STATE6_0_STATE_SRC(src) |
			CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(type)) |
			CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(sizedwords, 4)));
	if (prsc) {
		struct fd_bo *bo = fd_resource(prsc)->bo;
		OUT_RELOC(ring, bo, offset, 0, 0);
	} else {
		OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
		OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
		dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
	}

	for (i = 0; i < sz; i++) {
		OUT_RING(ring, dwords[i]);
	}

	/* Zero-pad to multiple of 4 dwords */
	for (i = sz; i < align_sz; i++) {
		OUT_RING(ring, 0);
	}
}

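/* Emit a block of pointer consts: the GPU address of each prsc (or a
 * placeholder marker for NULL entries).  Each entry is a 64-bit address
 * (two dwords), and NUM_UNIT is in units of vec4 (two pointers), so the
 * list is padded out to an even number of entries.
 */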
static void
fd6_emit_const_bo(struct fd_ringbuffer *ring, gl_shader_stage type, boolean write,
		uint32_t regid, uint32_t num, struct pipe_resource **prscs, uint32_t *offsets)
{
	uint32_t anum = align(num, 2);
	uint32_t i;

	debug_assert((regid % 4) == 0);

	OUT_PKT7(ring, shader_t_to_opcode(type), 3 + (2 * anum));
	OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(regid/4) |
			CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
			CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
			CP_LOAD_STATE6_0_STATE_BLOCK(fd6_stage2shadersb(type)) |
			CP_LOAD_STATE6_0_NUM_UNIT(anum/2));
	OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
	OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));

	for (i = 0; i < num; i++) {
		if (prscs[i]) {
			if (write) {
				OUT_RELOCW(ring, fd_resource(prscs[i])->bo, offsets[i], 0, 0);
			} else {
				OUT_RELOC(ring, fd_resource(prscs[i])->bo, offsets[i], 0, 0);
			}
		} else {
			OUT_RING(ring, 0xbad00000 | (i << 16));
			OUT_RING(ring, 0xbad00000 | (i << 16));
		}
	}

	for (; i < anum; i++) {
		OUT_RING(ring, 0xffffffff);
		OUT_RING(ring, 0xffffffff);
	}
}

/* Border color layout differs from a4xx/a5xx.. if it turns out to be
 * the same on later gens then move this somewhere common ;-)
 *
 * Entry layout looks like (total size 0x80 bytes):
 */

struct PACKED bcolor_entry {
	uint32_t fp32[4];
	uint16_t ui16[4];
	int16_t  si16[4];
	uint16_t fp16[4];
	uint16_t rgb565;
	uint16_t rgb5a1;
	uint16_t rgba4;
	uint8_t __pad0[2];
	uint8_t  ui8[4];
	int8_t   si8[4];
	uint32_t rgb10a2;
	uint32_t z24; /* also s8? */
	uint16_t srgb[4];      /* appears to duplicate fp16[], but clamped, used for srgb */
	uint8_t  __pad1[56];
};

#define FD6_BORDER_COLOR_SIZE        sizeof(struct bcolor_entry)
#define FD6_BORDER_COLOR_UPLOAD_SIZE (2 * PIPE_MAX_SAMPLERS * FD6_BORDER_COLOR_SIZE)

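/* Fill in one bcolor_entry per bound sampler, converting the border color
 * into each representation the hw might sample (fp32/fp16/unorm/snorm and
 * the packed formats), swizzled according to the format of the texture
 * paired with the sampler.
 */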
static void
setup_border_colors(struct fd_texture_stateobj *tex, struct bcolor_entry *entries)
{
	unsigned i, j;
	STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE);

	for (i = 0; i < tex->num_samplers; i++) {
		struct bcolor_entry *e = &entries[i];
		struct pipe_sampler_state *sampler = tex->samplers[i];
		union pipe_color_union *bc;

		if (!sampler)
			continue;

		bc = &sampler->border_color;

		/*
		 * XXX HACK ALERT XXX
		 *
		 * The border colors need to be swizzled in a particular
		 * format-dependent order. Even though samplers don't know about
		 * formats, we can assume that with a GL state tracker, there's a
		 * 1:1 correspondence between sampler and texture. Take advantage
		 * of that knowledge.
		 */
		if ((i >= tex->num_textures) || !tex->textures[i])
			continue;

		struct pipe_sampler_view *view = tex->textures[i];
		enum pipe_format format = view->format;
		const struct util_format_description *desc =
				util_format_description(format);

		e->rgb565 = 0;
		e->rgb5a1 = 0;
		e->rgba4 = 0;
		e->rgb10a2 = 0;
		e->z24 = 0;

		unsigned char swiz[4];

		fd6_tex_swiz(format, swiz,
				view->swizzle_r, view->swizzle_g,
				view->swizzle_b, view->swizzle_a);

		for (j = 0; j < 4; j++) {
			int c = swiz[j];
			int cd = c;

			/*
			 * HACK: for PIPE_FORMAT_X24S8_UINT we end up w/ the
			 * stencil border color value in bc->ui[0] but according
			 * to desc->swizzle and desc->channel, the .x/.w component
			 * is NONE and the stencil value is in the y component.
			 * Meanwhile the hardware wants this in the .w component
			 * for x24s8 and the .x component for x32_s8x24.
			 */
			if ((format == PIPE_FORMAT_X24S8_UINT) ||
					(format == PIPE_FORMAT_X32_S8X24_UINT)) {
				if (j == 0) {
					c = 1;
					cd = (format == PIPE_FORMAT_X32_S8X24_UINT) ? 0 : 3;
				} else {
					continue;
				}
			}

			if (c >= 4)
				continue;

			if (desc->channel[c].pure_integer) {
				uint16_t clamped;
				switch (desc->channel[c].size) {
				case 2:
					assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
					clamped = CLAMP(bc->ui[j], 0, 0x3);
					break;
				case 8:
					if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
						clamped = CLAMP(bc->i[j], -128, 127);
					else
						clamped = CLAMP(bc->ui[j], 0, 255);
					break;
				case 10:
					assert(desc->channel[c].type == UTIL_FORMAT_TYPE_UNSIGNED);
					clamped = CLAMP(bc->ui[j], 0, 0x3ff);
					break;
				case 16:
					if (desc->channel[c].type == UTIL_FORMAT_TYPE_SIGNED)
						clamped = CLAMP(bc->i[j], -32768, 32767);
					else
						clamped = CLAMP(bc->ui[j], 0, 65535);
					break;
				default:
					assert(!"Unexpected bit size");
				case 32:
					clamped = 0;
					break;
				}
				e->fp32[cd] = bc->ui[j];
				e->fp16[cd] = clamped;
			} else {
				float f = bc->f[j];
				float f_u = CLAMP(f, 0, 1);
				float f_s = CLAMP(f, -1, 1);

				e->fp32[c] = fui(f);
				e->fp16[c] = util_float_to_half(f);
				e->srgb[c] = util_float_to_half(f_u);
				e->ui16[c] = f_u * 0xffff;
				e->si16[c] = f_s * 0x7fff;
				e->ui8[c]  = f_u * 0xff;
				e->si8[c]  = f_s * 0x7f;
				if (c == 1)
					e->rgb565 |= (int)(f_u * 0x3f) << 5;
				else if (c < 3)
					e->rgb565 |= (int)(f_u * 0x1f) << (c ? 11 : 0);
				if (c == 3)
					e->rgb5a1 |= (f_u > 0.5) ? 0x8000 : 0;
				else
					e->rgb5a1 |= (int)(f_u * 0x1f) << (c * 5);
				if (c == 3)
					e->rgb10a2 |= (int)(f_u * 0x3) << 30;
				else
					e->rgb10a2 |= (int)(f_u * 0x3ff) << (c * 10);
				e->rgba4 |= (int)(f_u * 0xf) << (c * 4);
				if (c == 0)
					e->z24 = f_u * 0xffffff;
			}
		}

#ifdef DEBUG
		memset(&e->__pad0, 0, sizeof(e->__pad0));
		memset(&e->__pad1, 0, sizeof(e->__pad1));
#endif
	}
}

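/* Upload the border color table (VS entries followed by FS entries) and
 * point SP_TP_BORDER_COLOR_BASE_ADDR at it.
 */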
static void
emit_border_color(struct fd_context *ctx, struct fd_ringbuffer *ring)
{
	struct fd6_context *fd6_ctx = fd6_context(ctx);
	struct bcolor_entry *entries;
	unsigned off;
	void *ptr;

	STATIC_ASSERT(sizeof(struct bcolor_entry) == FD6_BORDER_COLOR_SIZE);

	u_upload_alloc(fd6_ctx->border_color_uploader,
			0, FD6_BORDER_COLOR_UPLOAD_SIZE,
			FD6_BORDER_COLOR_UPLOAD_SIZE, &off,
			&fd6_ctx->border_color_buf,
			&ptr);

	entries = ptr;

	setup_border_colors(&ctx->tex[PIPE_SHADER_VERTEX], &entries[0]);
	setup_border_colors(&ctx->tex[PIPE_SHADER_FRAGMENT],
			&entries[ctx->tex[PIPE_SHADER_VERTEX].num_samplers]);

	OUT_PKT4(ring, REG_A6XX_SP_TP_BORDER_COLOR_BASE_ADDR_LO, 2);
	OUT_RELOC(ring, fd_resource(fd6_ctx->border_color_buf)->bo, off, 0, 0);

	u_upload_unmap(fd6_ctx->border_color_uploader);
}

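/* Emit the texture descriptor used for framebuffer-read (fb_read): a 2D
 * texture aliasing cbufs[0] in GMEM (TILE6_2 tiled, no swap).  The word
 * containing the type/fetchsize is emitted via OUT_RINGP so it can be
 * patched later through ctx->batch->fb_read_patches.
 */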
static void
fd6_emit_fb_tex(struct fd_ringbuffer *state, struct fd_context *ctx)
{
	struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
	struct pipe_surface *psurf = pfb->cbufs[0];
	struct fd_resource *rsc = fd_resource(psurf->texture);

	uint32_t texconst0 = fd6_tex_const_0(psurf->texture, psurf->u.tex.level,
			psurf->format, PIPE_SWIZZLE_X, PIPE_SWIZZLE_Y,
			PIPE_SWIZZLE_Z, PIPE_SWIZZLE_W);

	/* always TILE6_2 mode in GMEM.. which also means no swap: */
	texconst0 &= ~(A6XX_TEX_CONST_0_SWAP__MASK | A6XX_TEX_CONST_0_TILE_MODE__MASK);
	texconst0 |= A6XX_TEX_CONST_0_TILE_MODE(TILE6_2);

	OUT_RING(state, texconst0);
	OUT_RING(state, A6XX_TEX_CONST_1_WIDTH(pfb->width) |
			A6XX_TEX_CONST_1_HEIGHT(pfb->height));
	OUT_RINGP(state, A6XX_TEX_CONST_2_TYPE(A6XX_TEX_2D) |
			A6XX_TEX_CONST_2_FETCHSIZE(TFETCH6_2_BYTE),
			&ctx->batch->fb_read_patches);
	OUT_RING(state, A6XX_TEX_CONST_3_ARRAY_PITCH(rsc->layer_size));

	OUT_RING(state, A6XX_TEX_CONST_4_BASE_LO(ctx->screen->gmem_base));
	OUT_RING(state, A6XX_TEX_CONST_5_BASE_HI(ctx->screen->gmem_base >> 32) |
			A6XX_TEX_CONST_5_DEPTH(1));
	OUT_RING(state, 0);   /* texconst6 */
	OUT_RING(state, 0);   /* texconst7 */
	OUT_RING(state, 0);   /* texconst8 */
	OUT_RING(state, 0);   /* texconst9 */
	OUT_RING(state, 0);   /* texconst10 */
	OUT_RING(state, 0);   /* texconst11 */
	OUT_RING(state, 0);
	OUT_RING(state, 0);
	OUT_RING(state, 0);
	OUT_RING(state, 0);
}

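/* Emit sampler and texture descriptor state for a shader stage, both via
 * CP_LOAD_STATE6 and via the SP_xS_TEX_SAMP/CONST pointer registers.  If a
 * shader variant is given, image/SSBO (and possibly fb_read) "textures" are
 * merged in after the regular textures.  Returns whether any bound sampler
 * needs the border color table.
 */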
bool
fd6_emit_textures(struct fd_pipe *pipe, struct fd_ringbuffer *ring,
		enum pipe_shader_type type, struct fd_texture_stateobj *tex,
		unsigned bcolor_offset,
		/* can be NULL if no image/SSBO/fb state to merge in: */
		const struct ir3_shader_variant *v, struct fd_context *ctx)
{
	bool needs_border = false;
	unsigned opcode, tex_samp_reg, tex_const_reg, tex_count_reg;
	enum a6xx_state_block sb;

	switch (type) {
	case PIPE_SHADER_VERTEX:
		sb = SB6_VS_TEX;
		opcode = CP_LOAD_STATE6_GEOM;
		tex_samp_reg = REG_A6XX_SP_VS_TEX_SAMP_LO;
		tex_const_reg = REG_A6XX_SP_VS_TEX_CONST_LO;
		tex_count_reg = REG_A6XX_SP_VS_TEX_COUNT;
		break;
	case PIPE_SHADER_FRAGMENT:
		sb = SB6_FS_TEX;
		opcode = CP_LOAD_STATE6_FRAG;
		tex_samp_reg = REG_A6XX_SP_FS_TEX_SAMP_LO;
		tex_const_reg = REG_A6XX_SP_FS_TEX_CONST_LO;
		tex_count_reg = REG_A6XX_SP_FS_TEX_COUNT;
		break;
	case PIPE_SHADER_COMPUTE:
		sb = SB6_CS_TEX;
		opcode = CP_LOAD_STATE6_FRAG;
		tex_samp_reg = REG_A6XX_SP_CS_TEX_SAMP_LO;
		tex_const_reg = REG_A6XX_SP_CS_TEX_CONST_LO;
		tex_count_reg = REG_A6XX_SP_CS_TEX_COUNT;
		break;
	default:
		unreachable("bad state block");
	}

	if (tex->num_samplers > 0) {
		struct fd_ringbuffer *state =
			fd_ringbuffer_new_object(pipe, tex->num_samplers * 4 * 4);
		for (unsigned i = 0; i < tex->num_samplers; i++) {
			static const struct fd6_sampler_stateobj dummy_sampler = {};
			const struct fd6_sampler_stateobj *sampler = tex->samplers[i] ?
				fd6_sampler_stateobj(tex->samplers[i]) : &dummy_sampler;
			OUT_RING(state, sampler->texsamp0);
			OUT_RING(state, sampler->texsamp1);
			OUT_RING(state, sampler->texsamp2 |
				A6XX_TEX_SAMP_2_BCOLOR_OFFSET((i + bcolor_offset) * sizeof(struct bcolor_entry)));
			OUT_RING(state, sampler->texsamp3);
			needs_border |= sampler->needs_border;
		}

		/* output sampler state: */
		OUT_PKT7(ring, opcode, 3);
		OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
			CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
			CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
			CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
			CP_LOAD_STATE6_0_NUM_UNIT(tex->num_samplers));
		OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

		OUT_PKT4(ring, tex_samp_reg, 2);
		OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

		fd_ringbuffer_del(state);
	}

	unsigned num_merged_textures = tex->num_textures;
	unsigned num_textures = tex->num_textures;
	if (v) {
		num_merged_textures += v->image_mapping.num_tex;

		if (v->fb_read)
			num_merged_textures++;

		/* There could be more bound textures than what the shader uses,
		 * which isn't known at shader compile time.  So when we are
		 * merging tex state, only emit the textures that the shader
		 * uses (since the image/SSBO related tex state comes immediately
		 * after).
		 */
		num_textures = v->image_mapping.tex_base;
	}

	if (num_merged_textures > 0) {
		struct fd_ringbuffer *state =
			fd_ringbuffer_new_object(pipe, num_merged_textures * 16 * 4);
		for (unsigned i = 0; i < num_textures; i++) {
			static const struct fd6_pipe_sampler_view dummy_view = {};
			const struct fd6_pipe_sampler_view *view = tex->textures[i] ?
				fd6_pipe_sampler_view(tex->textures[i]) : &dummy_view;
			struct fd_resource *rsc = NULL;

			if (view->base.texture)
				rsc = fd_resource(view->base.texture);

			OUT_RING(state, view->texconst0);
			OUT_RING(state, view->texconst1);
			OUT_RING(state, view->texconst2);
			OUT_RING(state, view->texconst3);

			if (rsc) {
				if (view->base.format == PIPE_FORMAT_X32_S8X24_UINT)
					rsc = rsc->stencil;
				OUT_RELOC(state, rsc->bo, view->offset,
					(uint64_t)view->texconst5 << 32, 0);
			} else {
				OUT_RING(state, 0x00000000);
				OUT_RING(state, view->texconst5);
			}

			OUT_RING(state, view->texconst6);

			if (rsc && view->ubwc_enabled) {
				OUT_RELOC(state, rsc->bo, view->ubwc_offset, 0, 0);
			} else {
				OUT_RING(state, 0);
				OUT_RING(state, 0);
			}

			OUT_RING(state, view->texconst9);
			OUT_RING(state, view->texconst10);
			OUT_RING(state, view->texconst11);
			OUT_RING(state, 0);
			OUT_RING(state, 0);
			OUT_RING(state, 0);
			OUT_RING(state, 0);
		}

		if (v) {
			const struct ir3_ibo_mapping *mapping = &v->image_mapping;
			struct fd_shaderbuf_stateobj *buf = &ctx->shaderbuf[type];
			struct fd_shaderimg_stateobj *img = &ctx->shaderimg[type];

			for (unsigned i = 0; i < mapping->num_tex; i++) {
				unsigned idx = mapping->tex_to_image[i];
				if (idx & IBO_SSBO) {
					fd6_emit_ssbo_tex(state, &buf->sb[idx & ~IBO_SSBO]);
				} else {
					fd6_emit_image_tex(state, &img->si[idx]);
				}
			}

			if (v->fb_read) {
				fd6_emit_fb_tex(state, ctx);
			}
		}

		/* emit texture state: */
		OUT_PKT7(ring, opcode, 3);
		OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
			CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
			CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
			CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
			CP_LOAD_STATE6_0_NUM_UNIT(num_merged_textures));
		OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

		OUT_PKT4(ring, tex_const_reg, 2);
		OUT_RB(ring, state); /* SRC_ADDR_LO/HI */

		fd_ringbuffer_del(state);
	}

	OUT_PKT4(ring, tex_count_reg, 1);
	OUT_RING(ring, num_merged_textures);

	return needs_border;
}

/* Emits combined texture state, which also includes any Image/SSBO
 * related texture state merged in (because we must have all texture
 * state for a given stage in a single buffer).  In the fast-path, if
 * we don't need to merge in any image/ssbo related texture state, we
 * just use cached texture stateobj.  Otherwise we generate a single-
 * use stateobj.
 *
 * TODO Is there some sane way we can still use cached texture stateobj
 * with image/ssbo in use?
 *
 * returns whether border_color is required:
 */
static bool
fd6_emit_combined_textures(struct fd_ringbuffer *ring, struct fd6_emit *emit,
		enum pipe_shader_type type, const struct ir3_shader_variant *v)
{
	struct fd_context *ctx = emit->ctx;
	bool needs_border = false;

	static const enum fd6_state_id state_id[PIPE_SHADER_TYPES] = {
		[PIPE_SHADER_VERTEX]    = FD6_GROUP_VS_TEX,
		[PIPE_SHADER_FRAGMENT]  = FD6_GROUP_FS_TEX,
	};

	debug_assert(state_id[type]);

	if (!v->image_mapping.num_tex && !v->fb_read) {
		/* in the fast-path, when we don't have to mix in any image/SSBO
		 * related texture state, we can just lookup the stateobj and
		 * re-emit that:
		 *
		 * Also, framebuffer-read is a slow-path because an extra
		 * texture needs to be inserted.
		 *
		 * TODO we can probably simplify things if we also treated
		 * border_color as a slow-path.. this way the tex state key
		 * wouldn't depend on bcolor_offset.. but fb_read might rather
		 * be *somehow* a fast-path if we eventually used it for PLS.
		 * I suppose there would be no harm in just *always* inserting
		 * an fb_read texture?
		 */
		if ((ctx->dirty_shader[type] & FD_DIRTY_SHADER_TEX) &&
				ctx->tex[type].num_textures > 0) {
			struct fd6_texture_state *tex = fd6_texture_state(ctx,
					type, &ctx->tex[type]);

			needs_border |= tex->needs_border;

			fd6_emit_add_group(emit, tex->stateobj, state_id[type], 0x7);
		}
	} else {
		/* In the slow-path, create a one-shot texture state object
		 * if either TEX|PROG|SSBO|IMAGE state is dirty:
		 */
		if ((ctx->dirty_shader[type] &
				(FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG |
				 FD_DIRTY_SHADER_IMAGE | FD_DIRTY_SHADER_SSBO)) ||
				v->fb_read) {
			struct fd_texture_stateobj *tex = &ctx->tex[type];
			struct fd_ringbuffer *stateobj =
				fd_submit_new_ringbuffer(ctx->batch->submit,
					0x1000, FD_RINGBUFFER_STREAMING);
			unsigned bcolor_offset =
				fd6_border_color_offset(ctx, type, tex);

			needs_border |= fd6_emit_textures(ctx->pipe, stateobj, type, tex,
					bcolor_offset, v, ctx);

			fd6_emit_add_group(emit, stateobj, state_id[type], 0x7);

			fd_ringbuffer_del(stateobj);
		}
	}

	return needs_border;
}

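/* Build a stateobj with the VFD_FETCH/VFD_DECODE/VFD_DEST_CNTL entries for
 * each vertex input actually consumed by the VS (sysvals and inputs with an
 * empty compmask are skipped).
 */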
static struct fd_ringbuffer *
build_vbo_state(struct fd6_emit *emit, const struct ir3_shader_variant *vp)
{
	const struct fd_vertex_state *vtx = emit->vtx;
	int32_t i, j;

	struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(emit->ctx->batch->submit,
			4 * (10 * vp->inputs_count + 2), FD_RINGBUFFER_STREAMING);

	for (i = 0, j = 0; i <= vp->inputs_count; i++) {
		if (vp->inputs[i].sysval)
			continue;
		if (vp->inputs[i].compmask) {
			struct pipe_vertex_element *elem = &vtx->vtx->pipe[i];
			const struct pipe_vertex_buffer *vb =
					&vtx->vertexbuf.vb[elem->vertex_buffer_index];
			struct fd_resource *rsc = fd_resource(vb->buffer.resource);
			enum pipe_format pfmt = elem->src_format;
			enum a6xx_vtx_fmt fmt = fd6_pipe2vtx(pfmt);
			bool isint = util_format_is_pure_integer(pfmt);
			uint32_t off = vb->buffer_offset + elem->src_offset;
			uint32_t size = fd_bo_size(rsc->bo) - off;
			debug_assert(fmt != ~0);

#ifdef DEBUG
			/* see dEQP-GLES31.stress.vertex_attribute_binding.buffer_bounds.bind_vertex_buffer_offset_near_wrap_10
			 */
			if (off > fd_bo_size(rsc->bo))
				continue;
#endif

			OUT_PKT4(ring, REG_A6XX_VFD_FETCH(j), 4);
			OUT_RELOC(ring, rsc->bo, off, 0, 0);
			OUT_RING(ring, size);           /* VFD_FETCH[j].SIZE */
			OUT_RING(ring, vb->stride);     /* VFD_FETCH[j].STRIDE */

			OUT_PKT4(ring, REG_A6XX_VFD_DECODE(j), 2);
			OUT_RING(ring, A6XX_VFD_DECODE_INSTR_IDX(j) |
					A6XX_VFD_DECODE_INSTR_FORMAT(fmt) |
					COND(elem->instance_divisor, A6XX_VFD_DECODE_INSTR_INSTANCED) |
					A6XX_VFD_DECODE_INSTR_SWAP(fd6_pipe2swap(pfmt)) |
					A6XX_VFD_DECODE_INSTR_UNK30 |
					COND(!isint, A6XX_VFD_DECODE_INSTR_FLOAT));
			OUT_RING(ring, MAX2(1, elem->instance_divisor)); /* VFD_DECODE[j].STEP_RATE */

			OUT_PKT4(ring, REG_A6XX_VFD_DEST_CNTL(j), 1);
			OUT_RING(ring, A6XX_VFD_DEST_CNTL_INSTR_WRITEMASK(vp->inputs[i].compmask) |
					A6XX_VFD_DEST_CNTL_INSTR_REGID(vp->inputs[i].regid));

			j++;
		}
	}

	OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_0, 1);
	OUT_RING(ring, A6XX_VFD_CONTROL_0_VTXCNT(j) | (j << 8));

	return ring;
}

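/* Build LRZ state for either the binning or draw pass.  LRZ is disabled
 * entirely if writes are not allowed or the zsbuf has no valid LRZ buffer;
 * the LRZ write enable is only added for the binning pass.
 */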
static struct fd_ringbuffer *
build_lrz(struct fd6_emit *emit, bool binning_pass)
{
	struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(emit->ctx->zsa);
	struct pipe_framebuffer_state *pfb = &emit->ctx->batch->framebuffer;
	struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
	uint32_t gras_lrz_cntl = zsa->gras_lrz_cntl;
	uint32_t rb_lrz_cntl = zsa->rb_lrz_cntl;

	struct fd_ringbuffer *ring = fd_submit_new_ringbuffer(emit->ctx->batch->submit,
			16, FD_RINGBUFFER_STREAMING);

	if (emit->no_lrz_write || !rsc->lrz || !rsc->lrz_valid) {
		gras_lrz_cntl = 0;
		rb_lrz_cntl = 0;
	} else if (binning_pass && zsa->lrz_write) {
		gras_lrz_cntl |= A6XX_GRAS_LRZ_CNTL_LRZ_WRITE;
	}

	OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
	OUT_RING(ring, gras_lrz_cntl);

	OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1);
	OUT_RING(ring, rb_lrz_cntl);

	return ring;
}

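/* Emit stream-out buffer state for the bound SO targets and set
 * emit->streamout_mask accordingly.  If nothing is bound, stream-out is
 * disabled via VPC_SO_CNTL/VPC_SO_OVERRIDE.
 */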
static void
fd6_emit_streamout(struct fd_ringbuffer *ring, struct fd6_emit *emit, struct ir3_stream_output_info *info)
{
	struct fd_context *ctx = emit->ctx;
	const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
	struct fd_streamout_stateobj *so = &ctx->streamout;

	emit->streamout_mask = 0;

	for (unsigned i = 0; i < so->num_targets; i++) {
		struct pipe_stream_output_target *target = so->targets[i];

		if (!target)
			continue;

		unsigned offset = (so->offsets[i] * info->stride[i] * 4) +
				target->buffer_offset;

		OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_BASE_LO(i), 3);
		/* VPC_SO[i].BUFFER_BASE_LO: */
		OUT_RELOCW(ring, fd_resource(target->buffer)->bo, 0, 0, 0);
		OUT_RING(ring, target->buffer_size + offset);

		OUT_PKT4(ring, REG_A6XX_VPC_SO_BUFFER_OFFSET(i), 3);
		OUT_RING(ring, offset);
		/* VPC_SO[i].FLUSH_BASE_LO/HI: */
		// TODO just give hw a dummy addr for now.. we should
		// be using this and then CP_MEM_TO_REG to set the
		// VPC_SO[i].BUFFER_OFFSET for the next draw..
		OUT_RELOCW(ring, fd6_context(ctx)->blit_mem, 0x100, 0, 0);

		emit->streamout_mask |= (1 << i);
	}

	if (emit->streamout_mask) {
		const struct fd6_streamout_state *tf = &prog->tf;

		OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, 12 + (2 * tf->prog_count));
		OUT_RING(ring, REG_A6XX_VPC_SO_BUF_CNTL);
		OUT_RING(ring, tf->vpc_so_buf_cntl);
		OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(0));
		OUT_RING(ring, tf->ncomp[0]);
		OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(1));
		OUT_RING(ring, tf->ncomp[1]);
		OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(2));
		OUT_RING(ring, tf->ncomp[2]);
		OUT_RING(ring, REG_A6XX_VPC_SO_NCOMP(3));
		OUT_RING(ring, tf->ncomp[3]);
		OUT_RING(ring, REG_A6XX_VPC_SO_CNTL);
		OUT_RING(ring, A6XX_VPC_SO_CNTL_ENABLE);
		for (unsigned i = 0; i < tf->prog_count; i++) {
			OUT_RING(ring, REG_A6XX_VPC_SO_PROG);
			OUT_RING(ring, tf->prog[i]);
		}

		OUT_PKT4(ring, REG_A6XX_VPC_SO_OVERRIDE, 1);
		OUT_RING(ring, 0x0);
	} else {
		OUT_PKT7(ring, CP_CONTEXT_REG_BUNCH, 4);
		OUT_RING(ring, REG_A6XX_VPC_SO_CNTL);
		OUT_RING(ring, 0);
		OUT_RING(ring, REG_A6XX_VPC_SO_BUF_CNTL);
		OUT_RING(ring, 0);

		OUT_PKT4(ring, REG_A6XX_VPC_SO_OVERRIDE, 1);
		OUT_RING(ring, A6XX_VPC_SO_OVERRIDE_SO_DISABLE);
	}
}

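/* Emit all dirty 3d state for a draw.  Most state is built (or looked up)
 * as stateobj groups via fd6_emit_add_group() and emitted with a single
 * CP_SET_DRAW_STATE packet at the end; the rest is emitted directly into
 * the draw ring.
 */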
void
fd6_emit_state(struct fd_ringbuffer *ring, struct fd6_emit *emit)
{
	struct fd_context *ctx = emit->ctx;
	struct pipe_framebuffer_state *pfb = &ctx->batch->framebuffer;
	const struct fd6_program_state *prog = fd6_emit_get_prog(emit);
	const struct ir3_shader_variant *vp = emit->vs;
	const struct ir3_shader_variant *fp = emit->fs;
	const enum fd_dirty_3d_state dirty = emit->dirty;
	bool needs_border = false;

	emit_marker6(ring, 5);

	/* NOTE: we track fb_read differently than _BLEND_ENABLED since
	 * we might at some point decide to do sysmem in some cases when
	 * blend is enabled:
	 */
	if (fp->fb_read)
		ctx->batch->gmem_reason |= FD_GMEM_FB_READ;

	if (emit->dirty & (FD_DIRTY_VTXBUF | FD_DIRTY_VTXSTATE)) {
		struct fd_ringbuffer *state;

		state = build_vbo_state(emit, emit->vs);
		fd6_emit_add_group(emit, state, FD6_GROUP_VBO, 0x6);
		fd_ringbuffer_del(state);

		state = build_vbo_state(emit, emit->bs);
		fd6_emit_add_group(emit, state, FD6_GROUP_VBO_BINNING, 0x1);
		fd_ringbuffer_del(state);
	}

	if (dirty & FD_DIRTY_ZSA) {
		struct fd6_zsa_stateobj *zsa = fd6_zsa_stateobj(ctx->zsa);

		if (util_format_is_pure_integer(pipe_surface_format(pfb->cbufs[0])))
			fd6_emit_add_group(emit, zsa->stateobj_no_alpha, FD6_GROUP_ZSA, 0x7);
		else
			fd6_emit_add_group(emit, zsa->stateobj, FD6_GROUP_ZSA, 0x7);
	}

	if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && pfb->zsbuf) {
		struct fd_ringbuffer *state;

		state = build_lrz(emit, false);
		fd6_emit_add_group(emit, state, FD6_GROUP_LRZ, 0x6);
		fd_ringbuffer_del(state);

		state = build_lrz(emit, true);
		fd6_emit_add_group(emit, state, FD6_GROUP_LRZ_BINNING, 0x1);
		fd_ringbuffer_del(state);
	}

	if (dirty & FD_DIRTY_STENCIL_REF) {
		struct pipe_stencil_ref *sr = &ctx->stencil_ref;

		OUT_PKT4(ring, REG_A6XX_RB_STENCILREF, 1);
		OUT_RING(ring, A6XX_RB_STENCILREF_REF(sr->ref_value[0]) |
				A6XX_RB_STENCILREF_BFREF(sr->ref_value[1]));
	}

	/* NOTE: scissor enabled bit is part of rasterizer state: */
	if (dirty & (FD_DIRTY_SCISSOR | FD_DIRTY_RASTERIZER)) {
		struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);

		OUT_PKT4(ring, REG_A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0, 2);
		OUT_RING(ring, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_X(scissor->minx) |
				A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_Y(scissor->miny));
		OUT_RING(ring, A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_X(scissor->maxx - 1) |
				A6XX_GRAS_SC_SCREEN_SCISSOR_TL_0_Y(scissor->maxy - 1));

		ctx->batch->max_scissor.minx = MIN2(ctx->batch->max_scissor.minx, scissor->minx);
		ctx->batch->max_scissor.miny = MIN2(ctx->batch->max_scissor.miny, scissor->miny);
		ctx->batch->max_scissor.maxx = MAX2(ctx->batch->max_scissor.maxx, scissor->maxx);
		ctx->batch->max_scissor.maxy = MAX2(ctx->batch->max_scissor.maxy, scissor->maxy);
	}

	if (dirty & FD_DIRTY_VIEWPORT) {
		struct pipe_scissor_state *scissor = &ctx->viewport_scissor;

		OUT_PKT4(ring, REG_A6XX_GRAS_CL_VPORT_XOFFSET_0, 6);
		OUT_RING(ring, A6XX_GRAS_CL_VPORT_XOFFSET_0(ctx->viewport.translate[0]));
		OUT_RING(ring, A6XX_GRAS_CL_VPORT_XSCALE_0(ctx->viewport.scale[0]));
		OUT_RING(ring, A6XX_GRAS_CL_VPORT_YOFFSET_0(ctx->viewport.translate[1]));
		OUT_RING(ring, A6XX_GRAS_CL_VPORT_YSCALE_0(ctx->viewport.scale[1]));
		OUT_RING(ring, A6XX_GRAS_CL_VPORT_ZOFFSET_0(ctx->viewport.translate[2]));
		OUT_RING(ring, A6XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2]));

		OUT_PKT4(ring, REG_A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0, 2);
		OUT_RING(ring, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_X(scissor->minx) |
				A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_Y(scissor->miny));
		OUT_RING(ring, A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_X(scissor->maxx - 1) |
				A6XX_GRAS_SC_VIEWPORT_SCISSOR_TL_0_Y(scissor->maxy - 1));

		unsigned guardband_x = fd_calc_guardband(scissor->maxx - scissor->minx);
		unsigned guardband_y = fd_calc_guardband(scissor->maxy - scissor->miny);

		OUT_PKT4(ring, REG_A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ, 1);
		OUT_RING(ring, A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_HORZ(guardband_x) |
				A6XX_GRAS_CL_GUARDBAND_CLIP_ADJ_VERT(guardband_y));
	}

	if (dirty & FD_DIRTY_PROG) {
		fd6_emit_add_group(emit, prog->stateobj, FD6_GROUP_PROG, 0x6);
		fd6_emit_add_group(emit, prog->binning_stateobj,
				FD6_GROUP_PROG_BINNING, 0x1);

		/* emit remaining non-stateobj program state, ie. what depends
		 * on other emit state, so cannot be pre-baked.  This could
		 * be moved to a separate stateobj which is dynamically
		 * created.
		 */
		fd6_program_emit(ring, emit);
	}

	if (dirty & FD_DIRTY_RASTERIZER) {
		struct fd6_rasterizer_stateobj *rasterizer =
				fd6_rasterizer_stateobj(ctx->rasterizer);
		fd6_emit_add_group(emit, rasterizer->stateobj,
						   FD6_GROUP_RASTERIZER, 0x7);
	}

	/* Since the primitive restart state is not part of a tracked object, we
	 * re-emit this register every time.
	 */
	if (emit->info && ctx->rasterizer) {
		struct fd6_rasterizer_stateobj *rasterizer =
				fd6_rasterizer_stateobj(ctx->rasterizer);
		OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9806, 1);
		OUT_RING(ring, 0);
		OUT_PKT4(ring, REG_A6XX_PC_UNKNOWN_9990, 1);
		OUT_RING(ring, 0);
		OUT_PKT4(ring, REG_A6XX_VFD_UNKNOWN_A008, 1);
		OUT_RING(ring, 0);

		OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_0, 1);
		OUT_RING(ring, rasterizer->pc_primitive_cntl |
				 COND(emit->info->primitive_restart && emit->info->index_size,
					  A6XX_PC_PRIMITIVE_CNTL_0_PRIMITIVE_RESTART));
	}

	if (dirty & (FD_DIRTY_FRAMEBUFFER | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) {
		unsigned nr = pfb->nr_cbufs;

		if (ctx->rasterizer->rasterizer_discard)
			nr = 0;

		OUT_PKT4(ring, REG_A6XX_RB_FS_OUTPUT_CNTL0, 2);
		OUT_RING(ring, COND(fp->writes_pos, A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_Z) |
				COND(fp->writes_smask && pfb->samples > 1,
						A6XX_RB_FS_OUTPUT_CNTL0_FRAG_WRITES_SAMPMASK));
		OUT_RING(ring, A6XX_RB_FS_OUTPUT_CNTL1_MRT(nr));

		OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL1, 1);
		OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL1_MRT(nr));
	}

#define DIRTY_CONST (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST | \
					 FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)

	if (ctx->dirty_shader[PIPE_SHADER_VERTEX] & DIRTY_CONST) {
		struct fd_ringbuffer *vsconstobj = fd_submit_new_ringbuffer(
				ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);

		OUT_WFI5(vsconstobj);
		ir3_emit_vs_consts(vp, vsconstobj, ctx, emit->info);
		fd6_emit_add_group(emit, vsconstobj, FD6_GROUP_VS_CONST, 0x7);
		fd_ringbuffer_del(vsconstobj);
	}

	if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] & DIRTY_CONST) {
		struct fd_ringbuffer *fsconstobj = fd_submit_new_ringbuffer(
				ctx->batch->submit, 0x1000, FD_RINGBUFFER_STREAMING);

		OUT_WFI5(fsconstobj);
		ir3_emit_fs_consts(fp, fsconstobj, ctx);
		fd6_emit_add_group(emit, fsconstobj, FD6_GROUP_FS_CONST, 0x6);
		fd_ringbuffer_del(fsconstobj);
	}

	struct ir3_stream_output_info *info = &vp->shader->stream_output;
	if (info->num_outputs)
		fd6_emit_streamout(ring, emit, info);

	if (dirty & FD_DIRTY_BLEND) {
		struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);
		uint32_t i;

		for (i = 0; i < pfb->nr_cbufs; i++) {
			enum pipe_format format = pipe_surface_format(pfb->cbufs[i]);
			bool is_int = util_format_is_pure_integer(format);
			bool has_alpha = util_format_has_alpha(format);
			uint32_t control = blend->rb_mrt[i].control;
			uint32_t blend_control = blend->rb_mrt[i].blend_control_alpha;

			if (is_int) {
				control &= A6XX_RB_MRT_CONTROL_COMPONENT_ENABLE__MASK;
				control |= A6XX_RB_MRT_CONTROL_ROP_CODE(ROP_COPY);
			}

			if (has_alpha) {
				blend_control |= blend->rb_mrt[i].blend_control_rgb;
			} else {
				blend_control |= blend->rb_mrt[i].blend_control_no_alpha_rgb;
				control &= ~A6XX_RB_MRT_CONTROL_BLEND2;
			}

			OUT_PKT4(ring, REG_A6XX_RB_MRT_CONTROL(i), 1);
			OUT_RING(ring, control);

			OUT_PKT4(ring, REG_A6XX_RB_MRT_BLEND_CONTROL(i), 1);
			OUT_RING(ring, blend_control);
		}

		OUT_PKT4(ring, REG_A6XX_SP_BLEND_CNTL, 1);
		OUT_RING(ring, blend->sp_blend_cntl);
	}

	if (dirty & (FD_DIRTY_BLEND | FD_DIRTY_SAMPLE_MASK)) {
		struct fd6_blend_stateobj *blend = fd6_blend_stateobj(ctx->blend);

		OUT_PKT4(ring, REG_A6XX_RB_BLEND_CNTL, 1);
		OUT_RING(ring, blend->rb_blend_cntl |
				A6XX_RB_BLEND_CNTL_SAMPLE_MASK(ctx->sample_mask));
	}

	if (dirty & FD_DIRTY_BLEND_COLOR) {
		struct pipe_blend_color *bcolor = &ctx->blend_color;

		OUT_PKT4(ring, REG_A6XX_RB_BLEND_RED_F32, 4);
		OUT_RING(ring, A6XX_RB_BLEND_RED_F32(bcolor->color[0]));
		OUT_RING(ring, A6XX_RB_BLEND_GREEN_F32(bcolor->color[1]));
		OUT_RING(ring, A6XX_RB_BLEND_BLUE_F32(bcolor->color[2]));
		OUT_RING(ring, A6XX_RB_BLEND_ALPHA_F32(bcolor->color[3]));
	}

	needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_VERTEX, vp);
	needs_border |= fd6_emit_combined_textures(ring, emit, PIPE_SHADER_FRAGMENT, fp);

	if (needs_border)
		emit_border_color(ctx, ring);

	if (ctx->dirty_shader[PIPE_SHADER_FRAGMENT] &
			(FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)) {
		struct fd_ringbuffer *state =
			fd6_build_ibo_state(ctx, fp, PIPE_SHADER_FRAGMENT);
		struct fd_ringbuffer *obj = fd_submit_new_ringbuffer(
			ctx->batch->submit, 9 * 4, FD_RINGBUFFER_STREAMING);
		const struct ir3_ibo_mapping *mapping = &fp->image_mapping;

		OUT_PKT7(obj, CP_LOAD_STATE6, 3);
		OUT_RING(obj, CP_LOAD_STATE6_0_DST_OFF(0) |
			CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
			CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
			CP_LOAD_STATE6_0_STATE_BLOCK(SB6_IBO) |
			CP_LOAD_STATE6_0_NUM_UNIT(mapping->num_ibo));
		OUT_RB(obj, state);

		OUT_PKT4(obj, REG_A6XX_SP_IBO_LO, 2);
		OUT_RB(obj, state);

		OUT_PKT4(obj, REG_A6XX_SP_IBO_COUNT, 1);
		OUT_RING(obj, mapping->num_ibo);

		fd6_emit_add_group(emit, obj, FD6_GROUP_IBO, 0x7);
		fd_ringbuffer_del(obj);
		fd_ringbuffer_del(state);
	}

	if (emit->num_groups > 0) {
		OUT_PKT7(ring, CP_SET_DRAW_STATE, 3 * emit->num_groups);
		for (unsigned i = 0; i < emit->num_groups; i++) {
			struct fd6_state_group *g = &emit->groups[i];
			unsigned n = fd_ringbuffer_size(g->stateobj) / 4;

			if (n == 0) {
				OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
						CP_SET_DRAW_STATE__0_DISABLE |
						CP_SET_DRAW_STATE__0_ENABLE_MASK(g->enable_mask) |
						CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id));
				OUT_RING(ring, 0x00000000);
				OUT_RING(ring, 0x00000000);
			} else {
				OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(n) |
						CP_SET_DRAW_STATE__0_ENABLE_MASK(g->enable_mask) |
						CP_SET_DRAW_STATE__0_GROUP_ID(g->group_id));
				OUT_RB(ring, g->stateobj);
			}

			fd_ringbuffer_del(g->stateobj);
		}
		emit->num_groups = 0;
	}
}

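/* Emit compute-shader related state (textures, IBOs) for a dispatch.  The
 * TEX_COUNT registers of the 3d stages are zeroed here as well.
 */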
void
fd6_emit_cs_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
		struct ir3_shader_variant *cp)
{
	enum fd_dirty_shader_state dirty = ctx->dirty_shader[PIPE_SHADER_COMPUTE];

	if (dirty & (FD_DIRTY_SHADER_TEX | FD_DIRTY_SHADER_PROG |
			 FD_DIRTY_SHADER_IMAGE | FD_DIRTY_SHADER_SSBO)) {
		struct fd_texture_stateobj *tex = &ctx->tex[PIPE_SHADER_COMPUTE];
		unsigned bcolor_offset = fd6_border_color_offset(ctx, PIPE_SHADER_COMPUTE, tex);

		bool needs_border = fd6_emit_textures(ctx->pipe, ring, PIPE_SHADER_COMPUTE, tex,
				bcolor_offset, cp, ctx);

		if (needs_border)
			emit_border_color(ctx, ring);

		OUT_PKT4(ring, REG_A6XX_SP_VS_TEX_COUNT, 1);
		OUT_RING(ring, 0);

		OUT_PKT4(ring, REG_A6XX_SP_HS_TEX_COUNT, 1);
		OUT_RING(ring, 0);

		OUT_PKT4(ring, REG_A6XX_SP_DS_TEX_COUNT, 1);
		OUT_RING(ring, 0);

		OUT_PKT4(ring, REG_A6XX_SP_GS_TEX_COUNT, 1);
		OUT_RING(ring, 0);

		OUT_PKT4(ring, REG_A6XX_SP_FS_TEX_COUNT, 1);
		OUT_RING(ring, 0);
	}

	if (dirty & (FD_DIRTY_SHADER_SSBO | FD_DIRTY_SHADER_IMAGE)) {
		struct fd_ringbuffer *state =
			fd6_build_ibo_state(ctx, cp, PIPE_SHADER_COMPUTE);
		const struct ir3_ibo_mapping *mapping = &cp->image_mapping;

		OUT_PKT7(ring, CP_LOAD_STATE6_FRAG, 3);
		OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
			CP_LOAD_STATE6_0_STATE_TYPE(ST6_IBO) |
			CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
			CP_LOAD_STATE6_0_STATE_BLOCK(SB6_CS_SHADER) |
			CP_LOAD_STATE6_0_NUM_UNIT(mapping->num_ibo));
		OUT_RB(ring, state);

		OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_LO, 2);
		OUT_RB(ring, state);

		OUT_PKT4(ring, REG_A6XX_SP_CS_IBO_COUNT, 1);
		OUT_RING(ring, mapping->num_ibo);

		fd_ringbuffer_del(state);
	}
}

/* Emit setup at the beginning of a new cmdstream buffer (don't rely on
 * previous state, there could have been a context switch between ioctls):
 */
void
fd6_emit_restore(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
	//struct fd_context *ctx = batch->ctx;

	fd6_cache_inv(batch, ring);

	OUT_PKT4(ring, REG_A6XX_HLSQ_UPDATE_CNTL, 1);
	OUT_RING(ring, 0xfffff);

/*
t7              opcode: CP_PERFCOUNTER_ACTION (50) (4 dwords)
0000000500024048:               70d08003 00000000 001c5000 00000005
t7              opcode: CP_PERFCOUNTER_ACTION (50) (4 dwords)
0000000500024058:               70d08003 00000010 001c7000 00000005

t7              opcode: CP_WAIT_FOR_IDLE (26) (1 dwords)
0000000500024068:               70268000
*/

	WRITE(REG_A6XX_RB_CCU_CNTL, 0x7c400004);
	WRITE(REG_A6XX_RB_UNKNOWN_8E04, 0x00100000);
	WRITE(REG_A6XX_SP_UNKNOWN_AE04, 0x8);
	WRITE(REG_A6XX_SP_UNKNOWN_AE00, 0);
	WRITE(REG_A6XX_SP_UNKNOWN_AE0F, 0x3f);
	WRITE(REG_A6XX_SP_UNKNOWN_B605, 0x44);
	WRITE(REG_A6XX_SP_UNKNOWN_B600, 0x100000);
	WRITE(REG_A6XX_HLSQ_UNKNOWN_BE00, 0x80);
	WRITE(REG_A6XX_HLSQ_UNKNOWN_BE01, 0);

	WRITE(REG_A6XX_VPC_UNKNOWN_9600, 0);
	WRITE(REG_A6XX_GRAS_UNKNOWN_8600, 0x880);
	WRITE(REG_A6XX_HLSQ_UNKNOWN_BE04, 0x80000);
	WRITE(REG_A6XX_SP_UNKNOWN_AE03, 0x1430);
	WRITE(REG_A6XX_SP_IBO_COUNT, 0);
	WRITE(REG_A6XX_SP_UNKNOWN_B182, 0);
	WRITE(REG_A6XX_HLSQ_UNKNOWN_BB11, 0);
	WRITE(REG_A6XX_UCHE_UNKNOWN_0E12, 0x3200000);
	WRITE(REG_A6XX_UCHE_CLIENT_PF, 4);
	WRITE(REG_A6XX_RB_UNKNOWN_8E01, 0x1);
	WRITE(REG_A6XX_SP_UNKNOWN_AB00, 0x5);
	WRITE(REG_A6XX_VFD_UNKNOWN_A009, 0x00000001);
	WRITE(REG_A6XX_RB_UNKNOWN_8811, 0x00000010);
	WRITE(REG_A6XX_PC_MODE_CNTL, 0x1f);

	OUT_PKT4(ring, REG_A6XX_RB_SRGB_CNTL, 1);
	OUT_RING(ring, 0);

	WRITE(REG_A6XX_GRAS_UNKNOWN_8101, 0);
	WRITE(REG_A6XX_GRAS_SAMPLE_CNTL, 0);
	WRITE(REG_A6XX_GRAS_UNKNOWN_8110, 0x2);

	WRITE(REG_A6XX_RB_RENDER_CONTROL0, 0x401);
	WRITE(REG_A6XX_RB_RENDER_CONTROL1, 0);
	WRITE(REG_A6XX_RB_FS_OUTPUT_CNTL0, 0);
	WRITE(REG_A6XX_RB_SAMPLE_CNTL, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_8818, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_8819, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_881A, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_881B, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_881C, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_881D, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_881E, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_88F0, 0);

	WRITE(REG_A6XX_VPC_UNKNOWN_9101, 0xffff00);
	WRITE(REG_A6XX_VPC_UNKNOWN_9107, 0);

	WRITE(REG_A6XX_VPC_UNKNOWN_9236,
		  A6XX_VPC_UNKNOWN_9236_POINT_COORD_INVERT(0));
	WRITE(REG_A6XX_VPC_UNKNOWN_9300, 0);

	WRITE(REG_A6XX_VPC_SO_OVERRIDE, A6XX_VPC_SO_OVERRIDE_SO_DISABLE);

	WRITE(REG_A6XX_PC_UNKNOWN_9801, 0);
	WRITE(REG_A6XX_PC_UNKNOWN_9806, 0);
	WRITE(REG_A6XX_PC_UNKNOWN_9980, 0);

	WRITE(REG_A6XX_PC_UNKNOWN_9B06, 0);
	WRITE(REG_A6XX_PC_UNKNOWN_9B06, 0);

	WRITE(REG_A6XX_SP_UNKNOWN_A81B, 0);

	WRITE(REG_A6XX_SP_UNKNOWN_B183, 0);

	WRITE(REG_A6XX_GRAS_UNKNOWN_8099, 0);
	WRITE(REG_A6XX_GRAS_UNKNOWN_809B, 0);
	WRITE(REG_A6XX_GRAS_UNKNOWN_80A0, 2);
	WRITE(REG_A6XX_GRAS_UNKNOWN_80AF, 0);
	WRITE(REG_A6XX_VPC_UNKNOWN_9210, 0);
	WRITE(REG_A6XX_VPC_UNKNOWN_9211, 0);
	WRITE(REG_A6XX_VPC_UNKNOWN_9602, 0);
	WRITE(REG_A6XX_PC_UNKNOWN_9981, 0x3);
	WRITE(REG_A6XX_PC_UNKNOWN_9E72, 0);
	WRITE(REG_A6XX_VPC_UNKNOWN_9108, 0x3);
	WRITE(REG_A6XX_SP_TP_UNKNOWN_B304, 0);
	/* NOTE blob seems to (mostly?) use 0xb2 for SP_TP_UNKNOWN_B309
	 * but this seems to kill texture gather offsets.
	 */
	WRITE(REG_A6XX_SP_TP_UNKNOWN_B309, 0xa2);
	WRITE(REG_A6XX_RB_UNKNOWN_8804, 0);
	WRITE(REG_A6XX_GRAS_UNKNOWN_80A4, 0);
	WRITE(REG_A6XX_GRAS_UNKNOWN_80A5, 0);
	WRITE(REG_A6XX_GRAS_UNKNOWN_80A6, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_8805, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_8806, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_8878, 0);
	WRITE(REG_A6XX_RB_UNKNOWN_8879, 0);
	WRITE(REG_A6XX_HLSQ_CONTROL_5_REG, 0xfc);

	emit_marker6(ring, 7);

	OUT_PKT4(ring, REG_A6XX_VFD_MODE_CNTL, 1);
	OUT_RING(ring, 0x00000000);   /* VFD_MODE_CNTL */

	WRITE(REG_A6XX_VFD_UNKNOWN_A008, 0);

	OUT_PKT4(ring, REG_A6XX_PC_MODE_CNTL, 1);
	OUT_RING(ring, 0x0000001f);   /* PC_MODE_CNTL */

	/* we don't use this yet.. probably best to disable.. */
	OUT_PKT7(ring, CP_SET_DRAW_STATE, 3);
	OUT_RING(ring, CP_SET_DRAW_STATE__0_COUNT(0) |
			CP_SET_DRAW_STATE__0_DISABLE_ALL_GROUPS |
			CP_SET_DRAW_STATE__0_GROUP_ID(0));
	OUT_RING(ring, CP_SET_DRAW_STATE__1_ADDR_LO(0));
	OUT_RING(ring, CP_SET_DRAW_STATE__2_ADDR_HI(0));

	OUT_PKT4(ring, REG_A6XX_VPC_SO_BUF_CNTL, 1);
	OUT_RING(ring, 0x00000000);   /* VPC_SO_BUF_CNTL */

	OUT_PKT4(ring, REG_A6XX_SP_HS_CTRL_REG0, 1);
	OUT_RING(ring, 0x00000000);

	OUT_PKT4(ring, REG_A6XX_SP_GS_CTRL_REG0, 1);
	OUT_RING(ring, 0x00000000);

	OUT_PKT4(ring, REG_A6XX_GRAS_LRZ_CNTL, 1);
	OUT_RING(ring, 0x00000000);

	OUT_PKT4(ring, REG_A6XX_RB_LRZ_CNTL, 1);
	OUT_RING(ring, 0x00000000);
}

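/* Copy sizedwords dwords from src to dst, one dword per CP_MEM_TO_MEM
 * packet.
 */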
static void
fd6_mem_to_mem(struct fd_ringbuffer *ring, struct pipe_resource *dst,
		unsigned dst_off, struct pipe_resource *src, unsigned src_off,
		unsigned sizedwords)
{
	struct fd_bo *src_bo = fd_resource(src)->bo;
	struct fd_bo *dst_bo = fd_resource(dst)->bo;
	unsigned i;

	for (i = 0; i < sizedwords; i++) {
		OUT_PKT7(ring, CP_MEM_TO_MEM, 5);
		OUT_RING(ring, 0x00000000);
		OUT_RELOCW(ring, dst_bo, dst_off, 0, 0);
		OUT_RELOC (ring, src_bo, src_off, 0, 0);

		dst_off += 4;
		src_off += 4;
	}
}

/* this is *almost* the same as fd6_cache_flush().. which I guess
 * could be re-worked to be something a bit more generic w/ param
 * indicating what needs to be flushed..  although that would mean
 * figuring out which events trigger what state to flush..
 */
static void
fd6_framebuffer_barrier(struct fd_context *ctx)
{
	struct fd6_context *fd6_ctx = fd6_context(ctx);
	struct fd_batch *batch = ctx->batch;
	struct fd_ringbuffer *ring = batch->draw;
	unsigned seqno;

	seqno = fd6_event_write(batch, ring, CACHE_FLUSH_AND_INV_EVENT, true);

	OUT_PKT7(ring, CP_WAIT_REG_MEM, 6);
	OUT_RING(ring, 0x00000013);
	OUT_RELOC(ring, fd6_ctx->blit_mem, 0, 0, 0);
	OUT_RING(ring, seqno);
	OUT_RING(ring, 0xffffffff);
	OUT_RING(ring, 0x00000010);

	fd6_event_write(batch, ring, UNK_1D, true);
	fd6_event_write(batch, ring, UNK_1C, true);

	seqno = fd6_event_write(batch, ring, CACHE_FLUSH_TS, true);

	fd6_event_write(batch, ring, 0x31, false);

	OUT_PKT7(ring, CP_UNK_A6XX_14, 4);
	OUT_RING(ring, 0x00000000);
	OUT_RELOC(ring, fd6_ctx->blit_mem, 0, 0, 0);
	OUT_RING(ring, seqno);
}

void
fd6_emit_init(struct pipe_context *pctx)
{
	struct fd_context *ctx = fd_context(pctx);
	ctx->emit_const = fd6_emit_const;
	ctx->emit_const_bo = fd6_emit_const_bo;
	ctx->emit_ib = fd6_emit_ib;
	ctx->mem_to_mem = fd6_mem_to_mem;
	ctx->framebuffer_barrier = fd6_framebuffer_barrier;
}