/*
 * Copyright (C) 2016 Rob Clark <robclark@freedesktop.org>
 * Copyright © 2018 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_format.h"
#include "util/bitset.h"

#include "freedreno_program.h"

#include "fd6_program.h"
#include "fd6_emit.h"
#include "fd6_texture.h"
#include "fd6_format.h"

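/* The gallium shader CSOs below are thin wrappers around ir3_shader;
 * the fd6 specific state is built later from the compiled variants:
 */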
static struct ir3_shader *
create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state *cso,
		gl_shader_stage type)
{
	struct fd_context *ctx = fd_context(pctx);
	struct ir3_compiler *compiler = ctx->screen->compiler;
	return ir3_shader_create(compiler, cso, type, &ctx->debug, pctx->screen);
}

static void *
fd6_fp_state_create(struct pipe_context *pctx,
		const struct pipe_shader_state *cso)
{
	return create_shader_stateobj(pctx, cso, MESA_SHADER_FRAGMENT);
}

static void
fd6_fp_state_delete(struct pipe_context *pctx, void *hwcso)
{
	struct ir3_shader *so = hwcso;
	struct fd_context *ctx = fd_context(pctx);
	ir3_cache_invalidate(fd6_context(ctx)->shader_cache, hwcso);
	ir3_shader_destroy(so);
}

static void *
fd6_vp_state_create(struct pipe_context *pctx,
		const struct pipe_shader_state *cso)
{
	return create_shader_stateobj(pctx, cso, MESA_SHADER_VERTEX);
}

static void
fd6_vp_state_delete(struct pipe_context *pctx, void *hwcso)
{
	struct ir3_shader *so = hwcso;
	struct fd_context *ctx = fd_context(pctx);
	ir3_cache_invalidate(fd6_context(ctx)->shader_cache, hwcso);
	ir3_shader_destroy(so);
}

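/* Emit the shader program via CP_LOAD_STATE6.  With FD_DBG_DIRECT the
 * instructions are inlined into the cmdstream, otherwise they are
 * referenced indirectly via a reloc to the shader's bo:
 */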
void
fd6_emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so)
{
	const struct ir3_info *si = &so->info;
	enum a6xx_state_block sb = fd6_stage2shadersb(so->type);
	enum a6xx_state_src src;
	uint32_t i, sz, *bin;
	unsigned opcode;

	if (fd_mesa_debug & FD_DBG_DIRECT) {
		sz = si->sizedwords;
		src = SS6_DIRECT;
		bin = fd_bo_map(so->bo);
	} else {
		sz = 0;
		src = SS6_INDIRECT;
		bin = NULL;
	}

	switch (so->type) {
	case MESA_SHADER_VERTEX:
		opcode = CP_LOAD_STATE6_GEOM;
		break;
	case MESA_SHADER_FRAGMENT:
	case MESA_SHADER_COMPUTE:
	case MESA_SHADER_KERNEL:
		opcode = CP_LOAD_STATE6_FRAG;
		break;
	default:
		unreachable("bad shader type");
	}

	OUT_PKT7(ring, opcode, 3 + sz);
	OUT_RING(ring, CP_LOAD_STATE6_0_DST_OFF(0) |
			CP_LOAD_STATE6_0_STATE_TYPE(ST6_SHADER) |
			CP_LOAD_STATE6_0_STATE_SRC(src) |
			CP_LOAD_STATE6_0_STATE_BLOCK(sb) |
			CP_LOAD_STATE6_0_NUM_UNIT(so->instrlen));
	if (bin) {
		OUT_RING(ring, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
		OUT_RING(ring, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
	} else {
		OUT_RELOCD(ring, so->bo, 0, 0, 0);
	}

	/* For all its cleverness, coverity is sometimes rather dull and
	 * doesn't realize that bin==NULL implies sz==0:
	 */
	assume(bin || (sz == 0));

	for (i = 0; i < sz; i++) {
		OUT_RING(ring, bin[i]);
	}
}

/* Add any missing varyings needed for stream-out.  Otherwise varyings not
 * used by the fragment shader will be stripped out.
 */
static void
link_stream_out(struct ir3_shader_linkage *l, const struct ir3_shader_variant *v)
{
	const struct ir3_stream_output_info *strmout = &v->shader->stream_output;

	/*
	 * First, any stream-out varyings not already in linkage map (ie. also
	 * consumed by frag shader) need to be added:
	 */
	for (unsigned i = 0; i < strmout->num_outputs; i++) {
		const struct ir3_stream_output *out = &strmout->output[i];
		unsigned k = out->register_index;
		unsigned compmask =
			(1 << (out->num_components + out->start_component)) - 1;
		unsigned idx, nextloc = 0;

		/* psize/pos need to be the last entries in the linkage map, and
		 * will get added after link_stream_out, so skip over them:
		 */
		if ((v->outputs[k].slot == VARYING_SLOT_PSIZ) ||
				(v->outputs[k].slot == VARYING_SLOT_POS))
			continue;

		for (idx = 0; idx < l->cnt; idx++) {
			if (l->var[idx].regid == v->outputs[k].regid)
				break;
			nextloc = MAX2(nextloc, l->var[idx].loc + 4);
		}

		/* add if not already in linkage map: */
		if (idx == l->cnt)
			ir3_link_add(l, v->outputs[k].regid, compmask, nextloc);

		/* expand component-mask if needed, ie streaming out all components
		 * but frag shader doesn't consume all components:
		 */
		if (compmask & ~l->var[idx].compmask) {
			l->var[idx].compmask |= compmask;
			l->max_loc = MAX2(l->max_loc,
				l->var[idx].loc + util_last_bit(l->var[idx].compmask));
		}
	}
}

static void
setup_stream_out(struct fd6_program_state *state, const struct ir3_shader_variant *v,
		struct ir3_shader_linkage *l)
{
	const struct ir3_stream_output_info *strmout = &v->shader->stream_output;
	struct fd6_streamout_state *tf = &state->tf;

	memset(tf, 0, sizeof(*tf));

	tf->prog_count = align(l->max_loc, 2) / 2;

	debug_assert(tf->prog_count < ARRAY_SIZE(tf->prog));

	for (unsigned i = 0; i < strmout->num_outputs; i++) {
		const struct ir3_stream_output *out = &strmout->output[i];
		unsigned k = out->register_index;
		unsigned idx;

		tf->ncomp[out->output_buffer] += out->num_components;

		/* the linkage map is sorted in the order the frag shader wants
		 * things, so we have to search for the matching entry:
		 */
		for (idx = 0; idx < l->cnt; idx++)
			if (l->var[idx].regid == v->outputs[k].regid)
				break;

		debug_assert(idx < l->cnt);

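		/* each VPC_SO_PROG dword describes two output components, the
		 * A half for even locations and the B half for odd ones, hence
		 * the loc/2 indexing below:
		 */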
		for (unsigned j = 0; j < out->num_components; j++) {
			unsigned c   = j + out->start_component;
			unsigned loc = l->var[idx].loc + c;
			unsigned off = j + out->dst_offset;  /* in dwords */

			if (loc & 1) {
				tf->prog[loc/2] |= A6XX_VPC_SO_PROG_B_EN |
						A6XX_VPC_SO_PROG_B_BUF(out->output_buffer) |
						A6XX_VPC_SO_PROG_B_OFF(off * 4);
			} else {
				tf->prog[loc/2] |= A6XX_VPC_SO_PROG_A_EN |
						A6XX_VPC_SO_PROG_A_BUF(out->output_buffer) |
						A6XX_VPC_SO_PROG_A_OFF(off * 4);
			}
		}
	}

	tf->vpc_so_buf_cntl = A6XX_VPC_SO_BUF_CNTL_ENABLE |
			COND(tf->ncomp[0] > 0, A6XX_VPC_SO_BUF_CNTL_BUF0) |
			COND(tf->ncomp[1] > 0, A6XX_VPC_SO_BUF_CNTL_BUF1) |
			COND(tf->ncomp[2] > 0, A6XX_VPC_SO_BUF_CNTL_BUF2) |
			COND(tf->ncomp[3] > 0, A6XX_VPC_SO_BUF_CNTL_BUF3);
}

struct stage {
	const struct ir3_shader_variant *v;
	const struct ir3_info *i;
	/* const sizes are in units of vec4, aligned to 4*vec4 */
	uint16_t constlen;
	/* instr sizes are in units of 16 instructions */
	uint16_t instrlen;
};

enum {
	VS = 0,
	FS = 1,
	HS = 2,
	DS = 3,
	GS = 4,
	MAX_STAGES
};

static void
setup_stages(struct fd6_program_state *state, struct stage *s, bool binning_pass)
{
	unsigned i;

	if (binning_pass) {
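		/* the binning pass runs the stripped-down binning VS variant
		 * with a dummy (empty) FS:
		 */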
		static const struct ir3_shader_variant dummy_fs = {0};

		s[VS].v = state->bs;
		s[FS].v = &dummy_fs;
	} else {
		s[VS].v = state->vs;
		s[FS].v = state->fs;
	}

	s[HS].v = s[DS].v = s[GS].v = NULL;  /* for now */

	for (i = 0; i < MAX_STAGES; i++) {
		if (s[i].v) {
			s[i].i = &s[i].v->info;
			s[i].constlen = align(s[i].v->constlen, 4);
			/* instrlen is already in units of 16 instr.. although
			 * probably we should ditch that and not make the compiler
			 * care about instruction group size of a3xx vs a5xx
			 */
			s[i].instrlen = s[i].v->instrlen;
		} else {
			s[i].i = NULL;
			s[i].constlen = 0;
			s[i].instrlen = 0;
		}
	}
}

static inline uint32_t
next_regid(uint32_t reg, uint32_t increment)
{
	if (reg == regid(63,0))
		return regid(63,0);
	else
		return reg + increment;
}

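/* regid(63,0) is the sentinel value for an unallocated/unused register: */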
#define VALIDREG(r)      ((r) != regid(63,0))
#define CONDREG(r, val)  COND(VALIDREG(r), (val))

static void
setup_stateobj(struct fd_ringbuffer *ring, struct fd6_program_state *state,
		const struct ir3_shader_key *key, bool binning_pass)
{
	struct stage s[MAX_STAGES];
	uint32_t pos_regid, psize_regid, color_regid[8], posz_regid;
	uint32_t face_regid, coord_regid, zwcoord_regid, samp_id_regid;
	uint32_t smask_in_regid, smask_regid;
	uint32_t vertex_regid, instance_regid;
	uint32_t ij_pix_regid, ij_samp_regid, ij_cent_regid, ij_size_regid;
	enum a3xx_threadsize fssz;
	uint8_t psize_loc = ~0;
	int i, j;

	setup_stages(state, s, binning_pass);

	bool sample_shading = s[FS].v->per_samp | key->sample_shading;

	fssz = FOUR_QUADS;

	pos_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_POS);
	psize_regid = ir3_find_output_regid(s[VS].v, VARYING_SLOT_PSIZ);
	vertex_regid = ir3_find_sysval_regid(s[VS].v, SYSTEM_VALUE_VERTEX_ID);
	instance_regid = ir3_find_sysval_regid(s[VS].v, SYSTEM_VALUE_INSTANCE_ID);

	if (s[FS].v->color0_mrt) {
		color_regid[0] = color_regid[1] = color_regid[2] = color_regid[3] =
		color_regid[4] = color_regid[5] = color_regid[6] = color_regid[7] =
			ir3_find_output_regid(s[FS].v, FRAG_RESULT_COLOR);
	} else {
		color_regid[0] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA0);
		color_regid[1] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA1);
		color_regid[2] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA2);
		color_regid[3] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA3);
		color_regid[4] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA4);
		color_regid[5] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA5);
		color_regid[6] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA6);
		color_regid[7] = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DATA7);
	}

	samp_id_regid   = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_ID);
	smask_in_regid  = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_SAMPLE_MASK_IN);
	face_regid      = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRONT_FACE);
	coord_regid     = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_FRAG_COORD);
	zwcoord_regid   = next_regid(coord_regid, 2);
	ij_pix_regid    = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_PIXEL);
	ij_samp_regid   = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_SAMPLE);
	ij_cent_regid   = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_CENTROID);
	ij_size_regid   = ir3_find_sysval_regid(s[FS].v, SYSTEM_VALUE_BARYCENTRIC_SIZE);
	posz_regid      = ir3_find_output_regid(s[FS].v, FRAG_RESULT_DEPTH);
	smask_regid     = ir3_find_output_regid(s[FS].v, FRAG_RESULT_SAMPLE_MASK);

	/* we can't write gl_SampleMask for !msaa..  if bit 0 is zero then we
	 * end up masking the single sample!
	 */
	if (!key->msaa)
		smask_regid = regid(63, 0);

	/* we could probably divide this up into things that need to be
	 * emitted if frag-prog is dirty vs if vert-prog is dirty..
	 */

	OUT_PKT4(ring, REG_A6XX_SP_VS_CONFIG, 2);
	OUT_RING(ring, COND(s[VS].v, A6XX_SP_VS_CONFIG_ENABLED) |
			 A6XX_SP_VS_CONFIG_NIBO(s[VS].v->image_mapping.num_ibo) |
			 A6XX_SP_VS_CONFIG_NTEX(s[VS].v->num_samp) |
			 A6XX_SP_VS_CONFIG_NSAMP(s[VS].v->num_samp));     /* SP_VS_CONFIG */
	OUT_RING(ring, s[VS].instrlen);    /* SP_VS_INSTRLEN */

	OUT_PKT4(ring, REG_A6XX_SP_HS_UNKNOWN_A831, 1);
	OUT_RING(ring, 0);

	OUT_PKT4(ring, REG_A6XX_SP_HS_CONFIG, 2);
	OUT_RING(ring, COND(s[HS].v, A6XX_SP_HS_CONFIG_ENABLED)); /* SP_HS_CONFIG */
	OUT_RING(ring, s[HS].instrlen);    /* SP_HS_INSTRLEN */

	OUT_PKT4(ring, REG_A6XX_SP_DS_CONFIG, 2);
	OUT_RING(ring, COND(s[DS].v, A6XX_SP_DS_CONFIG_ENABLED)); /* SP_DS_CONFIG */
	OUT_RING(ring, s[DS].instrlen);    /* SP_DS_INSTRLEN */

	OUT_PKT4(ring, REG_A6XX_SP_GS_UNKNOWN_A871, 1);
	OUT_RING(ring, 0);

	OUT_PKT4(ring, REG_A6XX_SP_GS_CONFIG, 2);
	OUT_RING(ring, COND(s[GS].v, A6XX_SP_GS_CONFIG_ENABLED)); /* SP_GS_CONFIG */
	OUT_RING(ring, s[GS].instrlen);    /* SP_GS_INSTRLEN */

	OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A99E, 1);
	OUT_RING(ring, 0x7fc0);

	OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A9A8, 1);
	OUT_RING(ring, 0);

	OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_AB00, 1);
	OUT_RING(ring, 0x5);

	OUT_PKT4(ring, REG_A6XX_SP_FS_CONFIG, 2);
	OUT_RING(ring, COND(s[FS].v, A6XX_SP_FS_CONFIG_ENABLED) |
			 A6XX_SP_FS_CONFIG_NIBO(s[FS].v->image_mapping.num_ibo) |
			 A6XX_SP_FS_CONFIG_NTEX(s[FS].v->num_samp) |
			 A6XX_SP_FS_CONFIG_NSAMP(s[FS].v->num_samp));     /* SP_FS_CONFIG */
	OUT_RING(ring, s[FS].instrlen);    /* SP_FS_INSTRLEN */

	OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_CNTL0, 1);
	OUT_RING(ring, A6XX_SP_FS_OUTPUT_CNTL0_DEPTH_REGID(posz_regid) |
			 A6XX_SP_FS_OUTPUT_CNTL0_SAMPMASK_REGID(smask_regid) |
			 0xfc000000);

	OUT_PKT4(ring, REG_A6XX_HLSQ_VS_CNTL, 4);
	OUT_RING(ring, A6XX_HLSQ_VS_CNTL_CONSTLEN(s[VS].constlen) |
			 A6XX_HLSQ_VS_CNTL_ENABLED);
	OUT_RING(ring, A6XX_HLSQ_HS_CNTL_CONSTLEN(s[HS].constlen));    /* HLSQ_HS_CONSTLEN */
	OUT_RING(ring, A6XX_HLSQ_DS_CNTL_CONSTLEN(s[DS].constlen));    /* HLSQ_DS_CONSTLEN */
	OUT_RING(ring, A6XX_HLSQ_GS_CNTL_CONSTLEN(s[GS].constlen));    /* HLSQ_GS_CONSTLEN */

	OUT_PKT4(ring, REG_A6XX_HLSQ_FS_CNTL, 1);
	OUT_RING(ring, A6XX_HLSQ_FS_CNTL_CONSTLEN(s[FS].constlen) |
			 A6XX_HLSQ_FS_CNTL_ENABLED);

	OUT_PKT4(ring, REG_A6XX_SP_VS_CTRL_REG0, 1);
	OUT_RING(ring, A6XX_SP_VS_CTRL_REG0_THREADSIZE(fssz) |
			A6XX_SP_VS_CTRL_REG0_FULLREGFOOTPRINT(s[VS].i->max_reg + 1) |
			A6XX_SP_VS_CTRL_REG0_MERGEDREGS |
			A6XX_SP_VS_CTRL_REG0_BRANCHSTACK(s[VS].v->branchstack) |
			COND(s[VS].v->need_pixlod, A6XX_SP_VS_CTRL_REG0_PIXLODENABLE));

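	/* Build the linkage map describing which VS output registers feed
	 * which FS input locations:
	 */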
	struct ir3_shader_linkage l = {0};
	ir3_link_shaders(&l, s[VS].v, s[FS].v);

	if ((s[VS].v->shader->stream_output.num_outputs > 0) && !binning_pass)
		link_stream_out(&l, s[VS].v);

	BITSET_DECLARE(varbs, 128) = {0};
	uint32_t *varmask = (uint32_t *)varbs;

	for (i = 0; i < l.cnt; i++)
		for (j = 0; j < util_last_bit(l.var[i].compmask); j++)
			BITSET_SET(varbs, l.var[i].loc + j);

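	/* the VPC_VAR registers are a *disable* mask, so turn off everything
	 * not covered by the linkage map:
	 */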
	OUT_PKT4(ring, REG_A6XX_VPC_VAR_DISABLE(0), 4);
	OUT_RING(ring, ~varmask[0]);  /* VPC_VAR[0].DISABLE */
	OUT_RING(ring, ~varmask[1]);  /* VPC_VAR[1].DISABLE */
	OUT_RING(ring, ~varmask[2]);  /* VPC_VAR[2].DISABLE */
	OUT_RING(ring, ~varmask[3]);  /* VPC_VAR[3].DISABLE */

	/* a6xx appends pos/psize to end of the linkage map: */
	if (VALIDREG(pos_regid))
		ir3_link_add(&l, pos_regid, 0xf, l.max_loc);

	if (VALIDREG(psize_regid)) {
		psize_loc = l.max_loc;
		ir3_link_add(&l, psize_regid, 0x1, l.max_loc);
	}

	if ((s[VS].v->shader->stream_output.num_outputs > 0) && !binning_pass) {
		setup_stream_out(state, s[VS].v, &l);
	}

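	/* each SP_VS_OUT_REG packs two linkage-map entries (A/B halves): */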
	for (i = 0, j = 0; (i < 16) && (j < l.cnt); i++) {
		uint32_t reg = 0;

		OUT_PKT4(ring, REG_A6XX_SP_VS_OUT_REG(i), 1);

		reg |= A6XX_SP_VS_OUT_REG_A_REGID(l.var[j].regid);
		reg |= A6XX_SP_VS_OUT_REG_A_COMPMASK(l.var[j].compmask);
		j++;

		reg |= A6XX_SP_VS_OUT_REG_B_REGID(l.var[j].regid);
		reg |= A6XX_SP_VS_OUT_REG_B_COMPMASK(l.var[j].compmask);
		j++;

		OUT_RING(ring, reg);
	}

	for (i = 0, j = 0; (i < 8) && (j < l.cnt); i++) {
		uint32_t reg = 0;

		OUT_PKT4(ring, REG_A6XX_SP_VS_VPC_DST_REG(i), 1);

		reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC0(l.var[j++].loc);
		reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC1(l.var[j++].loc);
		reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC2(l.var[j++].loc);
		reg |= A6XX_SP_VS_VPC_DST_REG_OUTLOC3(l.var[j++].loc);

		OUT_RING(ring, reg);
	}

	OUT_PKT4(ring, REG_A6XX_SP_VS_OBJ_START_LO, 2);
	OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0);  /* SP_VS_OBJ_START_LO/HI */

	if (s[VS].instrlen)
		fd6_emit_shader(ring, s[VS].v);

	// TODO depending on other bits in this reg (if any) set somewhere else?
#if 0
	OUT_PKT4(ring, REG_A6XX_PC_PRIM_VTX_CNTL, 1);
	OUT_RING(ring, COND(s[VS].v->writes_psize, A6XX_PC_PRIM_VTX_CNTL_PSIZE));
#endif

	OUT_PKT4(ring, REG_A6XX_SP_PRIMITIVE_CNTL, 1);
	OUT_RING(ring, A6XX_SP_PRIMITIVE_CNTL_VSOUT(l.cnt));

	bool enable_varyings = s[FS].v->total_in > 0;

	OUT_PKT4(ring, REG_A6XX_VPC_CNTL_0, 1);
	OUT_RING(ring, A6XX_VPC_CNTL_0_NUMNONPOSVAR(s[FS].v->total_in) |
			 COND(enable_varyings, A6XX_VPC_CNTL_0_VARYING) |
			 0xff00ff00);

	OUT_PKT4(ring, REG_A6XX_PC_PRIMITIVE_CNTL_1, 1);
	OUT_RING(ring, A6XX_PC_PRIMITIVE_CNTL_1_STRIDE_IN_VPC(l.max_loc) |
			 CONDREG(psize_regid, 0x100));

	if (binning_pass) {
		OUT_PKT4(ring, REG_A6XX_SP_FS_OBJ_START_LO, 2);
		OUT_RING(ring, 0x00000000);    /* SP_FS_OBJ_START_LO */
		OUT_RING(ring, 0x00000000);    /* SP_FS_OBJ_START_HI */
	} else {
		OUT_PKT4(ring, REG_A6XX_SP_FS_OBJ_START_LO, 2);
		OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0);  /* SP_FS_OBJ_START_LO/HI */
	}

	OUT_PKT4(ring, REG_A6XX_HLSQ_CONTROL_1_REG, 5);
	OUT_RING(ring, 0x7);                /* XXX */
	OUT_RING(ring, A6XX_HLSQ_CONTROL_2_REG_FACEREGID(face_regid) |
			 A6XX_HLSQ_CONTROL_2_REG_SAMPLEID(samp_id_regid) |
			 A6XX_HLSQ_CONTROL_2_REG_SAMPLEMASK(smask_in_regid) |
			 A6XX_HLSQ_CONTROL_2_REG_SIZE(ij_size_regid));
	OUT_RING(ring, A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_PIXEL(ij_pix_regid) |
			 A6XX_HLSQ_CONTROL_3_REG_BARY_IJ_CENTROID(ij_cent_regid) |
			 0xfc00fc00);               /* XXX */
	OUT_RING(ring, A6XX_HLSQ_CONTROL_4_REG_XYCOORDREGID(coord_regid) |
			 A6XX_HLSQ_CONTROL_4_REG_ZWCOORDREGID(zwcoord_regid) |
			 A6XX_HLSQ_CONTROL_4_REG_BARY_IJ_PIXEL_PERSAMP(ij_samp_regid) |
			 0x0000fc00);               /* XXX */
	OUT_RING(ring, 0xfc);              /* XXX */

	OUT_PKT4(ring, REG_A6XX_HLSQ_UNKNOWN_B980, 1);
	OUT_RING(ring, enable_varyings ? 3 : 1);

	OUT_PKT4(ring, REG_A6XX_SP_FS_CTRL_REG0, 1);
	OUT_RING(ring, A6XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) |
			COND(enable_varyings, A6XX_SP_FS_CTRL_REG0_VARYING) |
			COND(s[FS].v->frag_coord, A6XX_SP_FS_CTRL_REG0_VARYING) |
			0x1000000 |
			A6XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) |
			A6XX_SP_FS_CTRL_REG0_MERGEDREGS |
			A6XX_SP_FS_CTRL_REG0_BRANCHSTACK(s[FS].v->branchstack) |
			COND(s[FS].v->need_pixlod, A6XX_SP_FS_CTRL_REG0_PIXLODENABLE));

	OUT_PKT4(ring, REG_A6XX_SP_UNKNOWN_A982, 1);
	OUT_RING(ring, 0);        /* XXX */

	OUT_PKT4(ring, REG_A6XX_HLSQ_UPDATE_CNTL, 1);
	OUT_RING(ring, 0xff);        /* XXX */

	OUT_PKT4(ring, REG_A6XX_VPC_GS_SIV_CNTL, 1);
	OUT_RING(ring, 0x0000ffff);        /* XXX */

#if 0
	OUT_PKT4(ring, REG_A6XX_SP_SP_CNTL, 1);
	OUT_RING(ring, 0x00000010);        /* XXX */
#endif

	OUT_PKT4(ring, REG_A6XX_GRAS_CNTL, 1);
	OUT_RING(ring,
			CONDREG(ij_pix_regid, A6XX_GRAS_CNTL_VARYING) |
			CONDREG(ij_cent_regid, A6XX_GRAS_CNTL_CENTROID) |
			CONDREG(ij_samp_regid, A6XX_GRAS_CNTL_PERSAMP_VARYING) |
			COND(VALIDREG(ij_size_regid) && !sample_shading, A6XX_GRAS_CNTL_SIZE) |
			COND(VALIDREG(ij_size_regid) &&  sample_shading, A6XX_GRAS_CNTL_SIZE_PERSAMP) |
			COND(s[FS].v->frag_coord,
					A6XX_GRAS_CNTL_SIZE |
					A6XX_GRAS_CNTL_XCOORD |
					A6XX_GRAS_CNTL_YCOORD |
					A6XX_GRAS_CNTL_ZCOORD |
					A6XX_GRAS_CNTL_WCOORD) |
			COND(s[FS].v->frag_face, A6XX_GRAS_CNTL_SIZE));

	OUT_PKT4(ring, REG_A6XX_RB_RENDER_CONTROL0, 2);
	OUT_RING(ring,
			CONDREG(ij_pix_regid, A6XX_RB_RENDER_CONTROL0_VARYING) |
			CONDREG(ij_cent_regid, A6XX_RB_RENDER_CONTROL0_CENTROID) |
			CONDREG(ij_samp_regid, A6XX_RB_RENDER_CONTROL0_PERSAMP_VARYING) |
			COND(enable_varyings, A6XX_RB_RENDER_CONTROL0_UNK10) |
			COND(VALIDREG(ij_size_regid) && !sample_shading, A6XX_RB_RENDER_CONTROL0_SIZE) |
			COND(VALIDREG(ij_size_regid) &&  sample_shading, A6XX_RB_RENDER_CONTROL0_SIZE_PERSAMP) |
			COND(s[FS].v->frag_coord,
					A6XX_RB_RENDER_CONTROL0_SIZE |
					A6XX_RB_RENDER_CONTROL0_XCOORD |
					A6XX_RB_RENDER_CONTROL0_YCOORD |
					A6XX_RB_RENDER_CONTROL0_ZCOORD |
					A6XX_RB_RENDER_CONTROL0_WCOORD) |
			COND(s[FS].v->frag_face, A6XX_RB_RENDER_CONTROL0_SIZE));

	OUT_RING(ring,
			CONDREG(smask_in_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEMASK) |
			CONDREG(samp_id_regid, A6XX_RB_RENDER_CONTROL1_SAMPLEID) |
			CONDREG(ij_size_regid, A6XX_RB_RENDER_CONTROL1_SIZE) |
			COND(s[FS].v->frag_face, A6XX_RB_RENDER_CONTROL1_FACENESS));

	OUT_PKT4(ring, REG_A6XX_RB_SAMPLE_CNTL, 1);
	OUT_RING(ring, COND(sample_shading, A6XX_RB_SAMPLE_CNTL_PER_SAMP_MODE));

	OUT_PKT4(ring, REG_A6XX_GRAS_UNKNOWN_8101, 1);
	OUT_RING(ring, COND(sample_shading, 0x6));  // XXX

	OUT_PKT4(ring, REG_A6XX_GRAS_SAMPLE_CNTL, 1);
	OUT_RING(ring, COND(sample_shading, A6XX_GRAS_SAMPLE_CNTL_PER_SAMP_MODE));

	OUT_PKT4(ring, REG_A6XX_SP_FS_OUTPUT_REG(0), 8);
	for (i = 0; i < 8; i++) {
		// TODO we could have a mix of half and full precision outputs,
		// we really need to figure out half-precision from IR3_REG_HALF
		OUT_RING(ring, A6XX_SP_FS_OUTPUT_REG_REGID(color_regid[i]) |
				COND(false,
					A6XX_SP_FS_OUTPUT_REG_HALF_PRECISION));
	}

	OUT_PKT4(ring, REG_A6XX_VPC_PACK, 1);
	OUT_RING(ring, A6XX_VPC_PACK_NUMNONPOSVAR(s[FS].v->total_in) |
			 A6XX_VPC_PACK_PSIZELOC(psize_loc) |
			 A6XX_VPC_PACK_STRIDE_IN_VPC(l.max_loc));

	if (!binning_pass) {
		/* figure out VARYING_INTERP / VARYING_PS_REPL register values: */
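		/* each VPC_VARYING_INTERP_MODE register packs 16 2-bit mode
		 * fields, hence the loc/16 and (loc%16)*2 arithmetic below:
		 */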
		for (j = -1; (j = ir3_next_varying(s[FS].v, j)) < (int)s[FS].v->inputs_count; ) {
			/* NOTE: varyings are packed, so if compmask is 0xb
			 * then first, third, and fourth component occupy
			 * three consecutive varying slots:
			 */
			unsigned compmask = s[FS].v->inputs[j].compmask;

			uint32_t inloc = s[FS].v->inputs[j].inloc;

			if (s[FS].v->inputs[j].interpolate == INTERP_MODE_FLAT) {
				uint32_t loc = inloc;

				for (i = 0; i < 4; i++) {
					if (compmask & (1 << i)) {
						state->vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
						loc++;
					}
				}
			}
		}
	}

	if (!binning_pass && s[FS].instrlen)
		fd6_emit_shader(ring, s[FS].v);

	OUT_PKT4(ring, REG_A6XX_VFD_CONTROL_1, 6);
	OUT_RING(ring, A6XX_VFD_CONTROL_1_REGID4VTX(vertex_regid) |
			A6XX_VFD_CONTROL_1_REGID4INST(instance_regid) |
			0xfcfc0000);
	OUT_RING(ring, 0x0000fcfc);   /* VFD_CONTROL_2 */
	OUT_RING(ring, 0xfcfcfcfc);   /* VFD_CONTROL_3 */
	OUT_RING(ring, 0x000000fc);   /* VFD_CONTROL_4 */
	OUT_RING(ring, 0x0000fcfc);   /* VFD_CONTROL_5 */
	OUT_RING(ring, 0x00000000);   /* VFD_CONTROL_6 */

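	/* tell the depth planes when z comes from the fragment shader (or
	 * when early-z is otherwise disallowed):
	 */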
	bool fragz = s[FS].v->no_earlyz | s[FS].v->writes_pos;

	OUT_PKT4(ring, REG_A6XX_RB_DEPTH_PLANE_CNTL, 1);
	OUT_RING(ring, COND(fragz, A6XX_RB_DEPTH_PLANE_CNTL_FRAG_WRITES_Z));

	OUT_PKT4(ring, REG_A6XX_GRAS_SU_DEPTH_PLANE_CNTL, 1);
	OUT_RING(ring, COND(fragz, A6XX_GRAS_SU_DEPTH_PLANE_CNTL_FRAG_WRITES_Z));
}

/* Emits the program state which is not part of the stateobj, because of
 * its dependency on other GL state (rasterflat or sprite-coord-replacement):
 */
void
fd6_program_emit(struct fd_ringbuffer *ring, struct fd6_emit *emit)
{
	const struct fd6_program_state *state = fd6_emit_get_prog(emit);

	if (!unlikely(emit->rasterflat || emit->sprite_coord_enable)) {
		/* fastpath: */
		OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
		for (int i = 0; i < 8; i++)
			OUT_RING(ring, state->vinterp[i]);   /* VPC_VARYING_INTERP[i].MODE */

		OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
		for (int i = 0; i < 8; i++)
			OUT_RING(ring, 0x00000000);          /* VPC_VARYING_PS_REPL[i] */
	} else {
		/* slow-path: */
		struct ir3_shader_variant *fs = state->fs;
		uint32_t vinterp[8], vpsrepl[8];

		memset(vinterp, 0, sizeof(vinterp));
		memset(vpsrepl, 0, sizeof(vpsrepl));

		for (int j = -1; (j = ir3_next_varying(fs, j)) < (int)fs->inputs_count; ) {

			/* NOTE: varyings are packed, so if compmask is 0xb
			 * then first, third, and fourth component occupy
			 * three consecutive varying slots:
			 */
			unsigned compmask = fs->inputs[j].compmask;

			uint32_t inloc = fs->inputs[j].inloc;

			if ((fs->inputs[j].interpolate == INTERP_MODE_FLAT) ||
					(fs->inputs[j].rasterflat && emit->rasterflat)) {
				uint32_t loc = inloc;

				for (int i = 0; i < 4; i++) {
					if (compmask & (1 << i)) {
						vinterp[loc / 16] |= 1 << ((loc % 16) * 2);
						loc++;
					}
				}
			}

			gl_varying_slot slot = fs->inputs[j].slot;

			/* since we don't enable PIPE_CAP_TGSI_TEXCOORD: */
			if (slot >= VARYING_SLOT_VAR0) {
				unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
				/* Replace the .xy coordinates with S/T from the point sprite. Set
				 * interpolation bits for .zw such that they become .01
				 */
				if (emit->sprite_coord_enable & texmask) {
					/* mask is two 2-bit fields, where:
					 *   '01' -> S
					 *   '10' -> T
					 *   '11' -> 1 - T  (flip mode)
					 */
					unsigned mask = emit->sprite_coord_mode ? 0b1101 : 0b1001;
					uint32_t loc = inloc;
					if (compmask & 0x1) {
						vpsrepl[loc / 16] |= ((mask >> 0) & 0x3) << ((loc % 16) * 2);
						loc++;
					}
					if (compmask & 0x2) {
						vpsrepl[loc / 16] |= ((mask >> 2) & 0x3) << ((loc % 16) * 2);
						loc++;
					}
					if (compmask & 0x4) {
						/* .z <- 0.0f */
						vinterp[loc / 16] |= 0b10 << ((loc % 16) * 2);
						loc++;
					}
					if (compmask & 0x8) {
						/* .w <- 1.0f */
						vinterp[loc / 16] |= 0b11 << ((loc % 16) * 2);
						loc++;
					}
				}
			}
		}

		OUT_PKT4(ring, REG_A6XX_VPC_VARYING_INTERP_MODE(0), 8);
		for (int i = 0; i < 8; i++)
			OUT_RING(ring, vinterp[i]);     /* VPC_VARYING_INTERP[i].MODE */

		OUT_PKT4(ring, REG_A6XX_VPC_VARYING_PS_REPL_MODE(0), 8);
		for (int i = 0; i < 8; i++)
			OUT_RING(ring, vpsrepl[i]);     /* VPC_VARYING_PS_REPL[i] */
	}
}

static struct ir3_program_state *
fd6_program_create(void *data, struct ir3_shader_variant *bs,
		struct ir3_shader_variant *vs,
		struct ir3_shader_variant *fs,
		const struct ir3_shader_key *key)
{
	struct fd_context *ctx = data;
	struct fd6_program_state *state = CALLOC_STRUCT(fd6_program_state);

	state->bs = bs;
	state->vs = vs;
	state->fs = fs;
	state->binning_stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);
	state->stateobj = fd_ringbuffer_new_object(ctx->pipe, 0x1000);

	setup_stateobj(state->binning_stateobj, state, key, true);
	setup_stateobj(state->stateobj, state, key, false);

	return &state->base;
}

static void
fd6_program_destroy(void *data, struct ir3_program_state *state)
{
	struct fd6_program_state *so = fd6_program_state(state);
	fd_ringbuffer_del(so->stateobj);
	fd_ringbuffer_del(so->binning_stateobj);
	free(so);
}

static const struct ir3_cache_funcs cache_funcs = {
	.create_state = fd6_program_create,
	.destroy_state = fd6_program_destroy,
};

void
fd6_prog_init(struct pipe_context *pctx)
{
	struct fd_context *ctx = fd_context(pctx);

	fd6_context(ctx)->shader_cache = ir3_cache_create(&cache_funcs, ctx);

	pctx->create_fs_state = fd6_fp_state_create;
	pctx->delete_fs_state = fd6_fp_state_delete;

	pctx->create_vs_state = fd6_vp_state_create;
	pctx->delete_vs_state = fd6_vp_state_delete;

	fd_prog_init(pctx);
}